def sources(self, data, hostDict):
    sources = []
    if not data: return sources
    try:
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title'] if 'tvshowtitle' in data else None
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else year

        query = '%s %s' % (title, hdlr)
        query = re.sub(r'(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', query)
        url = '%s%s' % (self.base_link, self.search_link % quote_plus(query))
        # log_utils.log('url = %s' % url, __name__, log_utils.LOGDEBUG)

        r = client.request(url, timeout='5')
        if not r or 'Error 404' in r: return sources
        r = client.parseDOM(r, 'div', attrs={'id': 'content'})
        r1 = client.parseDOM(r, 'h2')
        posts = zip(client.parseDOM(r1, 'a', ret='href'), client.parseDOM(r1, 'a'))
    except:
        source_utils.scraper_error('MYVIDEOLINK')
        return sources

    items = []
    for post in posts:
        try:
            name = source_utils.strip_non_ascii_and_unprintable(post[1])
            if '<' in name: name = re.sub(r'<.*?>', '', name)
            name = client.replaceHTMLCodes(name)
            name = source_utils.clean_name(name)

            if 'tvshowtitle' in data:
                if not source_utils.check_title(title, aliases, name, hdlr, year):
                    if not source_utils.check_title(title, aliases, name, 'S%02d' % int(data['season']), year):
                        if not source_utils.check_title(title, aliases, name, 'Season.%d' % int(data['season']), year):
                            if not source_utils.check_title(title, aliases, name, 'S%d' % int(data['season']), year):
                                continue
            else:
                if not source_utils.check_title(title, aliases, name, hdlr, year): continue

            name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)
            link = post[0]

            results = client.request(link, timeout='5')
            results = client.parseDOM(results, 'div', attrs={'class': 'entry-content cf'})[0]

            if 'tvshowtitle' in data:
                isSeasonList = False
                if 'Season' in name or 'S%02d' % int(data['season']) in name:
                    isSeasonList = True
                results = re.sub(r'\n', '', results)
                results = re.sub(r'\t', '', results).replace('> <', '><')
                test = re.findall(r'<p><b>(.*?)</ul>', results, re.DOTALL)  # parsing this site for episodes is messy; this is as close as it gets
                for x in test:
                    test2 = re.search(r'(.*?)</b>', x).group(1)
                    if hdlr in test2:
                        if isSeasonList:
                            name = re.sub(r'\.Season\.\d+', '.%s.' % test2.replace(' ', '.'), name)
                            name = re.sub(r'\.S\d+', '.%s' % test2.replace(' ', '.'), name)
                        else:
                            name = test2
                        links = client.parseDOM(x, 'a', ret='href')
                        break
                    else:
                        try:
                            test3 = re.search(r'<p><b>(.*?)</b></p>', x).group(1)
                        except:
                            continue
                        if hdlr in test3:
                            if isSeasonList:
                                name = re.sub(r'\.Season\.\d+', '.%s.' % test3.replace(' ', '.'), name)
                                name = re.sub(r'\.S\d+', '.%s' % test3.replace(' ', '.'), name)
                            else:
                                name = test3
                            links = client.parseDOM(x, 'a', ret='href')
                            break
            else:
                links = client.parseDOM(results, 'a', attrs={'class': 'autohyperlink'}, ret='href')

            for link in links:
                try:
                    url = py_tools.ensure_text(client.replaceHTMLCodes(str(link)), errors='replace')
                    if url.endswith(('.rar', '.zip', '.iso', '.part', '.png', '.jpg', '.bmp', '.gif')): continue
                    if url in str(sources): continue

                    valid, host = source_utils.is_host_valid(url, hostDict)
                    if not valid: continue

                    quality, info = source_utils.get_release_quality(name_info, url)
                    try:
                        size = re.search(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', results).group(0)
                        dsize, isize = source_utils._size(size)
                        info.insert(0, isize)
                    except:
                        dsize = 0
                    info = ' | '.join(info)

                    sources.append({
                        'provider': 'myvideolink', 'source': host, 'name': name, 'name_info': name_info,
                        'quality': quality, 'language': 'en', 'url': url, 'info': info,
                        'direct': False, 'debridonly': True, 'size': dsize})
                except:
                    source_utils.scraper_error('MYVIDEOLINK')
        except:
            source_utils.scraper_error('MYVIDEOLINK')
    return sources
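
# --- Standalone sketch (not part of the scraper above): the size-extraction pattern shared by
# these providers, shown in isolation on an invented release string. In the scrapers the match
# is then handed to the add-on's own source_utils._size() helper, which is not reproduced here.
import re

SIZE_PATTERN = r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))'

sample = 'Some.Show.S01E01.1080p.WEB-DL.x264 [2.3 GB]'
match = re.search(SIZE_PATTERN, sample)
if match:
    print(match.group(0))  # '2.3 GB'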
def get_sources(self, name, url):
    try:
        # r = self.scraper.get(url, headers=self.headers).content
        r = py_tools.ensure_str(self.scraper.get(url, headers=self.headers).content, errors='replace')

        name = client.replaceHTMLCodes(name)
        if name.startswith('['): name = name.split(']')[1]
        name = name.strip().replace(' ', '.')
        name_info = source_utils.info_from_name(name, self.title, self.year, self.hdlr, self.episode_title)
        if source_utils.remove_lang(name_info): return self.sources

        l = dom_parser.parse_dom(r, 'pre', {'class': 'links'})
        if l == []: return
        s = ''
        for i in l: s += i.content

        # scan the concatenated link blocks, not just the last parsed <pre>
        urls = re.findall(r'''((?:http|ftp|https)://[\w_-]+(?:(?:\.[\w_-]+)+)[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])''', s, flags=re.M | re.S)
        urls = [i for i in urls if not i.endswith(('.rar', '.zip', '.iso', '.idx', '.sub', '.srt'))]

        for link in urls:
            url = py_tools.ensure_text(client.replaceHTMLCodes(str(link)), errors='replace')
            if url in str(self.sources): continue

            valid, host = source_utils.is_host_valid(url, self.hostDict)
            if not valid: continue

            quality, info = source_utils.get_release_quality(name, url)
            try:
                size = re.search(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', name).group(0)
                dsize, isize = source_utils._size(size)
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)

            self.sources.append({
                'provider': 'rapidmoviez', 'source': host, 'name': name, 'name_info': name_info,
                'quality': quality, 'language': 'en', 'url': url, 'info': info,
                'direct': False, 'debridonly': True, 'size': dsize})
    except:
        source_utils.scraper_error('RAPIDMOVIEZ')
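
# --- Standalone sketch (not part of the scraper above): the link-harvesting regex used in
# get_sources(), run against an invented text blob, with the same extension filtering the
# scraper applies afterwards. Host names and paths here are made up for illustration.
import re

URL_PATTERN = r'''((?:http|ftp|https)://[\w_-]+(?:(?:\.[\w_-]+)+)[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])'''

blob = 'mirror 1: https://host.example/dl/Some.Show.S01E01.mkv\nmirror 2: https://host.example/dl/Some.Show.S01E01.rar'
urls = re.findall(URL_PATTERN, blob, flags=re.M | re.S)
urls = [u for u in urls if not u.endswith(('.rar', '.zip', '.iso', '.idx', '.sub', '.srt'))]
print(urls)  # ['https://host.example/dl/Some.Show.S01E01.mkv']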
def sources(self, url, hostDict):
    sources = []
    if not url: return sources
    try:
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title'] if 'tvshowtitle' in data else None
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else year

        query = '%s %s' % (title, hdlr)
        query = re.sub(r'(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', query)
        url = self.search_link % quote_plus(query)
        url = urljoin(self.base_link, url)
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)

        r = client.request(url)
        if not r: return sources
        posts = client.parseDOM(r, 'h2')

        urls = []
        for item in posts:
            if not item.startswith('<a href'): continue
            try:
                name = client.parseDOM(item, "a")[0]
                if not source_utils.check_title(title, aliases, name, hdlr, year): continue
                name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)
                if source_utils.remove_lang(name_info): continue

                quality, info = source_utils.get_release_quality(name_info, item[0])
                try:
                    size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', item)[0]
                    dsize, isize = source_utils._size(size)
                    info.insert(0, isize)
                except:
                    dsize = 0
                info = ' | '.join(info)

                item = client.parseDOM(item, 'a', ret='href')
                url = item
                links = self.links(url)
                if links is None: continue
                urls += [(i, name, name_info, quality, info, dsize) for i in links]
            except:
                source_utils.scraper_error('300MBFILMS')

        for item in urls:
            if 'earn-money' in item[0]: continue
            url = py_tools.ensure_text(client.replaceHTMLCodes(item[0]), errors='replace')
            valid, host = source_utils.is_host_valid(url, hostDict)
            if not valid: continue
            sources.append({
                'provider': '300mbfilms', 'source': host, 'name': item[1], 'name_info': item[2],
                'quality': item[3], 'language': 'en', 'url': url, 'info': item[4],
                'direct': False, 'debridonly': True, 'size': item[5]})
        return sources
    except:
        source_utils.scraper_error('300MBFILMS')
        return sources
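
# --- Standalone sketch (not part of the scraper above): how the parse_qs flattening used at
# the top of each sources() method behaves. parse_qs maps each key to a list of values, and
# the comprehension keeps only the first; with parse_qs defaults blank values are dropped
# entirely, so the (i, '') fallback is mostly defensive. The query string is invented.
from urllib.parse import parse_qs

raw = 'title=Example+Movie&year=2020&aliases='
data = parse_qs(raw)                     # {'title': ['Example Movie'], 'year': ['2020']}
data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
print(data)                              # {'title': 'Example Movie', 'year': '2020'}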
def sources(self, url, hostDict):
    sources = []
    if not url: return sources
    try:
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'].replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title']
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode']))

        query = '%s %s' % (title, hdlr)
        # query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', query)  # eztv has issues with dashes in titles
        query = re.sub(r'[^A-Za-z0-9\s\.]+', '', query)
        url = self.search_link % (quote_plus(query).replace('+', '-'))
        url = urljoin(self.base_link, url)
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)

        html = client.request(url, timeout='5')
        try:
            tables = client.parseDOM(html, 'table', attrs={'class': 'forum_header_border'})
            if not tables: return sources
            for table in tables:
                if 'magnet:' not in table: continue
                else: break
        except:
            source_utils.scraper_error('EZTV')
            return sources

        rows = re.findall(r'<tr\s*name\s*=\s*["\']hover["\']\s*class\s*=\s*["\']forum_header_border["\']>(.+?)</tr>', table, re.DOTALL | re.I)
        if not rows: return sources
    except:
        source_utils.scraper_error('EZTV')
        return sources

    for row in rows:
        try:
            try:
                columns = re.findall(r'<td\s.+?>(.+?)</td>', row, re.DOTALL)
                link = re.findall(r'href\s*=\s*["\'](magnet:[^"\']+)["\'].*title\s*=\s*["\'](.+?)["\']', columns[2], re.DOTALL | re.I)[0]
            except:
                continue

            url = str(client.replaceHTMLCodes(link[0]).split('&tr')[0])
            try: url = unquote(url).decode('utf8')
            except: pass
            hash = re.compile(r'btih:(.*?)&', re.I).findall(url)[0]

            name = link[1].split(' [eztv]')[0].split(' Torrent:')[0]
            name = source_utils.clean_name(name)
            if not source_utils.check_title(title, aliases, name, hdlr, year): continue
            name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)
            if source_utils.remove_lang(name_info): continue

            try:
                seeders = int(re.findall(r'<font\s*color\s*=\s*["\'].+?["\']>(\d+|\d+\,\d+)</font>', columns[5], re.DOTALL)[0].replace(',', ''))
                if self.min_seeders > seeders: continue
            except:
                seeders = 0

            quality, info = source_utils.get_release_quality(name_info, url)
            try:
                size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', columns[3])[-1]
                dsize, isize = source_utils._size(size)
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)

            sources.append({
                'provider': 'eztv', 'source': 'torrent', 'seeders': seeders, 'hash': hash,
                'name': name, 'name_info': name_info, 'quality': quality, 'language': 'en',
                'url': url, 'info': info, 'direct': False, 'debridonly': True, 'size': dsize})
        except:
            source_utils.scraper_error('EZTV')
    return sources
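
# --- Standalone sketch (not part of the scraper above): how an eztv magnet link is reduced
# to a trimmed URL plus info-hash, using the same split and regex as the scraper. The magnet
# URI below is invented for illustration; the real code also runs replaceHTMLCodes/unquote first.
import re

link = 'magnet:?xt=urn:btih:ABCDEF0123456789ABCDEF0123456789ABCDEF01&dn=Some.Show.S01E01&tr=udp://tracker.invalid:80'
url = link.split('&tr')[0]                                # strip tracker parameters
hash = re.compile(r'btih:(.*?)&', re.I).findall(url)[0]   # info-hash between 'btih:' and the next '&'
print(url)   # magnet:?xt=urn:btih:ABCDEF0123456789ABCDEF0123456789ABCDEF01&dn=Some.Show.S01E01
print(hash)  # ABCDEF0123456789ABCDEF0123456789ABCDEF01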
def sources(self, url, hostDict):
    sources = []
    if not url: return sources
    try:
        scraper = cfscrape.create_scraper()
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title'] if 'tvshowtitle' in data else None
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else year

        query = '%s %s' % (title, hdlr)
        query = re.sub(r'(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', query)
        url = self.search_link % quote_plus(query)
        url = urljoin(self.base_link, url)
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)

        # r = scraper.get(url).content
        r = py_tools.ensure_str(scraper.get(url).content, errors='replace')
        posts = client.parseDOM(r, 'div', attrs={'class': 'post'})
        if not posts: return sources
    except:
        source_utils.scraper_error('SCENERLS')
        return sources

    items = []
    for post in posts:
        try:
            content = client.parseDOM(post, "div", attrs={"class": "postContent"})
            size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', content[0])[0]
            u = client.parseDOM(content, "h2")
            u = client.parseDOM(u, 'a', ret='href')
            u = [(i.strip('/').split('/')[-1], i, size) for i in u]
            items += u
        except:
            source_utils.scraper_error('SCENERLS')
            return sources

    for item in items:
        try:
            name = item[0]
            name = client.replaceHTMLCodes(name)
            if not source_utils.check_title(title, aliases, name, hdlr, year): continue
            name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)
            if source_utils.remove_lang(name_info): continue

            # check year for reboot/remake show issues if year is available - crap shoot
            # if 'tvshowtitle' in data:
            #     if re.search(r'([1-3][0-9]{3})', name):
            #         if not any(value in name for value in [year, str(int(year)+1), str(int(year)-1)]):
            #             continue

            url = py_tools.ensure_text(client.replaceHTMLCodes(str(item[1])), errors='replace')
            if url in str(sources): continue

            valid, host = source_utils.is_host_valid(url, hostDict)
            if not valid: continue

            quality, info = source_utils.get_release_quality(name_info, url)
            try:
                dsize, isize = source_utils._size(item[2])
                info.insert(0, isize)
            except:
                dsize = 0
            info = ' | '.join(info)

            sources.append({
                'provider': 'scenerls', 'source': host, 'name': name, 'name_info': name_info,
                'quality': quality, 'language': 'en', 'url': url, 'info': info,
                'direct': False, 'debridonly': True, 'size': dsize})
        except:
            source_utils.scraper_error('SCENERLS')
    return sources
def sources(self, url, hostDict):
    sources = []
    if not url: return sources
    try:
        scraper = cfscrape.create_scraper(delay=5)
        data = parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')
        aliases = data['aliases']
        episode_title = data['title'] if 'tvshowtitle' in data else None
        year = data['year']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else year
        isSeasonQuery = False

        query = '%s %s' % (title, hdlr)
        query = re.sub(r'(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', query)
        # query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', query)
        query = re.sub(r'\s', '-', query)

        if int(year) >= 2021: self.base_link = self.base_new
        else: self.base_link = self.base_old
        url = urljoin(self.base_link, query)
        # log_utils.log('url = %s' % url, log_utils.LOGDEBUG)

        # r = scraper.get(url).content
        r = py_tools.ensure_str(scraper.get(url).content, errors='replace')
        if not r or 'nothing was found' in r:
            if 'tvshowtitle' in data:
                season = re.search(r'S(.*?)E', hdlr).group(1)
                query = re.sub(r'(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', title)
                # query = re.sub(r'[^A-Za-z0-9\s\.-]+', '', title)
                query = re.sub(r'\s', '-', query)
                query = query + "-S" + season
                url = urljoin(self.base_link, query)
                # r = scraper.get(url).content
                r = py_tools.ensure_str(scraper.get(url).content, errors='replace')
                isSeasonQuery = True
            else:
                return sources
        if not r or 'nothing was found' in r:
            return sources  # may need to add fallback to use self.search_link if nothing found

        posts = client.parseDOM(r, "div", attrs={"class": "content"})
        if not posts: return sources
    except:
        source_utils.scraper_error('RLSBB')
        return sources

    release_title = re.sub(r'[^A-Za-z0-9\s\.-]+', '', title).replace(' ', '.')
    items = []
    count = 0
    for post in posts:
        if count >= 300: break  # to limit large link list and slow scrape time
        try:
            # parse all matching release_titles in each post (content) group
            post_titles = re.findall(r'(?:.*>|>\sRelease Name.*|\s)(%s.*?)<' % release_title, post, re.I)
            items = []
            if len(post_titles) > 1:
                index = 0
                for name in post_titles:
                    start = post_titles[index].replace('[', '\\[').replace('(', '\\(').replace(')', '\\)').replace('+', '\\+').replace(' \\ ', ' \\\\ ')
                    end = (post_titles[index + 1].replace('[', '\\[').replace('(', '\\(').replace(')', '\\)').replace('+', '\\+')).replace(' \\ ', ' \\\\ ') if index + 1 < len(post_titles) else ''
                    try:
                        # parse all data between release_titles in a multi post (content) group
                        container = re.findall(r'(?:%s)([\S\s]+)(?:%s)' % (start, end), post, re.I)[0]
                    except:
                        source_utils.scraper_error('RLSBB')
                        continue
                    try:
                        size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', container)[0].replace(',', '.')
                    except:
                        size = '0'
                    container = client.parseDOM(container, 'a', ret='href')
                    items.append((name, size, container))
                    index += 1
            elif len(post_titles) == 1:
                name = post_titles[0]
                container = client.parseDOM(post, 'a', ret='href')  # parse all links in a single post (content) group
                try:
                    size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', post)[0].replace(',', '.')
                except:
                    size = '0'
                items.append((name, size, container))
            else:
                continue

            for group_name, size, links in items:
                for i in links:
                    name = group_name
                    # if isSeasonQuery and hdlr not in name.upper():
                    #     name = i.rsplit("/", 1)[-1]
                    #     if hdlr not in name.upper(): continue
                    if hdlr not in name.upper():
                        name = i.rsplit("/", 1)[-1]
                        if hdlr not in name.upper(): continue

                    name = client.replaceHTMLCodes(name)
                    name = source_utils.strip_non_ascii_and_unprintable(name)
                    name_info = source_utils.info_from_name(name, title, year, hdlr, episode_title)

                    url = py_tools.ensure_text(client.replaceHTMLCodes(str(i)), errors='replace')
                    if url in str(sources): continue
                    if url.endswith(('.rar', '.zip', '.iso', '.part', '.png', '.jpg', '.bmp', '.gif')): continue

                    valid, host = source_utils.is_host_valid(url, hostDict)
                    if not valid: continue

                    quality, info = source_utils.get_release_quality(name, url)
                    try:
                        if size == '0':
                            try:
                                size = re.findall(r'((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GB|GiB|Gb|MB|MiB|Mb))', name)[0].replace(',', '.')
                            except:
                                raise Exception()
                        dsize, isize = source_utils._size(size)
                        info.insert(0, isize)
                    except:
                        dsize = 0
                    info = ' | '.join(info)

                    sources.append({
                        'provider': 'rlsbb', 'source': host, 'name': name, 'name_info': name_info,
                        'quality': quality, 'language': 'en', 'url': url, 'info': info,
                        'direct': False, 'debridonly': True, 'size': dsize})
                    count += 1
        except:
            source_utils.scraper_error('RLSBB')
    return sources