def _link(self, url, index):
    try:
        html = BeautifulSoup(client.request(url))
        buttons = html.find_all('div', class_='download-btn')
        link = None
        links = []
        for i in buttons:
            try: links.append(i.find_all('a')[0].get('href'))
            except: pass
        for i in links:
            if i.startswith('magnet:'):
                link = i
                break
        if not link:
            for i in links:
                if i.startswith('/download'):
                    link = urlparse.urljoin(self.base_link, i)
                    break
        self.tLock.acquire()
        self.tSources[index]['url'] = link
    except:
        tools.Logger.error()
    finally:
        try: self.tLock.release()
        except: pass
def _verifyAccountsImdb(self, checkDisabled = True):
    name = 'IMDb'
    if self.__done(name): return
    try:
        if not checkDisabled or self.__enabled('accounts.informants.imdb.enabled'):
            link = 'http://www.imdb.com/user/ur%s/watchlist' % tools.Settings.getString('accounts.informants.imdb.user').replace('ur', '')
            data = client.request(link)
            if data:
                indexStart = data.find('IMDbReactInitialState.push(') # Sometimes the page is not fully rendered yet and the JSON is still in a JS tag.
                if indexStart < 0: # Data was rendered into the HTML.
                    data = BeautifulSoup(data)
                    if len(data.find_all('div', class_ = 'error_code_404')) > 0: status = self.StatusFailure
                    elif len(data.find_all('div', id = 'unavailable')) > 0: status = self.StatusLimited
                    elif len(data.find_all('div', class_ = 'lister-widget')) > 0: status = self.StatusOperational
                    else: status = self.StatusFailure
                else: # Data still in JS.
                    indexStart += 27
                    indexEnd = data.find(');', indexStart)
                    data = json.loads(data[indexStart : indexEnd])
                    if 'titles' in data and len(data['titles'].values()) > 0: status = self.StatusOperational
                    else: status = self.StatusLimited
            else: # Wrong user ID, returns a 404 error.
                status = self.StatusFailure
        else:
            status = self.StatusDisabled
    except:
        status = self.StatusFailure
    return self.__append(name, status)
def resolve(self, url):
    try:
        html = BeautifulSoup(client.request(url))
        html = html.find_all('ul', class_='download-links-dontblock')[0]
        return html.find_all('a')[0]['href']
    except:
        return None
def resolve(self, url):
    html = BeautifulSoup(client.request(url))
    htmlLinks = html.find_all('a')
    for htmlLink in htmlLinks:
        link = htmlLink['href']
        if link.startswith('magnet:'):
            return link
    return None
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        data = self._decode(url)

        if 'exact' in data and data['exact']:
            query = title = data['title']
            titles = None
            year = None
        else:
            title = data['title']
            titles = data['alternatives'] if 'alternatives' in data else None
            year = int(data['year']) if 'year' in data and not data['year'] == None else None
            query = '%s %d' % (title, year)

        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        if not self._query(query): return sources

        url = urlparse.urljoin(self.base_link, self.search_link) % urllib.quote_plus(query)
        html = BeautifulSoup(client.request(url))

        htmlTable = html.find_all('div', id = 'Torrents')[0].find_all('div', class_ = 'DownloadFlags')[0]
        htmlRows = htmlTable.find_all('a', recursive = False) # Do not search further down the tree (just the direct children).

        for i in range(1, len(htmlRows)): # Skip the first entry.
            try:
                htmlRow = htmlRows[i]
                htmlData = htmlRow['onmouseover'].split(',')
                if not len(htmlData) == 11: continue

                # Name
                htmlName = htmlData[5].strip().strip("'")

                # Link
                htmlLink = htmlRow['href'].strip()
                htmlLink = re.search('\/.*\/(.*)\.aspx', htmlLink).group(1).replace('-', '.')
                htmlLink = urlparse.urljoin(self.base_link, self.download_link) % urllib.quote_plus(htmlLink)

                # Size
                htmlSize = htmlData[7].strip().strip("'")

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, titles = titles, year = year, link = htmlLink, size = htmlSize, seeds = 1)

                # Ignore
                meta.mIgnoreLength = 10
                if meta.ignore(True): continue

                # Add
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName})
            except:
                pass

        return sources
    except:
        return sources
def resolve(self, url):
    try:
        html = BeautifulSoup(client.request(url))
        html = html.find_all('table', class_='list')[0]
        html = html.find_all('tr', recursive=False)[4]
        return self.base_link + html.find_all('a')[0]['href']
    except:
        tools.Logger.error()
        return None
def _link(self, link):
    try:
        html = BeautifulSoup(client.request(link))
        html = html.find_all('table', class_='list')[0]
        html = html.find_all('tr', recursive=False)[4]
        resolved = self.base_link + html.find_all('a')[0]['href']
        self.tLock.acquire()
        self.tLinks[link] = resolved
        self.tLock.release()
    except:
        tools.Logger.error()
def _hash(self, url, index):
    try:
        htmlSingle = BeautifulSoup(client.request(url))
        htmlInfo = htmlSingle.find('table', 'informations')
        htmlHash = htmlInfo.find_all('tr')[4].find_all('td')[1].getText()
        self.tLock.acquire()
        if htmlHash: self.tSources[index]['hash'] = htmlHash
    except:
        tools.Logger.error()
    finally:
        try: self.tLock.release()
        except: pass
def _link(self, url, index):
    try:
        html = BeautifulSoup(client.request(url))
        htmlCollection = html.find('input', {'name': 'collection'})['value']
        htmlUuid = html.find('input', {'name': 'uuid'})['value']
        self.tLock.acquire()
        if htmlUuid and htmlCollection:
            self.tSources[index]['url'] = urlparse.urljoin(self.base_link, self.download_link) % (htmlCollection, htmlUuid)
    except:
        tools.Logger.error()
    finally:
        try: self.tLock.release()
        except: pass
def _link(self, link):
    try:
        html = BeautifulSoup(client.request(link))
        htmlLinks = html.find_all('a')
        for i in range(len(htmlLinks)):
            resolved = htmlLinks[i]['href']
            if resolved.lower().startswith('magnet:'):
                self.tLock.acquire()
                self.tLinks[link] = resolved
                self.tLock.release()
                break
    except:
        pass
def _link(self, url, index):
    try:
        html = BeautifulSoup(client.request(url))
        html = html.find_all('div', class_='download-btn')[0]
        link = html.find_all('a')[0]
        self.tLock.acquire()
        self.tLinks[index] = self.base_link + link['href']
    except:
        tools.Logger.error()
    finally:
        try: self.tLock.release()
        except: pass
def _link(self, url, index):
    try:
        html = BeautifulSoup(client.request(url))
        html = html.find_all('div', class_ = 'details')[0]
        links = html.find_all('a')
        for link in links:
            if link['href'].startswith('magnet:'):
                self.tLock.acquire()
                self.tLinks[index] = link['href']
                break
    except:
        tools.Logger.error()
    finally:
        try: self.tLock.release()
        except: pass
def test_last_ditch_entity_replacement(self):
    # This is a UTF-8 document that contains bytestrings
    # completely incompatible with UTF-8 (i.e. encoded with some other
    # encoding).
    #
    # Since there is no consistent encoding for the document,
    # Unicode, Dammit will eventually encode the document as UTF-8
    # and encode the incompatible characters as REPLACEMENT
    # CHARACTER.
    #
    # If chardet is installed, it will detect that the document
    # can be converted into ISO-8859-1 without errors. This happens
    # to be the wrong encoding, but it is a consistent encoding, so the
    # code we're testing here won't run.
    #
    # So we temporarily disable chardet if it's present.
    doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
    chardet = resources.lib.externals.beautifulsoup.dammit.chardet_dammit
    logging.disable(logging.WARNING)
    try:
        def noop(str):
            return None
        resources.lib.externals.beautifulsoup.dammit.chardet_dammit = noop
        dammit = UnicodeDammit(doc)
        self.assertEqual(True, dammit.contains_replacement_characters)
        self.assertTrue(u"\ufffd" in dammit.unicode_markup)

        soup = BeautifulSoup(doc, "html.parser")
        self.assertTrue(soup.contains_replacement_characters)
    finally:
        logging.disable(logging.NOTSET)
        resources.lib.externals.beautifulsoup.dammit.chardet_dammit = chardet
def _link(self, url, index):
    try:
        html = BeautifulSoup(client.request(url))
        links = html.find_all('a')
        link = None
        for i in links:
            i = i.get('href')
            if i.startswith('magnet:'):
                link = i
                break
        self.tLock.acquire()
        self.tSources[index]['url'] = link
    except:
        tools.Logger.error()
    finally:
        try: self.tLock.release()
        except: pass
def _verifyAccountsImdb(self, checkDisabled = True, user = None):
    name = 'IMDb'
    if self.__done(name): return
    try:
        if not checkDisabled or self.__enabled('accounts.informants.imdb.enabled'):
            if user == None: user = tools.Settings.getString('accounts.informants.imdb.user').replace('ur', '')
            link = 'http://www.imdb.com/user/ur%s/watchlist' % user
            data = client.request(link)
            if data:
                indexStart = data.find('IMDbReactInitialState.push(') # Sometimes the page is not fully rendered yet and the JSON is still in a JS tag.
                if indexStart < 0: # Data was rendered into the HTML.
                    data = BeautifulSoup(data)
                    if len(data.find_all('div', class_='error_code_404')) > 0: status = Verification.StatusFailure
                    elif len(data.find_all('div', id='unavailable')) > 0: status = Verification.StatusLimited
                    elif len(data.find_all('div', class_='lister-widget')) > 0: status = Verification.StatusOperational
                    else: status = Verification.StatusFailure
                else: # Data still in JS.
                    indexStart += 27
                    indexEnd = data.find(');', indexStart)
                    data = json.loads(data[indexStart : indexEnd])
                    if 'titles' in data and len(data['titles'].values()) > 0: status = Verification.StatusOperational
                    else: status = Verification.StatusLimited
            else: # Wrong user ID, returns a 404 error.
                status = Verification.StatusFailure
        else:
            status = Verification.StatusDisabled
    except:
        status = Verification.StatusFailure
    return self.__append(name = name, status = status)
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        year = int(data['year']) if 'year' in data and not data['year'] == None else None
        season = int(data['season']) if 'season' in data and not data['season'] == None else None
        episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
        pack = data['pack'] if 'pack' in data else False

        if pack: query = '%s %d' % (title, season)
        else: query = '%s S%02dE%02d' % (title, season, episode)
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        url = urlparse.urljoin(self.base_link, self.search_link) % urllib.quote_plus(query)
        html = BeautifulSoup(client.request(url))

        htmlTable = None
        tables = html.find_all('table', class_ = 'forum_header_border')
        for table in tables:
            try:
                row = table.find_all('tr')[1]
                headers = row.find_all('td', class_ = 'forum_thread_header')
                if headers[0].getText() == 'Show' and headers[5].getText() == 'Seeds':
                    htmlTable = table
                    break
            except:
                pass
        if htmlTable == None: raise Exception()

        htmlRows = htmlTable.find_all('tr', recursive = False) # Use children and no further.
        for i in range(2, len(htmlRows)): # First two rows are the headers.
            htmlRow = htmlRows[i]
            htmlColumns = htmlRow.find_all('td', recursive = False) # Use children and no further.
            htmlInfo = htmlColumns[1]

            # Name
            htmlName = htmlInfo.find_all('a', class_ = 'epinfo')[0]['title'].strip()

            # Size
            try: htmlSize = htmlColumns[3].getText() # Does not always have a size.
            except: htmlSize = None

            # Link
            htmlLink = htmlColumns[2].find_all('a', class_ = 'magnet')[0]['href']

            # Seeds
            try: htmlSeeds = htmlColumns[5].find_all('font')[0].getText() # Does not always have seeds.
            except: htmlSeeds = None

            # Metadata
            meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, seeds = htmlSeeds)

            # Ignore
            if meta.ignore(False): continue

            # Add
            sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName})

        return sources
    except:
        return sources
def _search(self, url, query, show, type, title, titles, year, season, episode, pack, packCount, packException, ignoreContains):
    pageLimit = tools.Settings.getInteger('scraping.providers.pages')
    pageCounter = 0

    page = 0
    added = False

    try:
        while True:
            pageCounter += 1
            if pageLimit > 0 and pageCounter > pageLimit: break

            html = BeautifulSoup(client.request(url % (type, query)))

            page += 1
            added = False

            htmlTable = html.find_all('table', class_='table-corps')
            if len(htmlTable) > 0:
                htmlTable = htmlTable[0]
                try: htmlTable = htmlTable.find_all('tbody', recursive=False)[0]
                except: pass
                htmlRows = htmlTable.find_all('tr', recursive=False)
                for i in range(len(htmlRows)):
                    htmlRow = htmlRows[i]
                    htmlColumns = htmlRow.find_all('td', recursive=False)

                    # Name
                    htmlName = htmlColumns[0].find_all('a')[0].getText().strip()

                    # Link
                    htmlLink = urlparse.urljoin(self.base_link, htmlColumns[0].find_all('a')[0].get('href').encode('utf-8'))

                    # Size
                    # Convert French units (Ko/Mo/Go, octets) to kb/mb/gb.
                    htmlSize = re.sub('([mMkKgGtT]?)[oO]', '\\1b', htmlColumns[0].find_all('div', class_='poid')[0].getText())
                    if not 'b' in htmlSize: htmlSize = htmlSize + ' mb'

                    # Seeds
                    try: htmlSeeds = int(htmlColumns[0].find_all('div', class_='up')[0].getText().strip())
                    except: htmlSeeds = None

                    # Metadata
                    meta = metadata.Metadata(name=htmlName, title=title, titles=titles, year=year, season=season, episode=episode, pack=pack, packCount=packCount, link=htmlLink, size=htmlSize, seeds=htmlSeeds)

                    # Ignore
                    meta.ignoreAdjust(contains=ignoreContains)
                    if meta.ignore(True, season=not packException): continue

                    # Add
                    self.tLock.acquire()
                    self.tSources.append({'url': htmlLink, 'debridonly': False, 'direct': False, 'source': 'torrent', 'language': self.language[0], 'quality': meta.videoQuality(), 'metadata': meta, 'file': htmlName})
                    self.tLock.release()
                    added = True

                    self.tLock.acquire()
                    thread = threading.Thread(target=self._link, args=(htmlLink, len(self.tSources) - 1))
                    self.tThreadsLinks.append(thread)
                    self.tLock.release()
                    thread.start()

            # Only shows 1 page.
            break
    except:
        tools.Logger.error()
    finally:
        try: self.tLock.release()
        except: pass
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        ignoreContains = None
        data = self._decode(url)

        if 'exact' in data and data['exact']:
            query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            titles = None
            year = None
            season = None
            episode = None
            pack = False
            packCount = None
        else:
            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            titles = data['alternatives'] if 'alternatives' in data else None
            year = int(data['year']) if 'year' in data and not data['year'] == None else None
            season = int(data['season']) if 'season' in data and not data['season'] == None else None
            episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
            pack = data['pack'] if 'pack' in data else False
            packCount = data['packcount'] if 'packcount' in data else None

            if 'tvshowtitle' in data:
                # Search special episodes by name. All special episodes are added to season 0 by Trakt and TVDb. Hence, do not search by filename (eg: S02E00), since the season is not known.
                if (season == 0 or episode == 0) and ('title' in data and not data['title'] == None and not data['title'] == ''):
                    title = '%s %s' % (data['tvshowtitle'], data['title']) # Change the title for metadata filtering.
                    query = title
                    ignoreContains = len(data['title']) / float(len(title)) # Increase the required ignore ratio, since otherwise individual episodes and season packs are found as well.
                else:
                    if pack: query = '%s %d' % (title, season)
                    else: query = '%s S%02dE%02d' % (title, season, episode)
            else:
                query = '%s %d' % (title, year)

        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        if not self._query(query): return sources
        query = urllib.quote_plus(query)

        category = self.category_shows if 'tvshowtitle' in data else self.category_movies
        url = urlparse.urljoin(self.base_link, self.search_link)

        pageLimit = tools.Settings.getInteger('scraping.providers.pages')
        pageCounter = 0

        page = 1
        added = False

        timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8
        timer = tools.Time(start = True)

        '''
        # Old implementation that parsed the repaired result table.
        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            pageCounter += 1
            if pageLimit > 0 and pageCounter > pageLimit: break

            urlNew = url % (category, query, page)
            html = client.request(urlNew)

            # Demonoid does not have a closing tag for the rows.
            # This causes BeautifulSoup to only detect the first row.
            # Manually add a closing </tr> tag, except for the first row.
            html = html.replace('<tr align="left" bgcolor="#CCCCCC">', '<tr align="left" bgcolor="">', 1)
            html = html.replace('<tr align="left" bgcolor="#CCCCCC">', '</tr><tr align="left" bgcolor="#CCCCCC">')

            html = BeautifulSoup(html)

            page += 1
            added = False

            htmlTable = html.find_all('td', class_ = 'ctable_content_no_pad')[0].find_all('table', recursive = False)[1]
            htmlRows = html.find_all('tr')

            i = 0
            while i < len(htmlRows):
                try:
                    htmlRow = htmlRows[i]
                    i += 1 # Normal loop increment.
                    if len(htmlRow.find_all('td', {'rowspan' : '2'})) == 0: continue

                    # Name
                    htmlName = htmlRow.find_all('td', {'colspan' : '9'})[0].find_all('a')[0].getText().strip()

                    htmlRow = htmlRows[i]
                    i += 1 # Go to the next row, because items are split over two rows.

                    # Size
                    htmlSize = htmlColumns[3].getText().strip()

                    # Link
                    htmlLink = htmlColumns[2].find_all('a')[0]['href']

                    # Seeds
                    htmlSeeds = int(htmlColumns[6].getText().strip())

                    items = htmlColumns[0].find_all('a')

                    # Release
                    try:
                        htmlRelease = items[1].getText()
                        if not 'other' in htmlRelease.lower(): htmlName += ' ' + htmlRelease
                    except:
                        pass

                    # Language
                    try: htmlLanguage = items[2].getText()
                    except: htmlLanguage = None

                    # Metadata
                    meta = metadata.Metadata(name = htmlName, title = title, titles = titles, year = year, season = season, episode = episode, pack = pack, packCount = packCount, link = htmlLink, size = htmlSize, seeds = htmlSeeds, languageAudio = htmlLanguage)

                    # Ignore
                    meta.ignoreAdjust(contains = ignoreContains)
                    if meta.ignore(True): continue

                    # Add
                    sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName})
                    added = True
                except:
                    pass
        '''

        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            pageCounter += 1
            if pageLimit > 0 and pageCounter > pageLimit: break

            urlNew = url % (category, query, page)
            html = client.request(urlNew)

            page += 1
            added = False

            htmlRows = re.findall('<!--\s*tstart\s*-->(.*?)<tr\s*align="left"\s*bgcolor="#CCCCCC">', html, re.M | re.S)
            htmlRows = ['<tr><td>' + i for i in htmlRows]

            for htmlRow in htmlRows:
                try:
                    htmlRow = BeautifulSoup(htmlRow)
                    htmlColumns = htmlRow.find_all('td')

                    # Name
                    htmlName = htmlRow.find_all('a')[1].getText().strip()

                    # Size
                    htmlSize = htmlColumns[4].getText().strip()

                    # Link
                    htmlLink = htmlRow.find_all('a')[1]['href']
                    htmlLink = urlparse.urljoin(self.base_link, htmlLink)
                    htmlLink = re.search('genidy=(.*)', htmlLink, re.IGNORECASE)
                    if not htmlLink: continue
                    htmlLink = self.download_link % htmlLink.group(1)

                    # Seeds
                    try: htmlSeeds = int(htmlColumns[7].getText().strip())
                    except: htmlSeeds = 0

                    items = htmlColumns[0].find_all('a')

                    # Metadata
                    meta = metadata.Metadata(name = htmlName, title = title, titles = titles, year = year, season = season, episode = episode, pack = pack, packCount = packCount, link = htmlLink, size = htmlSize, seeds = htmlSeeds)

                    # Ignore
                    meta.ignoreAdjust(contains = ignoreContains)
                    if meta.ignore(True): continue

                    # Add
                    sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName})
                    added = True
                except:
                    pass

            if not added: break # Last page reached with a working torrent.

        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        ignoreContains = None
        data = self._decode(url)

        if 'exact' in data and data['exact']:
            query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            titles = None
            year = None
            season = None
            episode = None
            pack = False
            packCount = None
        else:
            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            titles = data['alternatives'] if 'alternatives' in data else None
            year = int(data['year']) if 'year' in data and not data['year'] == None else None
            season = int(data['season']) if 'season' in data and not data['season'] == None else None
            episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
            pack = data['pack'] if 'pack' in data else False
            packCount = data['packcount'] if 'packcount' in data else None

            if 'tvshowtitle' in data:
                # Search special episodes by name. All special episodes are added to season 0 by Trakt and TVDb. Hence, do not search by filename (eg: S02E00), since the season is not known.
                if (season == 0 or episode == 0) and ('title' in data and not data['title'] == None and not data['title'] == ''):
                    title = '%s %s' % (data['tvshowtitle'], data['title']) # Change the title for metadata filtering.
                    query = title
                    ignoreContains = len(data['title']) / float(len(title)) # Increase the required ignore ratio, since otherwise individual episodes and season packs are found as well.
                else:
                    if pack: query = '%s %d' % (title, season)
                    else: query = '%s S%02dE%02d' % (title, season, episode)
            else:
                query = '%s %d' % (title, year)

        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        query = urllib.quote_plus(query)
        if not self._query(query): return sources

        pageLimit = tools.Settings.getInteger('scraping.providers.pages')
        pageCounter = 0

        page = 1 # Pages start at 1.
        added = False

        timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8
        timer = tools.Time(start = True)

        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            pageCounter += 1
            if pageLimit > 0 and pageCounter > pageLimit: break

            urlNew = (self.base_link + self.search_link) % (query, page)
            html = BeautifulSoup(client.request(urlNew))

            htmlTable = html.find_all('table', class_ = 'search-table')[0]
            htmlRows = htmlTable.find_all('tr', recursive = False)

            page += 1
            added = False

            for i in range(len(htmlRows)):
                htmlRow = htmlRows[i]
                htmlColumns = htmlRow.find_all('td', recursive = False)

                # Name
                htmlName = htmlColumns[0].getText().strip()

                # Size
                htmlSize = htmlColumns[2].getText().strip()

                # Link
                htmlLink = htmlColumns[0].find_all('a')[0]['href'].strip()
                htmlLink = network.Container(htmlLink).torrentMagnet(title = title)

                # Seeds
                htmlSeeds = int(htmlColumns[3].getText().strip())

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, titles = titles, year = year, season = season, episode = episode, pack = pack, packCount = packCount, link = htmlLink, size = htmlSize, seeds = htmlSeeds)

                # Ignore
                meta.ignoreAdjust(contains = ignoreContains)
                if meta.ignore(True): continue

                # Add
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName})
                added = True

            if not added: break # Last page reached with a working torrent.

        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        year = int(data['year']) if 'year' in data and not data['year'] == None else None
        season = int(data['season']) if 'season' in data and not data['season'] == None else None
        episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
        pack = data['pack'] if 'pack' in data else False

        if 'tvshowtitle' in data:
            if pack: query = '%s %d' % (title, season)
            else: query = '%s S%02dE%02d' % (title, season, episode)
        else:
            query = '%s %d' % (title, year)
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        url = urlparse.urljoin(self.base_link, self.search_link)

        page = 1 # Pages start at 1.
        added = False

        timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8
        timer = tools.Time(start = True)

        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            urlNew = url % (urllib.quote_plus(query), page)
            html = BeautifulSoup(client.request(urlNew))

            page += 1
            added = False

            htmlTable = html.find_all('span', id = 'ui_searchResult')[0]
            htmlRows = htmlTable.find_all('div', class_ = 'panel-body')

            for i in range(len(htmlRows)):
                htmlRow = htmlRows[i].find_all('div', class_ = 'media', recursive = False)[0].find_all('div', class_ = 'row', recursive = False)[0]
                htmlColumns = htmlRow.find_all('div', recursive = False) # Use children and no further.
                htmlInfo = htmlColumns[0]

                # Name
                htmlName = htmlInfo.find_all('a', class_ = 'text-primary')
                if len(htmlName) == 0: # 'Dangerous' items (encrypted or incomplete - see below at the ignore section) have a text-muted class and are already filtered out here.
                    continue
                else:
                    htmlName = htmlName[0].getText()

                # Size
                htmlSize = htmlColumns[1].getText().replace(u'\xa0', ' ') # Replace non-breaking spaces.
                htmlSize = htmlSize.splitlines()[0] # Otherwise the find function does not work.
                indexEnd = htmlSize.find(' ', htmlSize.find(' ') + 1) # Second index.
                htmlSize = htmlSize[: indexEnd]

                # Link
                htmlId = htmlColumns[3].find_all('div', class_ = 'author-info')[0].find_all('div')
                for id in htmlId:
                    if id.has_attr('collectionid'):
                        htmlId = id['collectionid']
                        break
                htmlLink = self.base_link + self.download_link + htmlId

                # Age
                htmlAge = htmlColumns[2].getText()
                htmlAge = int(htmlAge[: htmlAge.find(' ')])

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, age = htmlAge)

                # Ignore
                if meta.ignore(False): continue
                htmlDanger = htmlInfo.find_all('small')[1].find_all('span', class_ = 'text-danger')
                ignore = False
                for danger in htmlDanger:
                    danger = danger['title']
                    if danger.startswith('incomplete'): # Ignore files marked as incomplete.
                        ignore = True
                    if danger.find('password') >= 0 or danger.find('encrypted') >= 0: # Ignore password-protected and encrypted files.
                        ignore = True
                if ignore: continue

                # Add
                # Some NZBs have the wrong size (often a few KB) indicated on the site, but are in reality bigger. Hence, do not show the size of NZBs below 20MB, but still add them.
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'usenet', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(sizeLimit = 20971520), 'file' : htmlName})
                added = True

            if not added: break # Last page reached with a working item.

        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        if 'exact' in data and data['exact']:
            query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            year = None
            season = None
            episode = None
            pack = False
            packCount = None
        else:
            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            year = int(data['year']) if 'year' in data and not data['year'] == None else None
            season = int(data['season']) if 'season' in data and not data['season'] == None else None
            episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
            pack = data['pack'] if 'pack' in data else False
            packCount = data['packcount'] if 'packcount' in data else None

            if 'tvshowtitle' in data:
                if pack: query = '%s %d' % (title, season)
                else: query = '%s S%02dE%02d' % (title, season, episode)
            else:
                query = '%s %d' % (title, year)

        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        category = self.category_shows if 'tvshowtitle' in data else self.category_movies
        url = urlparse.urljoin(self.base_link, self.search_link)

        pageLimit = tools.Settings.getInteger('scraping.providers.pages')
        pageCounter = 0

        page = 1 # Pages start at 1.
        added = False

        timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8
        timer = tools.Time(start = True)

        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            pageCounter += 1
            if pageLimit > 0 and pageCounter > pageLimit: break

            urlNew = url % (category, urllib.quote_plus(query), page)
            html = client.request(urlNew)

            # The HTML is corrupt. Try to fix it manually.
            try:
                indexStart = html.find('class="table2"')
                indexStart = html.find('<tr bgcolor', indexStart)
                indexEnd = html.find('search_stat', indexStart)
                html = html[indexStart : indexEnd]
                indexEnd = html.rfind('</td>') + 5
                html = html[:indexEnd]
                html = html.replace('</a></td>', '</td>')
                html = '<table>' + html + '</tr></table>'
            except:
                pass

            html = BeautifulSoup(html)

            page += 1
            added = False

            htmlRows = html.find_all('tr') # The reconstructed HTML only contains data rows.
            for i in range(len(htmlRows)):
                htmlRow = htmlRows[i]
                htmlColumns = htmlRow.find_all('td')
                htmlInfo = htmlColumns[0].find_all('div')[0]

                # Name
                htmlName = htmlInfo.find_all('a', recursive = False)[1].getText().strip()

                # Link
                htmlHash = htmlInfo.find_all('a', recursive = False)[0]['href']
                indexStart = htmlHash.find('torrent/')
                if indexStart < 0: continue
                indexStart += 8
                indexEnd = htmlHash.find('.torrent', indexStart)
                if indexEnd < 0: continue
                htmlHash = htmlHash[indexStart : indexEnd]
                if not tools.Hash.valid(htmlHash): continue
                htmlLink = network.Container(htmlHash).torrentMagnet(title = query)

                # Size
                htmlSize = htmlColumns[2].getText().strip()

                # Seeds
                htmlSeeds = int(htmlColumns[3].getText().replace(',', '').replace(' ', ''))

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, packCount = packCount, link = htmlLink, size = htmlSize, seeds = htmlSeeds)

                # Ignore
                if meta.ignore(True): continue

                # Add
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName})
                added = True

            if not added: break # Last page reached with a working torrent.

        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()
        if not (self.enabled and self.username and not self.username == '' and self.password and not self.password == ''): raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        year = int(data['year']) if 'year' in data and not data['year'] == None else None
        season = int(data['season']) if 'season' in data and not data['season'] == None else None
        episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
        pack = data['pack'] if 'pack' in data else False

        if 'tvshowtitle' in data:
            if pack: query = '%s %d' % (title, season)
            else: query = '%s S%02dE%02d' % (title, season, episode)
        else:
            query = '%s %d' % (title, year)
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        # Login
        login = urlparse.urljoin(self.base_link, '/login')
        post = urllib.urlencode({'username': self.username, 'password': self.password, 'submit': 'Login'})
        cookie = client.request(login, post = post, output = 'cookie', close = False)
        response = client.request(login, post = post, cookie = cookie, output = 'extended')
        headers = {'User-Agent': response[3]['User-Agent'], 'Cookie': response[3]['Cookie']}

        url = urlparse.urljoin(self.base_link, self.search_link)
        type = self.type_tvshows if 'tvshowtitle' in data else self.type_movies
        offset = 0

        timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8
        timer = tools.Time(start = True)

        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            urlNew = url % (urllib.quote_plus(query), type, offset)
            html = BeautifulSoup(client.request(urlNew, cookie = cookie))

            offset += self.offset

            htmlTable = html.find_all('table', id = 'browsetable')[0] # Will fail if on the last page and the table is not present.
            htmlRows = htmlTable.find_all('tr', recursive = False) # Use children and no further.

            for i in range(1, len(htmlRows)): # First row is the header.
                htmlRow = htmlRows[i]
                htmlColumns = htmlRow.find_all('td', recursive = False) # Use children and no further.
                htmlInfo = htmlColumns[1]

                # Name
                htmlName = htmlInfo.find_all('a', class_ = 'title')[0].getText().strip()

                # Size
                htmlSize = htmlColumns[4].getText()
                indexEnd = htmlSize.find('<br')
                if indexEnd >= 0: htmlSize = htmlSize[: indexEnd].replace('"', '')

                # Link
                htmlLink = self.base_link + htmlColumns[7].find_all('div', class_ = 'icon_nzb')[0].find_all('a')[0]['href']
                htmlLink += '|' + urllib.urlencode(headers)

                # Age
                htmlAge = htmlColumns[3]['title']
                htmlAge = tools.Time.datetime(htmlAge, '%Y-%m-%d %H:%M:%S')
                htmlAge = datetime.datetime.today() - htmlAge
                htmlAge = htmlAge.days

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, age = htmlAge)

                # Ignore
                if meta.ignore(False): continue

                # Ignore Incomplete
                try:
                    htmlComplete = htmlColumns[4].find_all('span', class_ = 'label-success')[0].getText()
                    if not '100' in htmlComplete: continue
                except:
                    pass

                # Ignore Foreign
                if self.exclude_foreign:
                    htmlCategory = htmlColumns[2].find_all('a')[0].getText()
                    if 'foreign' in htmlCategory.lower(): continue

                # Add
                # Some NZBs have the wrong size (often a few KB) indicated on the site, but are in reality bigger. Hence, do not show the size of NZBs below 20MB, but still add them.
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'usenet', 'memberonly' : True, 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName})

        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        year = int(data['year']) if 'year' in data and not data['year'] == None else None
        season = int(data['season']) if 'season' in data and not data['season'] == None else None
        episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
        pack = data['pack'] if 'pack' in data else False

        if 'tvshowtitle' in data:
            if pack: query = '%s %d' % (title, season)
            else: query = '%s S%02dE%02d' % (title, season, episode)
        else:
            query = '%s %d' % (title, year)
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        url = urlparse.urljoin(self.base_link, self.search_link)

        page = 0 # Pages start at 0.
        added = False

        #while True:
        while page == 0: # KickassTorrents currently has a problem viewing any page other than page 1 while sorted by seeders. Only view the first page.
            urlNew = url % (urllib.quote_plus(query))
            html = client.request(urlNew)

            # KickassTorrents has major mistakes in its HTML. Manually remove parts to create new HTML.
            indexStart = html.find('<', html.find('<!-- Start of Loop -->') + 1)
            indexEnd = html.rfind('<!-- End of Loop -->')
            html = html[indexStart : indexEnd]
            html = html.replace('<div class="markeredBlock', '</div><div class="markeredBlock') # torrentname div tag not closed.
            html = html.replace('</span></td>', '</td>') # Dangling </span> closing tag.

            html = BeautifulSoup(html)

            page += 1
            added = False

            htmlRows = html.find_all('tr', recursive = False) # Do not search further down the tree (just the direct children).
            for i in range(len(htmlRows)):
                htmlRow = htmlRows[i]
                if 'firstr' in htmlRow['class']: # Header.
                    continue
                htmlColumns = htmlRow.find_all('td')
                htmlInfo = htmlColumns[0]

                # Name
                htmlName = htmlInfo.find_all('a', class_='cellMainLink')[0].getText().strip()

                # Size
                htmlSize = htmlColumns[1].getText().replace(u'\xa0', ' ') # Replace non-breaking spaces.

                # Link
                htmlLink = ''
                htmlLinks = htmlInfo.find_all('a')
                for j in range(len(htmlLinks)):
                    link = htmlLinks[j]
                    if link.has_attr('href'):
                        link = link['href']
                        if link.startswith('magnet:'):
                            htmlLink = link
                            break

                # Seeds
                htmlSeeds = int(htmlColumns[3].getText())

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, seeds = htmlSeeds)

                # Ignore
                if meta.ignore(True): continue

                # Add
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName})
                added = True

            if not added: break # Last page reached with a working torrent.

        return sources
    except:
        return sources
def account(self, cached = True, minimal = False):
    account = None
    try:
        if self.accountValid():
            import datetime
            from resources.lib.externals.beautifulsoup import BeautifulSoup

            if cached: accountHtml = cache.Cache().cacheShort(self._request, Core.LinkAccount)
            else: accountHtml = cache.Cache().cacheClear(self._request, Core.LinkAccount)
            if accountHtml == None or accountHtml == '': raise Exception()

            accountHtml = BeautifulSoup(accountHtml)
            accountHtml = accountHtml.find_all('form', id = 'accountForm')[0]
            accountHtml = accountHtml.find_all('table', recursive = False)[0]
            accountHtml = accountHtml.find_all('tr', recursive = False)

            accountUsername = accountHtml[0].find_all('td', recursive = False)[1].getText()
            accountType = accountHtml[1].find_all('td', recursive = False)[2].getText()
            accountStatus = accountHtml[3].find_all('td', recursive = False)[2].getText()

            accountExpiration = accountHtml[2].find_all('td', recursive = False)[2].getText()
            accountTimestamp = convert.ConverterTime(accountExpiration, format = convert.ConverterTime.FormatDate).timestamp()
            accountExpiration = datetime.datetime.fromtimestamp(accountTimestamp)

            account = {
                'user' : accountUsername,
                'type' : accountType,
                'status' : accountStatus,
                'expiration' : {
                    'timestamp' : accountTimestamp,
                    'date' : accountExpiration.strftime('%Y-%m-%d'),
                    'remaining' : (accountExpiration - datetime.datetime.today()).days,
                }
            }

            if not minimal:
                if cached: usageHtml = cache.Cache().cacheShort(self._request, Core.LinkUsage)
                else: usageHtml = cache.Cache().cacheClear(self._request, Core.LinkUsage)
                if usageHtml == None or usageHtml == '': raise Exception()

                usageHtml = BeautifulSoup(usageHtml)
                usageHtml = usageHtml.find_all('div', class_ = 'table-responsive')[0]
                usageHtml = usageHtml.find_all('table', recursive = False)[0]
                usageHtml = usageHtml.find_all('tr', recursive = False)

                usageTotal = usageHtml[0].find_all('td', recursive = False)[1].getText()
                index = usageTotal.find('(')
                if index >= 0: usageTotal = int(usageTotal[index + 1 : usageTotal.find(' ', index)].replace(',', '').strip())
                else: usageTotal = 0

                usageConsumed = usageHtml[1].find_all('td', recursive = False)[2].getText()
                index = usageConsumed.find('(')
                if index >= 0: usageConsumed = int(usageConsumed[index + 1 : usageConsumed.find(' ', index)].replace(',', '').strip())
                else: usageConsumed = 0

                usageWeb = usageHtml[2].find_all('td', recursive = False)[2].getText()
                index = usageWeb.find('(')
                if index >= 0: usageWeb = int(usageWeb[index + 1 : usageWeb.find(' ', index)].replace(',', '').strip())
                else: usageWeb = 0

                usageNntp = usageHtml[3].find_all('td', recursive = False)[2].getText()
                index = usageNntp.find('(')
                if index >= 0: usageNntp = int(usageNntp[index + 1 : usageNntp.find(' ', index)].replace(',', '').strip())
                else: usageNntp = 0

                usageNntpUnlimited = usageHtml[4].find_all('td', recursive = False)[2].getText()
                index = usageNntpUnlimited.find('(')
                if index >= 0: usageNntpUnlimited = int(usageNntpUnlimited[index + 1 : usageNntpUnlimited.find(' ', index)].replace(',', '').strip())
                else: usageNntpUnlimited = 0

                usageRemaining = usageHtml[5].find_all('td', recursive = False)[2].getText()
                index = usageRemaining.find('(')
                if index >= 0: usageRemaining = int(usageRemaining[index + 1 : usageRemaining.find(' ', index)].replace(',', '').strip())
                else: usageRemaining = 0

                usageLoyalty = usageHtml[6].find_all('td', recursive = False)[2].getText()
                index = usageLoyalty.find('(')
                if index >= 0:
                    usageLoyaltyTime = usageLoyalty[:index].strip()
                    usageLoyaltyTimestamp = convert.ConverterTime(usageLoyaltyTime, format = convert.ConverterTime.FormatDate).timestamp()
                    usageLoyaltyTime = datetime.datetime.fromtimestamp(usageLoyaltyTimestamp)
                    usageLoyaltyPoints = float(usageLoyalty[index + 1 : usageLoyalty.find(')', index)].strip())
                else:
                    usageLoyaltyTimestamp = 0
                    usageLoyaltyTime = None
                    usageLoyaltyPoints = 0 # Avoid an undefined variable when no loyalty points are listed.

                usagePercentageRemaining = usageRemaining / float(usageTotal)
                usagePercentageConsumed = usageConsumed / float(usageTotal)
                usagePercentageWeb = usageWeb / float(usageTotal)
                usagePercentageNntp = usageNntp / float(usageTotal)
                usagePercentageNntpUnlimited = usageNntpUnlimited / float(usageTotal)

                account.update({
                    'loyalty' : {
                        'time' : {
                            'timestamp' : usageLoyaltyTimestamp,
                            'date' : usageLoyaltyTime.strftime('%Y-%m-%d') if usageLoyaltyTime else None,
                        },
                        'points' : usageLoyaltyPoints,
                    },
                    'usage' : {
                        'total' : {
                            'size' : {
                                'bytes' : usageTotal,
                                'description' : convert.ConverterSize(float(usageTotal)).stringOptimal(),
                            },
                        },
                        'remaining' : {
                            'value' : usagePercentageRemaining,
                            'percentage' : round(usagePercentageRemaining * 100.0, 1),
                            'size' : {
                                'bytes' : usageRemaining,
                                'description' : convert.ConverterSize(float(usageRemaining)).stringOptimal(),
                            },
                            'description' : '%.0f%%' % round(usagePercentageRemaining * 100.0, 0), # Must round, otherwise 2.5% changes to 2% instead of 3%.
                        },
                        'consumed' : {
                            'value' : usagePercentageConsumed,
                            'percentage' : round(usagePercentageConsumed * 100.0, 1),
                            'size' : {
                                'bytes' : usageConsumed,
                                'description' : convert.ConverterSize(usageConsumed).stringOptimal(),
                            },
                            'description' : '%.0f%%' % round(usagePercentageConsumed * 100.0, 0), # Must round, otherwise 2.5% changes to 2% instead of 3%.
                            'web' : {
                                'value' : usagePercentageWeb,
                                'percentage' : round(usagePercentageWeb * 100.0, 1),
                                'size' : {
                                    'bytes' : usageWeb,
                                    'description' : convert.ConverterSize(usageWeb).stringOptimal(),
                                },
                                'description' : '%.0f%%' % round(usagePercentageWeb * 100.0, 0), # Must round, otherwise 2.5% changes to 2% instead of 3%.
                            },
                            'nntp' : {
                                'value' : usagePercentageNntp,
                                'percentage' : round(usagePercentageNntp * 100.0, 1),
                                'size' : {
                                    'bytes' : usageNntp,
                                    'description' : convert.ConverterSize(usageNntp).stringOptimal(),
                                },
                                'description' : '%.0f%%' % round(usagePercentageNntp * 100.0, 0), # Must round, otherwise 2.5% changes to 2% instead of 3%.
                            },
                            'nntpunlimited' : {
                                'value' : usagePercentageNntpUnlimited,
                                'percentage' : round(usagePercentageNntpUnlimited * 100.0, 1),
                                'size' : {
                                    'bytes' : usageNntpUnlimited,
                                    'description' : convert.ConverterSize(usageNntpUnlimited).stringOptimal(),
                                },
                                'description' : '%.0f%%' % round(usagePercentageNntpUnlimited * 100.0, 0), # Must round, otherwise 2.5% changes to 2% instead of 3%.
                            },
                        }
                    }
                })
    except:
        pass
    return account
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if not tools.System.developers(): raise Exception()
        if url == None: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        year = int(data['year']) if 'year' in data and not data['year'] == None else None
        season = int(data['season']) if 'season' in data and not data['season'] == None else None
        episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
        pack = data['pack'] if 'pack' in data else False

        category = self.category_tvshows if ('tvshowtitle' in data and not data['tvshowtitle'] == None and not data['tvshowtitle'] == '') else self.category_movies

        if 'tvshowtitle' in data:
            if pack: query = '%s %d' % (title, season)
            else: query = '%s S%02dE%02d' % (title, season, episode)
        else:
            query = '%s %d' % (title, year)
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        url = urlparse.urljoin(self.base_link, self.search_link)

        page = 1 # Pages start at 1.
        added = False

        timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8
        timer = tools.Time(start = True)

        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            urlNew = url % (urllib.quote_plus(query), category, page)
            data = client.request(urlNew)

            # RarBg's HTML is not valid and a total mess, probably to make it hard for scrapers.
            # First try to parse the HTML. If it fails, extract only the table from the markup and construct new HTML.
            # Sometimes both fail, since RarBg seems to randomize the corruption in its HTML.
            htmlRows = []
            try:
                html = BeautifulSoup(data)
                htmlTable = html.find_all('table', class_ = 'lista2t')[0]
                htmlRows = htmlTable.find_all('tr', class_ = 'lista2', recursive = False)
                if len(htmlRows) == 0: raise Exception()
            except:
                start = data.find('lista2t')
                if start < 0: raise Exception()
                start += 7
                start = data.find('lista2', start)
                start = data.find('>', start) + 1
                end = data.find('<tr><td align="center" colspan="2">', start)
                data = '<html><body><table class="lista2t"><tr class="lista2">' + data[start : end] + '</table></body></html>'
                html = BeautifulSoup(data)
                htmlTable = html.find_all('table', class_ = 'lista2t')[0]
                htmlRows = htmlTable.find_all('tr', class_ = 'lista2', recursive = False)

            page += 1
            added = False

            for i in range(len(htmlRows)):
                htmlRow = htmlRows[i]
                htmlColumns = htmlRow.find_all('td')
                htmlInfo = htmlColumns[1]

                # Name
                htmlName = htmlInfo.find_all('a')[0].getText().strip()

                # 3D
                htmlImages = htmlInfo.find_all('img')
                for j in range(len(htmlImages)):
                    try:
                        if htmlImages[j]['src'].endswith('3d.png'):
                            htmlName += ' 3D'
                            break
                    except:
                        pass

                # Size
                htmlSize = htmlColumns[3].getText().strip()

                # Link
                # TODO: If the hash cannot be retrieved from the mouse-over image, fall back to the .torrent file.
                try:
                    htmlLink = htmlInfo.find_all('a')[0]['onmouseover']
                    start = htmlLink.find('/over/')
                    if start < 0: raise Exception()
                    start += 6
                    end = htmlLink.find('.', start)
                    htmlLink = htmlLink[start : end]
                    if not len(htmlLink) == 40: raise Exception()
                    htmlLink = self.magnet_link % (htmlLink, htmlName.replace(' ', ''))
                except:
                    try:
                        htmlLink = htmlInfo.find_all('a')[0]['href']
                        start = htmlLink.find('torrent/')
                        if start < 0: raise Exception()
                        start += 8
                        htmlLink = htmlLink[start:]
                        if len(htmlLink) == 0: raise Exception()
                        htmlLink = self.torrent_link % (htmlLink, htmlName.replace(' ', ''))
                    except:
                        continue

                # Seeds
                htmlSeeds = int(htmlColumns[4].getText().strip())

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, seeds = htmlSeeds)

                # Ignore
                if meta.ignore(True): continue

                # Add
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName})
                added = True

            if not added: break # Last page reached with a working torrent.

        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False if 'tvshowtitle' in data: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = urlparse.urljoin(self.base_link, self.search_link) page = 0 # Pages start at 0 added = False timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8 timer = tools.Time(start = True) while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break urlNew = url % (urllib.quote_plus(query), page) html = BeautifulSoup(client.request(urlNew)) page += 1 added = False htmlTable = html.find_all('table', id = 'searchResult')[0] htmlRows = htmlTable.find_all('tr', recursive = False) # Do not search further down the tree (just the direct children), because that will also retrieve the header row. for i in range(len(htmlRows)): htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td') htmlInfo = htmlColumns[1] # Name htmlName = htmlInfo.find_all('div', class_ = 'detName')[0].find_all('a')[0].getText().strip() # Size htmlSize = htmlInfo.find_all('font', class_ = 'detDesc')[0].getText().replace(' ', ' ') indexStart = htmlSize.find(', Size') indexEnd = htmlSize.find(', ', indexStart + 1) htmlSize = htmlSize[indexStart + 7 : indexEnd] # Link htmlLink = '' htmlLinks = htmlInfo.find_all('a') for j in range(len(htmlLinks)): link = htmlLinks[j]['href'] if link.startswith('magnet:'): htmlLink = link break # Seeds htmlSeeds = int(htmlColumns[2].getText()) # Metadata meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, seeds = htmlSeeds) # Ignore if meta.ignore(True): continue # Add sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName}) added = True if not added: # Last page reached with a working torrent break return sources except: return sources
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        type = self.type_tvshows if ('tvshowtitle' in data and not data['tvshowtitle'] == None and not data['tvshowtitle'] == '') else self.type_movies

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        titleYear = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else '%s (%s)' % (data['title'], data['year'])
        year = int(data['year']) if 'year' in data and not data['year'] == None else None
        season = int(data['season']) if 'season' in data and not data['season'] == None else None
        episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
        pack = data['pack'] if 'pack' in data else False

        if 'tvshowtitle' in data:
            if pack: query = '%s %d' % (title, season)
            else: query = '%s S%02dE%02d' % (title, season, episode)
        else:
            query = '%s %d' % (title, year)
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        url = urlparse.urljoin(self.base_link, self.search_link)

        page = 1
        added = False

        timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8
        timer = tools.Time(start = True)

        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            urlNew = url % (urllib.quote_plus(query), type, page)
            html = BeautifulSoup(client.request(urlNew))

            page += 1
            added = False

            htmlTable = html.find_all('div', id = 'div2child')[0]
            htmlRows = htmlTable.find_all('div', class_ = 'resultdiv', recursive = False) # Do not search further down the tree (just the direct children).

            for i in range(len(htmlRows)):
                htmlRow = htmlRows[i]
                htmlInfo = htmlRow.find_all('div', class_ = 'resultdivbotton')[0]

                # Name
                htmlName = htmlRow.find_all('div', class_ = 'resultdivtop')[0].find_all('div', class_ = 'resultdivtopname')[0].getText().strip()

                # Size
                htmlSize = htmlInfo.find_all('div', class_ = 'resultlength')[0].find_all('div', class_ = 'resultdivbottonlength')[0].getText()

                # Link
                htmlHash = htmlInfo.find_all('div', class_ = 'hideinfohash')[0].getText()
                htmlLink = network.Container(htmlHash).torrentMagnet(title = titleYear)

                # Seeds
                htmlSeeds = int(htmlInfo.find_all('div', class_ = 'resultseed')[0].find_all('div', class_ = 'resultdivbottonseed')[0].getText())

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, seeds = htmlSeeds)

                # Ignore
                if meta.ignore(True): continue

                # Add
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName})
                added = True

            if not added: break # Last page reached with a working torrent.

        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    sources = []
    try:
        if url == None: raise Exception()

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        year = int(data['year']) if 'year' in data and not data['year'] == None else None
        season = int(data['season']) if 'season' in data and not data['season'] == None else None
        episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None
        pack = data['pack'] if 'pack' in data else False

        category = self.category_show if 'tvshowtitle' in data else self.category_movie

        if 'tvshowtitle' in data:
            if pack: query = '%s %d' % (title, season)
            else: query = '%s S%02dE%02d' % (title, season, episode)
        else:
            query = '%s %d' % (title, year)
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        querySplit = query.split()

        # Login
        if self.enabled and self.username and not self.username == '' and self.password and not self.password == '':
            login = self.base_link + self.login_link
            post = urllib.urlencode({'username': self.username, 'password': self.password, 'submit': 'submit'})
            cookie = client.request(login, post = post, output = 'cookie', close = False)
            response = client.request(login, post = post, cookie = cookie, output = 'extended')
            headers = {'User-Agent': response[3]['User-Agent'], 'Cookie': response[3]['Cookie']}
        else:
            cookie = None
            headers = None

        url = urlparse.urljoin(self.base_link, self.search_link)

        page = 1
        added = False
        firstLink = None

        timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8
        timer = tools.Time(start = True)

        while True:
            # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links.
            if timer.elapsed() > timerEnd: break

            urlNew = url % (urllib.quote_plus(query), urllib.quote_plus(category), page)
            html = BeautifulSoup(client.request(urlNew, cookie = cookie))

            page += 1
            added = False

            htmlTable = html.find_all('table', id = 'torrenttable')[0].find_all('tbody')[0]
            htmlRows = htmlTable.find_all('tr', recursive = False) # Do not search further down the tree (just the direct children), because that will also retrieve the header row.

            for i in range(len(htmlRows)):
                htmlRow = htmlRows[i]

                # Name
                htmlName = htmlRow.find_all('td', class_ = 'name', recursive = False)[0]
                htmlName = htmlName.find_all('span', class_ = 'title', recursive = False)[0]
                htmlName = htmlName.find_all('a')[0].getText().strip()

                # Link
                htmlLink = htmlRow.find_all('td', class_ = 'quickdownload', recursive = False)[0]
                htmlLink = htmlLink.find_all('a')[0]['href']

                # Requesting pages past the last one always returns the torrents of the last page.
                # Stop once the first link is the same.
                if i == 0:
                    if firstLink == htmlLink: break
                    firstLink = htmlLink

                if not headers == None: htmlLink += '|' + urllib.urlencode(headers)

                # Size
                htmlSize = htmlRow.find_all('td')[4].getText().strip()

                # Seeds
                htmlSeeds = htmlRow.find_all('td', class_ = 'seeders')[0].getText().strip()

                # Metadata
                meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, seeds = htmlSeeds)

                # Ignore
                if meta.ignore(True): continue

                # Add
                sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName})
                added = True

            if not added: break # Last page reached with a working torrent.

        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) pack = None if 'exact' in data and data['exact']: query = title = data[ 'tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = None season = None episode = None pack = False packCount = None else: title = data['tvshowtitle'] if 'tvshowtitle' in data else data[ 'title'] year = int( data['year'] ) if 'year' in data and not data['year'] == None else None season = int( data['season'] ) if 'season' in data and not data['season'] == None else None episode = int( data['episode']) if 'episode' in data and not data[ 'episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if 'tvshowtitle' in data: # Only this format works for season packs. # Does not support individual episodes. if pack: query = '%s S%02d' % (title, season) else: pack = True query = '%s сезон %d' % (title, season) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = urlparse.urljoin(self.base_link, self.search_link) % urllib.quote_plus(query) html = BeautifulSoup(client.request(url)) htmlTable = html.find_all( 'table', class_='tablesorter')[0].find_all('tbody', recursive=False)[0] htmlRows = htmlTable.find_all('tr', recursive=False) for i in range(len(htmlRows)): htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td') # Name htmlName = htmlColumns[1].find_all('a')[0].getText().strip() # Link htmlLink = self.base_link + self.download_link + htmlColumns[ 2].find_all('a')[0]['href'] # Size htmlSize = long( htmlColumns[3].find_all('u')[0].getText().strip()) # Seeds try: htmlSeeds = int(htmlColumns[4].getText().strip()) except: htmlSeeds = None # Metadata meta = metadata.Metadata(name=htmlName, title=title, year=year, season=season, episode=episode, pack=pack, packCount=packCount, link=htmlLink, size=htmlSize, seeds=htmlSeeds) # Ignore if meta.ignore(True): continue # Add sources.append({ 'url': htmlLink, 'debridonly': False, 'direct': False, 'source': 'torrent', 'language': self.language[0], 'quality': meta.videoQuality(), 'metadata': meta, 'file': htmlName, 'pack': pack }) return sources except: return sources
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False category = self.category_shows if 'tvshowtitle' in data else self.category_movies if 'tvshowtitle' in data: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = urlparse.urljoin(self.base_link, self.search_link) page = 1 # Pages start at 1 added = False timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8 timer = tools.Time(start = True) while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break urlNew = url % (category, urllib.quote_plus(query), page) html = client.request(urlNew) # HTML is corrupt. Try to fix it manually. indexStart = html.find('class="table2"') indexStart = html.find('<tr bgcolor', indexStart) indexEnd = html.find('search_stat', indexStart) html = html[indexStart : indexEnd] indexEnd = html.rfind('</td>') + 5 html = html[:indexEnd] html = html.replace('</a></td>', '</td>') html = '<table>' + html + '</tr></table>' html = BeautifulSoup(html) page += 1 added = False htmlRows = html.find_all('tr') # The rows come from the manually reassembled table above, so there is no header row to skip. for i in range(len(htmlRows)): htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td') htmlInfo = htmlColumns[0].find_all('div')[0] # Name htmlName = htmlInfo.find_all('a', recursive = False)[1].getText().strip() # Link htmlHash = htmlInfo.find_all('a', recursive = False)[0]['href'] indexStart = htmlHash.find('torrent/') if indexStart < 0: continue indexStart += 8 indexEnd = htmlHash.find('.torrent', indexStart) if indexEnd < 0: continue htmlHash = htmlHash[indexStart : indexEnd] if not tools.Hash.valid(htmlHash): continue htmlLink = network.Container(htmlHash).torrentMagnet(title = query) # Size htmlSize = htmlColumns[2].getText().strip() # Seeds htmlSeeds = int(htmlColumns[3].getText().replace(',', '').replace(' ', '')) # Metadata meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, seeds = htmlSeeds) # Ignore if meta.ignore(True): continue # Add sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName}) added = True if not added: # Last page reached with a working torrent break return sources except: return sources
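# The repair above works by slicing the corrupt page between known markers and wrapping the remaining rows in a fresh table. A generic sketch of that slice-and-rewrap step, assuming the same BeautifulSoup import the providers use; the marker parameters are illustrative:
from bs4 import BeautifulSoup

def repairFragment(html, startMarker, endMarker):
    # Cut out the fragment between the two markers.
    indexStart = html.find(startMarker)
    if indexStart < 0: return None
    indexEnd = html.find(endMarker, indexStart)
    if indexEnd < 0: indexEnd = len(html)
    # Wrap the rows in a table so the parser has a valid root element.
    return BeautifulSoup('<table>' + html[indexStart : indexEnd] + '</table>')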
def sources(self, url, hostDict, hostprDict): self.tSources = [] try: if url == None: raise Exception() if not self.enabled or self.username == '' or self.password == '': raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) show = 'tvshowtitle' in data title = data['tvshowtitle'] if show else data['title'] titleYear = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) if show else '%s (%s)' % (data['title'], data['year']) if 'exact' in data and data['exact']: query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = None season = None episode = None pack = False packCount = None else: year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if show: subcategory = self.subcategories_show.values()[0] if len(self.subcategories_show) == 1 else self.subcategory_any else: subcategory = self.subcategories_movie.values()[0] if len(self.subcategories_movie) == 1 else self.subcategory_any if show: if pack: query = '%s S%02d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) querySplit = query.split() url = urlparse.urljoin(self.base_link, self.search_link) query = urllib.quote_plus(query) pageLimit = tools.Settings.getInteger('scraping.providers.pages') pageCounter = 0 page = 0 added = False timerTimeout = tools.Settings.getInteger('scraping.providers.timeout') timerEnd = timerTimeout - 8 timer = tools.Time(start = True) threads = [] self.tLock = threading.Lock() while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break pageCounter += 1 if pageLimit > 0 and pageCounter > pageLimit: break urlNew = url % (self.category_video, subcategory, query, page) html = BeautifulSoup(client.request(urlNew)) page += 25 added = False htmlTables = html.find_all('table', class_ = 'table') if htmlTables: htmlTable = htmlTables[0] htmlTbody = htmlTable.find_all('tbody')[0] htmlRows = htmlTbody.find_all('tr', recursive = False) for i in range(len(htmlRows)): # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. 
if timer.elapsed() > timerEnd: break htmlRow = htmlRows[i] # Name htmlInfo = htmlRows[i].find_all('a', href = True)[1] htmlName = htmlInfo.getText() # Category if subcategory is self.subcategory_any: htmlCategory = htmlRow.find_all('div', class_ = 'hidden')[0].getText() if show and len(self.subcategories_show) > 1: if htmlCategory not in self.subcategories_show.keys(): continue elif len(self.subcategories_movie) > 1: if htmlCategory not in self.subcategories_movie.keys(): continue # Size (convert French octet units like Mo/Go to Mb/Gb). htmlSize = re.sub('([mMkKgGtT]?)[oO]', '\\1b', htmlRow.find_all('td')[5].getText()) # Link htmlLink = self.base_link + self.download_link + str(htmlInfo.get('href').encode('utf-8')).split('/')[-1].split('-')[0] # Seeds htmlSeeds = int(htmlRow.find_all('td')[7].getText()) # Metadata meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, packCount = packCount, link = htmlLink, size = htmlSize, seeds = htmlSeeds) # Ignore if meta.ignore(True): continue # Add self.tLock.acquire() self.tSources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName}) self.tLock.release() added = True # Hash if self.inspection: htmlHash = urllib.quote(str(htmlInfo.get('href').encode('utf-8')), ':/+') thread = threading.Thread(target = self._hash, args = (htmlHash, len(self.tSources) - 1)) threads.append(thread) thread.start() if not added: # Last page reached with a working torrent break # First filter out all non-related links before doing the hash lookup. if self.inspection: timerTimeout -= 2 while True: if timer.elapsed() > timerTimeout: break if not any([thread.is_alive() for thread in threads]): break tools.Time.sleep(0.3) try: self.tLock.release() except: pass return self.tSources except: tools.Logger.error() try: self.tLock.release() except: pass return self.tSources
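# Both threaded providers above wait for their workers with one overall deadline instead of joining each thread, so a single slow request cannot consume the entire scraping timeout. The same pattern as a standalone sketch:
import threading
import time

def waitForThreads(threads, timeout):
    # Poll until every thread has finished or the deadline has passed.
    end = time.time() + timeout
    while time.time() < end:
        if not any(thread.is_alive() for thread in threads): break
        time.sleep(0.3)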
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) pack = None if 'exact' in data and data['exact']: query = title = data[ 'tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = None season = None episode = None pack = False packCount = None else: title = data['tvshowtitle'] if 'tvshowtitle' in data else data[ 'title'] year = int( data['year'] ) if 'year' in data and not data['year'] == None else None season = int( data['season'] ) if 'season' in data and not data['season'] == None else None episode = int( data['episode']) if 'episode' in data and not data[ 'episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if 'tvshowtitle' in data: if pack: query = '%s saison %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = title # Do not include year, otherwise there are few results. query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) type = self.type_shows if 'tvshowtitle' in data else self.type_movies url = urlparse.urljoin(self.base_link, self.search_link) % ( type, urllib.quote_plus(query)) html = BeautifulSoup(client.request(url)) htmlTable = html.find_all( 'table', class_='cust-table')[0].find_all('tbody', recursive=False)[0] htmlRows = htmlTable.find_all('tr', recursive=False) self.tLock = threading.Lock() self.tLinks = [None] * len(htmlRows) threads = [] for i in range(len(htmlRows)): urlTorrent = self.base_link + htmlRows[i].find_all( 'td', recursive=False)[0].find_all('a')[0]['href'] threads.append( threading.Thread(target=self._link, args=(urlTorrent, i))) [thread.start() for thread in threads] timerEnd = tools.Settings.getInteger( 'scraping.providers.timeout') - 8 timer = tools.Time(start=True) while timer.elapsed() < timerEnd and any( [thread.is_alive() for thread in threads]): tools.Time.sleep(0.5) self.tLock.acquire( ) # Just lock in case the threads are still running. for i in range(len(htmlRows)): # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td', recursive=False) # Name htmlName = htmlColumns[0].getText().strip() if not 'tvshowtitle' in data: htmlName = re.sub( r"^(.*?)(TRUE|TRUEFRENCH|FRENCH|VOSTFR|VO)(.*)([0-9]{4})$", r"\1 \4 \2\3", htmlName) # Link htmlLink = self.tLinks[i] # Size htmlSize = htmlColumns[1].getText().strip().lower().replace( ' mo', 'MB').replace(' go', 'GB').replace(' o', 'b') # Seeds try: htmlSeeds = int(htmlColumns[2].getText().strip()) except: htmlSeeds = None # Metadata meta = metadata.Metadata(name=htmlName, title=title, year=year, season=season, episode=episode, pack=pack, packCount=packCount, link=htmlLink, size=htmlSize, seeds=htmlSeeds) # Ignore if meta.ignore(False): continue # Add sources.append({ 'url': htmlLink, 'debridonly': False, 'direct': False, 'source': 'torrent', 'language': self.language[0], 'quality': meta.videoQuality(), 'metadata': meta, 'file': htmlName }) self.tLock.release() return sources except: tools.Logger.error() return sources
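# The regex above moves the year in front of the French language tag so the release name parses like a standard scene name. The same substitution as a worked example (input and output are illustrative):
import re

def reorderName(name):
    return re.sub(r'^(.*?)(TRUE|TRUEFRENCH|FRENCH|VOSTFR|VO)(.*)([0-9]{4})$', r'\1 \4 \2\3', name)

# reorderName('Avatar FRENCH BluRay 1080p 2009')
# returns 'Avatar  2009 FRENCH BluRay 1080p ' (the stray spaces are harmless to the later name parsing).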
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False if 'tvshowtitle' in data: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) querySplit = query.split() url = urlparse.urljoin(self.base_link, self.search_link) page = 0 # Pages start at 0 added = False timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8 timer = tools.Time(start = True) while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break urlNew = url % (page, urllib.quote_plus(query)) html = BeautifulSoup(client.request(urlNew)) page += 1 added = False htmlTable = html.find_all('div', id = 'ires')[0].find_all('ol', recursive = False)[0] htmlRows = htmlTable.find_all('li', recursive = False) # Do not search further down the tree (just the direct children), because that will also retrieve the header row. for i in range(len(htmlRows)): row1 = htmlRows[i].find_all('h3', class_ = 'r')[0] row2 = htmlRows[i].find_all('div', class_ = 'sti')[0] # Name htmlName = row1.find_all('a', class_ = 'tl', recursive = False)[0].getText().strip() # Link htmlHash = row1.find_all('a', class_ = 'tl', recursive = False)[0]['href'] if htmlHash.startswith('/'): htmlHash = htmlHash[1:] index = htmlHash.find('/') if index > 0: htmlHash = htmlHash[:index] if not tools.Hash.valid(htmlHash): continue htmlLink = network.Container(htmlHash).torrentMagnet(title = query) # Size htmlSize = row2.find_all('span', class_ = 'torrent-size')[0].getText().strip() # Seeds htmlSeeds = int(row2.find_all('span', class_ = 'seeders')[0].find_all('span', class_ = 'gac_b')[0].getText().strip()) # Metadata meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, seeds = htmlSeeds) # Ignore if meta.ignore(True): continue # Ignore Name # TorrentProject has a lot of season packs, foreign titles, and other torrents that should be excluded. If the name does not contain the exact search string, ignore the result. if not all(q in htmlName for q in querySplit): continue # Add sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(), 'file' : htmlName}) added = True if not added: # Last page reached with a working torrent break return sources except: return sources
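# Several providers above only scrape an info hash from the page and then build a magnet link from it via network.Container. A minimal standalone equivalent, assuming a 40-character hexadecimal SHA-1 info hash; the real helper probably also appends trackers:
import re
import urllib

def magnetFromHash(infoHash, name):
    # Validate the hash before building the link.
    if not re.match('^[a-fA-F0-9]{40}$', infoHash): return None
    return 'magnet:?xt=urn:btih:%s&dn=%s' % (infoHash, urllib.quote_plus(name))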
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) if 'exact' in data and data['exact']: query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = None season = None episode = None pack = False packCount = None else: title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if 'tvshowtitle' in data: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = urlparse.urljoin(self.base_link, self.search_link) category = self.category_shows if ('tvshowtitle' in data and not data['tvshowtitle'] == None and not data['tvshowtitle'] == '') else self.category_movies pageLimit = tools.Settings.getInteger('scraping.providers.pages') pageCounter = 0 page = 1 added = False timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8 timer = tools.Time(start = True) while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break pageCounter += 1 if pageLimit > 0 and pageCounter > pageLimit: break urlNew = url % (category, urllib.quote_plus(query), page) html = BeautifulSoup(client.request(urlNew)) page += 1 added = False # NB: Do not use "tbody class=results", since the table has inner div/style that breaks parsing. htmlRows = html.find_all('tr', class_ = 'result') # Selecting the rows by class also skips the header row. for i in range(len(htmlRows)): try: htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td', recursive = False) # Name htmlName = htmlColumns[0].find_all('a')[0].getText().strip() # Size htmlSize = htmlColumns[1].getText().strip() # Link htmlLink = '' htmlLinks = htmlColumns[0].find_all('a') for j in range(len(htmlLinks)): link = htmlLinks[j]['href'] if link.startswith('magnet:'): htmlLink = link break # Seeds htmlSeeds = int(re.sub('[^0-9]', '', htmlColumns[4].getText().strip())) # Metadata meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, packCount = packCount, link = htmlLink, size = htmlSize, seeds = htmlSeeds) # Ignore if meta.ignore(True): continue # Add sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName}) added = True except: pass if not added: # Last page reached with a working torrent break return sources except: return sources
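# The magnet link above is one of several anchors in the name column; several providers here use the same scan. It reduces to a small helper that returns the first magnet href from a list of anchor tags:
def firstMagnet(anchors):
    for anchor in anchors:
        link = anchor.get('href')
        if link and link.startswith('magnet:'): return link
    return ''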
def sources(self, url, hostDict, hostprDict): self.tSources = [] try: if url == None: raise Exception() ignoreContains = None data = self._decode(url) if 'exact' in data and data['exact']: query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] titles = None year = None season = None episode = None pack = False packCount = None else: title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] titles = data['alternatives'] if 'alternatives' in data else None year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if 'tvshowtitle' in data: # Search special episodes by name. All special episodes are added to season 0 by Trakt and TVDb. Hence, do not search by filename (eg: S02E00), since the season is not known. if (season == 0 or episode == 0) and ('title' in data and not data['title'] == None and not data['title'] == ''): title = '%s %s' % (data['tvshowtitle'], data['title']) # Change the title for metadata filtering. query = title ignoreContains = len(data['title']) / float(len(title)) # Increase the required ignore ratio, since otherwise individual episodes and season packs are found as well. else: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) query = urllib.quote_plus(query) if not self._query(query): return self.tSources url = urlparse.urljoin(self.base_link, self.search_link) pageLimit = tools.Settings.getInteger('scraping.providers.pages') pageCounter = 0 # Page starts at 1, but incremented before first request. timerTimeout = tools.Settings.getInteger('scraping.providers.timeout') timerEnd = timerTimeout - 8 timer = tools.Time(start = True) threads = [] self.tLock = threading.Lock() while True: try: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break added = False pageCounter += 1 if pageLimit > 0 and pageCounter > pageLimit: break html = BeautifulSoup(client.request(url % (query, pageCounter))) htmlTable = html.find_all('table', class_ = 'results') htmlTable = htmlTable[len(htmlTable) - 1] htmlRows = htmlTable.find_all('tr') for i in range(1, len(htmlRows)): try: htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td', recursive = False) # Use children and no further.
# Name htmlName = htmlColumns[0].find_all('a')[0].getText() # Link htmlLink = urlparse.urljoin(self.base_link, htmlColumns[0].find_all('a')[0]['href']) # Size htmlSize = htmlColumns[1].getText() # Age htmlAge = htmlColumns[3].getText() htmlAge = int(convert.ConverterDuration(htmlAge).value(convert.ConverterDuration.UnitDay)) # Metadata meta = metadata.Metadata(name = htmlName, title = title, titles = titles, year = year, season = season, episode = episode, pack = pack, packCount = packCount, link = htmlLink, size = htmlSize, age = htmlAge) # Ignore meta.ignoreAdjust(contains = ignoreContains, length = 0.3) if meta.ignore(False): continue # Add self.tLock.acquire() self.tSources.append({'url' : None, 'debridonly' : False, 'direct' : False, 'source' : 'usenet', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName}) self.tLock.release() added = True # Link thread = threading.Thread(target = self._link, args = (htmlLink, len(self.tSources) - 1)) threads.append(thread) thread.start() except: pass if not added: break except: break # First filter out all non-related links before waiting for the link resolution threads. timerTimeout -= 2 while True: if timer.elapsed() > timerTimeout: break if not any([thread.is_alive() for thread in threads]): break tools.Time.sleep(0.5) try: self.tLock.release() except: pass except: try: self.tLock.release() except: pass return [i for i in self.tSources if i['url']]
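# For special episodes the providers above search by episode title and then tighten the metadata filter by the fraction of the final query that the episode title makes up. The float() cast matters: under Python 2, integer division would always yield 0. Worked example with illustrative names:
show = 'Game of Thrones'
special = 'The Last Watch'
title = '%s %s' % (show, special)
ignoreContains = len(special) / float(len(title))  # 14 / 30.0 = 0.4666...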
def sources(self, url, hostDict, hostprDict): sources = [] found = [] try: if url == None: raise Exception() if not (self.enabled and self.username and not self.username == '' and self.password and not self.password == ''): raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False if 'tvshowtitle' in data: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) # Login if self.enabled and self.username and not self.username == '' and self.password and not self.password == '': login = urlparse.urljoin(self.base_link, '/login') post = urllib.urlencode({'username': self.username, 'password': self.password, 'rememberme' : 'on', 'submit': 'Login'}) # Must have rememberme, otherwise cannot login (UsenetCrawler bug). cookie = client.request(login, post = post, output = 'cookie', close = False) response = client.request(login, post = post, cookie = cookie, output = 'extended') headers = {'User-Agent': response[3]['User-Agent'], 'Cookie': response[3]['Cookie']} else: cookie = None headers = None url = urlparse.urljoin(self.base_link, self.search_link) type = self.type_tvshows if 'tvshowtitle' in data else self.type_movies offset = 0 timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8 timer = tools.Time(start = True) while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break urlNew = url % (urllib.quote_plus(query), type, offset) html = BeautifulSoup(client.request(urlNew, cookie = cookie)) offset += self.offset htmlTable = html.find_all('table', id = 'browsetable')[0] # Will fail if on last page and the table is not present. htmlRows = htmlTable.find_all('tr', recursive = False) # Use children and no further. for i in range(1, len(htmlRows)): # First row is the header. htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td', recursive = False) # Use children and no further. 
htmlInfo = htmlColumns[0] # Name htmlName = htmlInfo.find_all('a', class_ = 'title')[0].getText() # Size htmlSize = htmlColumns[3].getText() indexEnd = htmlSize.find('<br') if indexEnd >= 0: htmlSize = htmlSize[: indexEnd] # Link htmlLink = self.base_link + htmlColumns[6].find_all('a')[0]['href'] index = htmlLink.rfind('/') if index > 0: htmlLink = htmlLink[:index] # Remove the name at the end, which contains spaces. if not headers == None: htmlLink += '|' + urllib.urlencode(headers) # Age htmlAge = htmlColumns[2]['title'] htmlAge = tools.Time.datetime(htmlAge, '%Y-%m-%d %H:%M:%S') htmlAge = datetime.datetime.today() - htmlAge htmlAge = htmlAge.days # Language htmlLanguage = htmlColumns[1].find_all('a')[0].getText() if 'Foreign >' in htmlLanguage: htmlLanguage = tools.Language.code(htmlLanguage[htmlLanguage.rfind('>') + 1:].strip()) else: htmlLanguage = self.language[0] # Metadata meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, age = htmlAge) # Ignore if meta.ignore(False): continue # Ignore Duplicates htmlCategory = htmlColumns[1].find_all('a')[0].getText() htmlFiles = htmlColumns[4].find_all('a')[0].getText() size = meta.size() if isinstance(size, (float, int, long)): size = int(math.ceil(size / 1048576.0) * 1048576.0) # Sometimes the file size slightly varies. Round to the upper MB. htmlAge = int(math.ceil(htmlAge)) foundId = htmlName.lower() + '_' + str(htmlAge) + '_' + htmlCategory + '_' + htmlFiles + '_' + str(size) if foundId in found: continue found.append(foundId) # Add # Some NZBs have the wrong size (often a few KB) indicated on the site, but are in reality bigger. Hence, do not show the size of NZBs below 20MB, but still add them. sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'usenet', 'memberonly' : True, 'language' : htmlLanguage, 'quality': meta.videoQuality(), 'info' : meta.information(sizeLimit = 20971520), 'file' : htmlName}) return sources except: return sources
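# The usenet providers above de-duplicate posts whose reported sizes differ by a few bytes by rounding the size up to the next megabyte before building the duplicate key. The key construction as a standalone sketch:
import math

def duplicateId(name, ageDays, category, extra, size):
    # Round the size up to the next MB, since it varies slightly between posts.
    if isinstance(size, (int, long, float)):
        size = int(math.ceil(size / 1048576.0) * 1048576.0)
    return '_'.join([name.lower(), str(int(math.ceil(ageDays))), category, extra, str(size)])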
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) pack = None if 'exact' in data and data['exact']: query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = None season = None episode = None pack = False packCount = None else: title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if 'tvshowtitle' in data: if pack: query = ['%s %d' % (title, season)] else: query = ['%s S%02dE%02d' % (title, season, episode), '%s %02dx%02d' % (title, season, episode)] else: query = ['%s %d' % (title, year)] query = [re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', q) for q in query] for q in query: url = urlparse.urljoin(self.base_link, self.search_link) % urllib.quote_plus(q) # Fix HTML closing tags. html = client.request(url, ignoreSsl=True) # SSL Certificate fails. html = re.sub('<span.*>\s*<\/span>\s*<td', '</td><td', html) html = BeautifulSoup(html) htmlRows = html.find_all('tr', class_=['odd', 'odd2']) for i in range(len(htmlRows)): try: htmlColumns = htmlRows[i].find_all('td', recursive=False) # Name # The name is abbreviated; use the name in the link instead. htmlName = htmlColumns[1].find_all('a')[0]['href'] htmlName = htmlName[htmlName.rfind('/') + 1:] htmlName = htmlName.replace('_', ' ') # Link htmlLink = htmlColumns[3].find_all('input')[0]['value'] htmlLink = network.Container(htmlLink).torrentMagnet(title=q, trackers=self.trackers) # Size htmlSize = htmlColumns[2].getText().strip() # Seeds try: htmlSeeds = int(htmlColumns[5].getText().strip()) except: htmlSeeds = None # Metadata meta = metadata.Metadata(name=htmlName, title=title, year=year, season=season, episode=episode, pack=pack, packCount=packCount, link=htmlLink, size=htmlSize, seeds=htmlSeeds) meta.mIgnoreLength = 8 # Relax this, otherwise too many links are filtered out (eg: Avatar 2009). # Ignore if meta.ignore(True): continue # Add sources.append({'url': htmlLink, 'debridonly': False, 'direct': False, 'source': 'torrent', 'language': self.language[0], 'quality': meta.videoQuality(), 'metadata': meta, 'file': htmlName, 'pack': pack}) except: pass return sources except: return sources
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) if 'exact' in data and data['exact']: query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = None season = None episode = None else: title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None query = '%s S%02dE%02d' % (title, season, episode) if 'tvshowtitle' in data else '%s %d' % (title, year) query = urllib.quote_plus(query) # The returned website is different from the normal website. # Probably a mobile version. url = urlparse.urljoin(self.base_link, self.search_link) % query html = BeautifulSoup(client.request(url)) htmlRows = html.find_all('div', class_='yt-lockup-content') for htmlRow in htmlRows: htmlInfo = htmlRow.find_all('a')[0] # Name htmlName = htmlInfo.getText().strip() # Link htmlLink = urlparse.urljoin(self.base_link, htmlInfo['href']) # Duration htmlDuration = 0 try: htmlDurationItem = htmlRow.find_all('span')[0].getText().lower() indexStart = htmlDurationItem.find(':') if indexStart > 0: indexStart += 1 indexEnd = htmlDurationItem.find('.', indexStart) if indexEnd > 0: htmlDuration = htmlDurationItem[indexStart:indexEnd].strip() htmlDuration = htmlDuration.split(':') if len(htmlDuration) == 3: htmlDuration = (int(htmlDuration[0]) * 3600) + (int(htmlDuration[1]) * 60) + int(htmlDuration[2]) else: htmlDuration = (int(htmlDuration[0]) * 60) + int(htmlDuration[1]) else: htmlDuration = 0 except: pass # Ignore trailers, etc. if any(s in htmlName.lower() for s in self.excludes): continue # Ignore anything shorter than 10 minutes. if htmlDuration < 600: continue # Metadata meta = metadata.Metadata(name=htmlName, title=title, year=year, season=season, episode=episode, link=htmlLink) # Ignore if meta.ignore(False): continue # Add sources.append({'url': htmlLink, 'debridonly': False, 'direct': False, 'source': 'youtube', 'language': self.language[0], 'quality': meta.videoQuality(), 'metadata': meta, 'file': htmlName}) return sources except: return sources
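# The duration parsing above converts 'H:MM:SS' or 'MM:SS' strings into seconds before clips under ten minutes are discarded. The conversion as a small helper:
def durationSeconds(text):
    parts = [int(part) for part in text.strip().split(':')]
    if len(parts) == 3: return parts[0] * 3600 + parts[1] * 60 + parts[2]
    elif len(parts) == 2: return parts[0] * 60 + parts[1]
    return parts[0]

# durationSeconds('1:02:30') == 3750 and durationSeconds('9:59') == 599.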
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) if 'exact' in data and data['exact']: query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] type = None year = None season = None episode = None pack = False packCount = None else: type = 'tv' if 'tvshowtitle' in data else 'movie' title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if 'tvshowtitle' in data: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = urlparse.urljoin(self.base_link, self.search_link) pageLimit = tools.Settings.getInteger('scraping.providers.pages') pageCounter = 0 page = 1 added = False timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8 timer = tools.Time(start=True) while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break pageCounter += 1 if pageLimit > 0 and pageCounter > pageLimit: break urlNew = url % (urllib.quote_plus(query), page) html = BeautifulSoup(client.request(urlNew)) page += 1 added = False htmlTable = html.find_all('table', class_='table')[0] htmlRows = htmlTable.find_all('td', class_='x-item') for i in range(0, len(htmlRows)): try: htmlRow = htmlRows[i] # Name htmlName = htmlRow.find_all('a', class_='title')[0]['title'].strip() # Size htmlSize = htmlRow.find_all('div', class_='tail')[0].getText().replace('\n', '').replace('\r', '').replace(u'\xa0', ' ').strip() # The third replace assumes non-breaking spaces in the page; the original character was lost in a previous encoding step. htmlSize = re.search('.*[sS]ize:(.*)[dD]ownloads.*', htmlSize, re.IGNORECASE) if htmlSize: htmlSize = htmlSize.group(1).strip() else: htmlSize = None # Link htmlLink = htmlRow.find_all('div', class_='tail')[0].find_all('a', class_='title')[0]['href'].strip() # Metadata meta = metadata.Metadata(name=htmlName, title=title, year=year, season=season, episode=episode, pack=pack, packCount=packCount, link=htmlLink, size=htmlSize, seeds=1) # Ignore if meta.ignore(True): continue # Add sources.append({'url': htmlLink, 'debridonly': False, 'direct': False, 'source': 'torrent', 'language': self.language[0], 'quality': meta.videoQuality(), 'metadata': meta, 'file': htmlName}) added = True except: pass if not added: # Last page reached with a working torrent break return sources except: return sources
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) if 'exact' in data and data['exact']: query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = None season = None episode = None pack = False packCount = None else: title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if 'tvshowtitle' in data: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) query = urllib.quote_plus(query) category = self.category_tvshows if ('tvshowtitle' in data and not data['tvshowtitle'] == None and not data['tvshowtitle'] == '') else self.category_movies pageLimit = tools.Settings.getInteger('scraping.providers.pages') pageCounter = 0 page = 1 # Pages start at 1 added = False timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8 timer = tools.Time(start = True) while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break pageCounter += 1 if pageLimit > 0 and pageCounter > pageLimit: break urlNew = (self.base_link + self.search_link) % (query, category, page) html = BeautifulSoup(client.request(urlNew)) htmlTable = html.find_all('div', class_ = 'content')[0].find_all('table', class_ = 'table-sm', recursive = False)[1] htmlRows = htmlTable.find_all('tr', recursive = False) page += 1 added = False for i in range(len(htmlRows)): htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td', recursive = False) # Name htmlName = htmlColumns[0].getText().strip() # Size htmlSize = htmlColumns[1].getText().strip() # Link htmlLink = htmlRow.find_all('td', recursive = False)[0].find_all('a')[0]['href'].strip() htmlLink = re.search('\/torrent\/(.*)\/', htmlLink, re.IGNORECASE).group(1) htmlLink = (self.base_link + self.torrent_link) % htmlLink # Seeds htmlSeeds = int(htmlColumns[3].getText().strip()) # Metadata meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, packCount = packCount, link = htmlLink, size = htmlSize, seeds = htmlSeeds) # Ignore if meta.ignore(True): continue # Add sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'torrent', 'language' : self.language[0], 'quality': meta.videoQuality(), 'metadata' : meta, 'file' : htmlName}) added = True if not added: # Last page reached with a working torrent break return sources except: return sources
def sources(self, url, hostDict, hostprDict): sources = [] try: if url == None: raise Exception() ignoreContains = None data = self._decode(url) if 'exact' in data and data['exact']: query = title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] titles = None year = None season = None episode = None pack = False packCount = None else: title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] titles = data['alternatives'] if 'alternatives' in data else None year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False packCount = data['packcount'] if 'packcount' in data else None if 'tvshowtitle' in data: # Search special episodes by name. All special episodes are added to season 0 by Trakt and TVDb. Hence, do not search by filename (eg: S02E00), since the season is not known. if (season == 0 or episode == 0) and ('title' in data and not data['title'] == None and not data['title'] == ''): title = '%s %s' % (data['tvshowtitle'], data['title']) # Change the title for metadata filtering. query = title ignoreContains = len(data['title']) / float(len(title)) # Increase the required ignore ratio, since otherwise individual episodes and season packs are found as well. else: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) if not self._query(query): return sources category = self.category_shows if 'tvshowtitle' in data else self.category_movies url = urlparse.urljoin(self.base_link, self.search_link) pageLimit = tools.Settings.getInteger('scraping.providers.pages') pageCounter = 0 page = 1 added = False timerEnd = tools.Settings.getInteger('scraping.providers.timeout') - 8 timer = tools.Time(start=True) while True: # Stop searching 8 seconds before the provider timeout, otherwise might continue searching, not complete in time, and therefore not returning any links. if timer.elapsed() > timerEnd: break pageCounter += 1 if pageLimit > 0 and pageCounter > pageLimit: break urlNew = url % (page, urllib.quote_plus(query), category) # For some reason Zooqle returns 404 even though the response has a body. # This is probably a bug on Zooqle's server and the error should just be ignored. html = BeautifulSoup(client.request(urlNew, ignoreErrors=404)) page += 1 added = False htmlTable = html.find_all('table', class_='table-torrents')[0] htmlRows = htmlTable.find_all('tr', recursive=False) for i in range(1, len(htmlRows)): # First row is header.
htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td') htmlInfo = htmlColumns[1] htmlMeta = htmlInfo.find_all('div', recursive=False)[0] # Name htmlName = htmlInfo.find_all( 'a', recursive=False)[0].getText().strip() # Size htmlSize = htmlColumns[3].getText() # Link htmlLink = '' htmlLinks = htmlColumns[2].find_all('a') for j in range(len(htmlLinks)): link = htmlLinks[j]['href'] if link.startswith('magnet:'): htmlLink = link break # Seeds htmlSeeds = htmlColumns[5].find_all( 'div', recursive=False)[0]['title'] indexStart = htmlSeeds.find(':') if indexStart > 0: indexStart += 1 indexEnd = htmlSeeds.find('|', indexStart) if indexEnd > 0: htmlSeeds = htmlSeeds[indexStart:indexEnd] else: htmlSeeds = htmlSeeds[indexStart:] htmlSeeds = int( htmlSeeds.replace(',', '').replace('.', '').strip()) else: htmlSeeds = None # Quality & 3D try: htmlQuality = htmlMeta.find_all( 'span', class_='hidden-xs')[0].getText().lower().strip() if 'ultra' in htmlQuality: htmlQuality = '4K' elif 'std' in htmlQuality: htmlQuality = 'SD' elif 'med' in htmlQuality or 'low' in htmlQuality: htmlQuality = 'CAM' htmlName += ' ' + htmlQuality except: pass # Audio try: htmlName += ' ' + htmlMeta.find_all( 'span', {'title': 'Audio format'})[0].getText() except: pass # Languages try: htmlLanguages = htmlMeta.find_all( 'span', {'title': 'Detected languages' })[0].getText().split(',') except: htmlLanguages = None # Metadata meta = metadata.Metadata(name=htmlName, title=title, titles=titles, year=year, season=season, episode=episode, pack=pack, packCount=packCount, link=htmlLink, size=htmlSize, seeds=htmlSeeds, languageAudio=htmlLanguages) # Ignore meta.ignoreAdjust(contains=ignoreContains) if meta.ignore(True): continue # Add sources.append({ 'url': htmlLink, 'debridonly': False, 'direct': False, 'source': 'torrent', 'language': self.language[0], 'quality': meta.videoQuality(), 'metadata': meta, 'file': htmlName }) added = True if not added: # Last page reached with a working torrent break return sources except: return sources
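# Zooqle only exposes the seed count inside a tooltip, so the code above slices the value between ':' and '|'. Equivalent standalone sketch; the tooltip format ('Seeders: 1,234 | Leechers: 56') is inferred from that parsing and not confirmed:
def tooltipSeeds(tooltip):
    indexStart = tooltip.find(':')
    if indexStart < 0: return None
    indexStart += 1
    indexEnd = tooltip.find('|', indexStart)
    value = tooltip[indexStart : indexEnd] if indexEnd > 0 else tooltip[indexStart:]
    try: return int(value.replace(',', '').replace('.', '').strip())
    except: return None

# tooltipSeeds('Seeders: 1,234 | Leechers: 56') == 1234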
def sources(self, url, hostDict, hostprDict): sources = [] found = [] try: if url == None: raise Exception() data = urlparse.parse_qs(url) data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data]) title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title'] year = int(data['year']) if 'year' in data and not data['year'] == None else None season = int(data['season']) if 'season' in data and not data['season'] == None else None episode = int(data['episode']) if 'episode' in data and not data['episode'] == None else None pack = data['pack'] if 'pack' in data else False if 'tvshowtitle' in data: if pack: query = '%s %d' % (title, season) else: query = '%s S%02dE%02d' % (title, season, episode) else: query = '%s %d' % (title, year) query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query) url = urlparse.urljoin(self.base_link, self.search_link) % (urllib.quote_plus(query)) html = BeautifulSoup(client.request(url)) htmlTable = html.find_all('tbody', id = 'spots')[0] # Fix some problems with the markup. htmlTable = str(htmlTable) htmlTable = htmlTable.replace('\'=""', '=""') # Dangling single quote. htmlTable = htmlTable.replace('<b>', '').replace('</b>', '') # There are bold tags wrapped around some td elements, causing BeautifulSoup to skip them. htmlTable = BeautifulSoup(htmlTable) htmlRows = htmlTable.find_all('tr') # Do not switch recursive off here, for some reason BeautifulSoup then detects nothing. Probably because of the markup fixing. for i in range(len(htmlRows)): htmlRow = htmlRows[i] htmlColumns = htmlRow.find_all('td', recursive = False) # Use children and no further. htmlInfo = htmlColumns[1] # Category htmlCategory = htmlColumns[0].find_all('a')[0].getText() htmlCategory = htmlCategory.replace('HD', ' HD') # Name htmlName = htmlInfo.find_all('a')[0].getText() htmlName += ' ' + htmlCategory # Size htmlSize = htmlColumns[6].getText() # Link htmlLink = htmlColumns[7].find_all('a')[0]['href'] # Age htmlAge = htmlColumns[5]['title'] index = htmlAge.find(',') if index >= 0: htmlAge = htmlAge[index + 1:] htmlAge = htmlAge.strip() htmlAge = tools.Time.datetime(htmlAge, '%d-%b-%Y (%H:%M)') htmlAge = datetime.datetime.today() - htmlAge htmlAge = htmlAge.days # Metadata meta = metadata.Metadata(name = htmlName, title = title, year = year, season = season, episode = episode, pack = pack, link = htmlLink, size = htmlSize, age = htmlAge) # Ignore if meta.ignore(False): continue # Ignore Duplicates htmlPoster = htmlColumns[4].find_all('a')[0].getText() size = meta.size() if isinstance(size, (float, int, long)): size = int(math.ceil(size / 1048576.0) * 1048576.0) # Sometimes the file size slightly varies. Round to the upper MB. htmlAge = int(math.ceil(htmlAge)) foundId = htmlName.lower() + '_' + str(htmlAge) + '_' + htmlCategory + '_' + htmlPoster + '_' + str(size) if foundId in found: continue found.append(foundId) # Add # Some NZBs have the wrong size (often a few KB) indicated on the site, but are in reality bigger. Hence, do not show the size of NZBs below 20MB, but still add them. sources.append({'url' : htmlLink, 'debridonly' : False, 'direct' : False, 'source' : 'usenet', 'language' : self.language[0], 'quality': meta.videoQuality(), 'info' : meta.information(sizeLimit = 20971520), 'file' : htmlName}) return sources except: return sources
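# Both usenet providers above derive an age in days from the date string in the row's title attribute (tools.Time.datetime presumably wraps the same stdlib parsing). A minimal standalone equivalent using the date format from the provider above:
import datetime

def ageDays(text, pattern = '%d-%b-%Y (%H:%M)'):
    posted = datetime.datetime.strptime(text, pattern)
    return (datetime.datetime.today() - posted).days

# ageDays('01-Jan-2020 (12:00)') returns the number of days elapsed since that date.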