def parse_download_page(self, url):
    if 'newpct1.com' in url:
        log.verbose('Newpct1 URL: %s', url)
        url = url.replace('newpct1.com/', 'newpct1.com/descarga-torrent/')
    else:
        log.verbose('Newpct URL: %s', url)
    try:
        page = requests.get(url)
    except requests.exceptions.RequestException as e:
        raise UrlRewritingError(e)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    if 'newpct1.com' in url:
        torrent_id_prog = re.compile(r'descargar-torrent/(.+)/')
        torrent_ids = soup.findAll(href=torrent_id_prog)
    else:
        torrent_id_prog = re.compile(r"'(?:torrentID|id)'\s*:\s*'(\d+)'")
        torrent_ids = soup.findAll(text=torrent_id_prog)
    if len(torrent_ids) == 0:
        raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)
    if 'newpct1.com' in url:
        torrent_id = torrent_id_prog.search(torrent_ids[0]['href']).group(1)
        return 'http://www.newpct1.com/download/%s.torrent' % torrent_id
    else:
        torrent_id = torrent_id_prog.search(torrent_ids[0]).group(1)
        return 'http://www.newpct.com/torrents/{:0>6}.torrent'.format(torrent_id)
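# A quick illustration of the '{:0>6}' format used above: it left-pads the
# numeric ID with zeros to six characters. A minimal self-contained check
# (the ID is made up):
assert 'http://www.newpct.com/torrents/{:0>6}.torrent'.format('987') == (
    'http://www.newpct.com/torrents/000987.torrent'
)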
def url_rewrite(self, task, entry):
    soup = self._get_soup(task, entry['url'])
    link_re = re.compile(r'rarefile\.net.*\.rar$')
    # Grab links from the main entry; pick the paragraph with the most matches.
    blog_entry = soup.find('div', class_="entry")
    num_links = 0
    link_list = None
    for paragraph in blog_entry.find_all('p'):
        links = paragraph.find_all('a', href=link_re)
        if len(links) > num_links:
            link_list = links
            num_links = len(links)
    if 'urls' in entry:
        urls = list(entry['urls'])
    else:
        urls = []
    if link_list is not None:
        for link in link_list:
            urls.append(normalize_unicode(link['href']))
    else:
        raise UrlRewritingError('No usable links found at %s' % entry['url'])
    num_links = len(urls)
    log.verbose('Found %d links at %s.', num_links, entry['url'])
    if num_links:
        entry['urls'] = urls
        entry['url'] = urls[0]
    else:
        raise UrlRewritingError('No usable links found at %s' % entry['url'])
def url_rewrite(self, task, entry):
    if 'url' not in entry:
        log.error("Didn't actually get a URL...")
        raise UrlRewritingError('No URL to rewrite')
    url = entry['url']
    log.debug("Got the URL: %s", url)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        response = opener.open(url)
    except Exception as e:
        raise UrlRewritingError("Connection Error for %s : %s" % (url, e))
    # urllib returns bytes; decode before matching with a str pattern
    rawdata = response.read().decode('utf-8', errors='replace')
    match = re.search(r'<a href="/torrents/download/\?id=(\d*?)">.*\.torrent</a>', rawdata)
    if match:
        torrent_id = match.group(1)
        log.debug("Got the Torrent ID: %s", torrent_id)
        entry['url'] = 'https://www.t411.al/torrents/download/?id=' + torrent_id
        if 'download_auth' in entry:
            auth_handler = t411Auth(*entry['download_auth'])
            entry['download_auth'] = auth_handler
    else:
        raise UrlRewritingError("Cannot find torrent ID")
def url_rewrite(self, task, entry):
    url = entry['url']
    page = None
    for scheme, netloc in EZTV_MIRRORS:
        try:
            _, _, path, params, query, fragment = urlparse(url)
            url = urlunparse((scheme, netloc, path, params, query, fragment))
            page = task.requests.get(url).content
        except RequestException:
            log.debug('Eztv mirror `%s` seems to be down', url)
            continue
        break
    if not page:
        raise UrlRewritingError('No mirrors found for url %s' % entry['url'])
    log.debug('Eztv mirror `%s` chosen', url)
    try:
        soup = get_soup(page)
        mirrors = soup.find_all('a', attrs={'class': re.compile(r'download_\d')})
    except Exception as e:
        raise UrlRewritingError(e)
    log.debug('%d torrent mirrors found', len(mirrors))
    if not mirrors:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    entry['urls'] = [m.get('href') for m in mirrors]
    entry['url'] = mirrors[0].get('href')
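# A minimal standalone sketch of the mirror-swapping technique used above:
# urlparse/urlunparse rebuild the same path and query on each candidate host.
# The mirror host below is made up for illustration.
from urllib.parse import urlparse, urlunparse

def swap_host(url, scheme, netloc):
    """Rebuild `url` on a different scheme/host, keeping path and query."""
    _, _, path, params, query, fragment = urlparse(url)
    return urlunparse((scheme, netloc, path, params, query, fragment))

assert (
    swap_host('https://eztv.ag/ep/12345/show/', 'https', 'eztv.example')
    == 'https://eztv.example/ep/12345/show/'
)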
def parse_downloads(self, series_url, search_title):
    page = requests.get(series_url).content
    try:
        soup = get_soup(page)
    except Exception as e:
        raise UrlRewritingError(e)
    urls = []
    # find all titles
    episode_titles = self.find_all_titles(search_title)
    if not episode_titles:
        raise UrlRewritingError('Unable to find episode')
    for ep_title in episode_titles:
        # find matching download
        episode_title = soup.find('strong', text=re.compile(ep_title, re.I))
        if not episode_title:
            continue
        # find download container
        episode = episode_title.parent
        if not episode:
            continue
        # find episode language
        episode_lang = episode.find_previous('strong', text=re.compile('Sprache')).next_sibling
        if not episode_lang:
            log.warning('No language found for: %s', series_url)
            continue
        # filter language
        if not self.check_language(episode_lang):
            log.warning('languages not matching: %s <> %s', self.config['language'], episode_lang)
            continue
        # find download links
        links = episode.find_all('a')
        if not links:
            log.warning('No links found for: %s', series_url)
            continue
        for link in links:
            if not link.has_attr('href'):
                continue
            url = link['href']
            pattern = r'http:\/\/download\.serienjunkies\.org.*%s_.*\.html' % self.config['hoster']
            if re.match(pattern, url) or self.config['hoster'] == 'all':
                urls.append(url)
    return urls
def _get_soup(self, task, url):
    try:
        page = task.requests.get(url)
    except RequestException as e:
        raise UrlRewritingError(str(e))
    try:
        return get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(str(e))
def parse_download_page(self, url):
    if 'newpct1.com' in url:
        log.verbose('Newpct1 URL: %s', url)
    else:
        log.verbose('Newpct URL: %s', url)
    try:
        page = requests.get(url)
    except requests.exceptions.RequestException as e:
        raise UrlRewritingError(e)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    torrent_id = None
    if 'newpct1.com' in url:
        url_format = NEWPCT1_TORRENT_FORMAT
        torrent_id_prog = re.compile(
            r'function openTorrent.*\n.*\{.*(\n.*)+window\.location\.href =\s*\".*\/(\d+.+)\";'
        )
        torrent_ids = soup.findAll(text=torrent_id_prog)
        log.debug('searching openTorrent script')
        if torrent_ids:
            match = torrent_id_prog.search(torrent_ids[0])
            if match:
                torrent_id = match.group(2)
    else:
        url_format = NEWPCT_TORRENT_FORMAT
        torrent_id_prog = re.compile(
            r"(?:parametros\s*=\s*\n?)\s*{\s*\n(?:\s*'\w+'\s*:.*\n)+\s*'(?:torrentID|id)"
            r"'\s*:\s*'(\d+)'"
        )
        torrent_ids = soup.findAll(text=torrent_id_prog)
        if len(torrent_ids):
            match = torrent_id_prog.search(torrent_ids[0])
            if match:
                torrent_id = match.group(1)
        if not torrent_id:
            torrent_id_prog = re.compile(
                r'function openTorrent.*\n.*\{.*(\n.*)+window\.location\.href =\s*\".*\/(\d+).*\";'
            )
            torrent_ids = soup.findAll(text=torrent_id_prog)
            log.debug('torrent ID not found, searching openTorrent script')
            if torrent_ids:
                match = torrent_id_prog.search(torrent_ids[0])
                if match:
                    torrent_id = match.group(2)
    if not torrent_id:
        raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)
    return url_format.format(torrent_id)
def parse_download_page(self, url, requests):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    page = requests.get(url, headers=txheaders)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    down_link = soup.find('a', attrs={'href': re.compile('.+mp4')})
    if not down_link:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    return down_link.get('href')
def parse_download_page(self, url, requests):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    page = requests.get(url, headers=txheaders)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    tag_a = soup.find('a', attrs={'class': 'download_link'})
    if not tag_a:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    torrent_url = 'https://bakabt.me/' + tag_a.get('href')
    return torrent_url
def parse_download_page(self, url, requests):
    page = requests.get(url).content
    try:
        soup = get_soup(page)
    except Exception as e:
        raise UrlRewritingError(e)
    # Keep the UrlRewritingError raises outside the blanket except so they
    # are not caught and re-wrapped.
    tag_div = soup.find('div', attrs={'class': 'download'})
    if not tag_div:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    tag_a = tag_div.find('a')
    torrent_url = tag_a.get('href')
    # URL is sometimes missing the scheme
    if torrent_url.startswith('//'):
        torrent_url = 'http:' + torrent_url
    return torrent_url
def parse_download_page(self, url, requests):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    page = requests.get(url, headers=txheaders)
    match = re.findall(r"mirror[0-9]_openload','(.*?)'\)", page.text)
    if len(match) == 0:
        raise UrlRewritingError('Unable to locate Openload hash from url %s' % url)
    urlhash = match[0]
    # The hash is base64-encoded, dash-separated decimal code points.
    data = base64.b64decode(urlhash).decode('ascii').split('-')
    down_link = ''.join(chr(int(char)) for char in data)
    if not down_link:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    return down_link
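# The Openload hash above is a base64 blob of dash-separated decimal code
# points, per the decoding logic in the function. A minimal round-trip sketch
# of that decoding, using a fabricated hash so it is self-checking:
import base64

def decode_openload_hash(urlhash):
    data = base64.b64decode(urlhash).decode('ascii')
    return ''.join(chr(int(code)) for code in data.split('-'))

fake_hash = base64.b64encode(
    '-'.join(str(ord(c)) for c in 'http://x').encode()
).decode()
assert decode_openload_hash(fake_hash) == 'http://x'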
def url_from_page(self, url):
    """Parses torrent url from newtorrents download page"""
    try:
        page = requests.get(url)
        data = page.text
    except Exception:
        raise UrlRewritingError('URLerror when retrieving page')
    p = re.compile(r"copy\('(.*)'\)", re.IGNORECASE)
    f = p.search(data)
    if not f:
        # the link on which this plugin relies is missing!
        raise UrlRewritingError('Failed to get url from download page. Plugin may need an update.')
    return f.group(1)
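# The copy('...') scrape above expects an inline JS call on the download page.
# A self-contained check against a fabricated snippet of that markup:
import re

data = "<a onclick=\"copy('http://www.newtorrents.info/down/12345.torrent')\">"
match = re.search(r"copy\('(.*)'\)", data, re.IGNORECASE)
assert match and match.group(1) == 'http://www.newtorrents.info/down/12345.torrent'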
def url_rewrite(self, task, entry):
    url = entry['url']
    if url.startswith('http://www.newtorrents.info/?q=') or url.startswith(
        'http://www.newtorrents.info/search'
    ):
        results = self.entries_from_search(entry['title'], url=url)
        if not results:
            raise UrlRewritingError('No matches for %s' % entry['title'])
        url = results[0]['url']
    else:
        url = self.url_from_page(url)
    if url:
        entry['url'] = url
        self.resolved.append(url)
    else:
        raise UrlRewritingError('Bug in newtorrents urlrewriter')
def url_rewrite(self, task, entry):
    log.debug('Requesting %s' % entry['url'])
    page = requests.get(entry['url'])
    soup = get_soup(page.text)
    for link in soup.findAll('a', attrs={'href': re.compile(r'^/url')}):
        # Extract the real target url from the google redirect link
        href = 'http://google.com' + link['href']
        args = parse_qs(urlparse(href).query)
        href = args['q'][0]
        # Test if an entry with this url would be recognized by some urlrewriter
        log.trace('Checking if %s is known by some rewriter' % href)
        fake_entry = {'title': entry['title'], 'url': href}
        urlrewriting = plugin.get_plugin_by_name('urlrewriting')
        if urlrewriting['instance'].url_rewritable(task, fake_entry):
            log.debug('--> rewriting %s (known url pattern)' % href)
            entry['url'] = href
            return
        else:
            log.debug('<-- ignoring %s (unknown url pattern)' % href)
    raise UrlRewritingError('Unable to resolve')
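# The '/url?q=...' handling above extracts the real target from a google
# redirect link with parse_qs. A minimal demonstration with a fabricated href:
from urllib.parse import parse_qs, urlparse

href = 'http://google.com/url?q=http://example.com/page&sa=U'
target = parse_qs(urlparse(href).query)['q'][0]
assert target == 'http://example.com/page'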
def url_rewrite(self, task, entry):
    soup = self._get_soup(task, entry['url'])
    # grab links matching filehosters_re
    link_elements = []
    log.debug(
        'Searching %s for a tags where the text matches one of: %s',
        entry['url'],
        str(self.config.get('filehosters_re')),
    )
    regexps = self.config.get('filehosters_re', [])
    if self.config.get('parse'):
        link_elements = soup.find_all('div', class_=re.compile('mag_details'))
        log.debug('parse enabled: found %d link containers.', len(link_elements))
    if 'urls' in entry:
        urls = list(entry['urls'])
        log.debug('Original urls: %s', str(entry['urls']))
    else:
        urls = []
    if link_elements and not regexps:
        log.warning('Link containers found, but no filehosters_re filters are configured.')
    for target in link_elements:
        links = target.find_all('a')
        for link in links:
            if re.search('novafile.com', link['href']):
                urls.append(link['href'])
    # filter urls:
    filtered_urls = []
    for i, url in enumerate(urls):
        urls[i] = normalize_unicode(url)
        for regexp in regexps:
            if re.search(regexp, urls[i]):
                filtered_urls.append(urls[i])
                log.debug('Url: "%s" matched filehoster filter: %s', urls[i], regexp)
                break
        else:
            if regexps:
                log.debug(
                    'Url: "%s" was discarded because it does not match any of the given filehoster filters: %s',
                    urls[i],
                    str(regexps),
                )
    if regexps:
        log.debug('Using filehosters_re filters: %s', str(regexps))
        urls = filtered_urls
    else:
        log.debug('No filehoster filters configured, using all found links.')
    num_links = len(urls)
    log.verbose('Found %d links at %s.', num_links, entry['url'])
    if num_links:
        entry['urls'] = urls
        entry['url'] = urls[0]
    else:
        raise UrlRewritingError('No usable links found at %s' % entry['url'])
def url_rewrite(self, task, entry):
    try:
        page = task.requests.get(entry['url'])
    except RequestException as e:
        raise UrlRewritingError(str(e))
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(str(e))
    link_elements = soup.find_all('pre', class_='links')
    if 'urls' in entry:
        urls = list(entry['urls'])
    else:
        urls = []
    for element in link_elements:
        urls.extend(element.text.splitlines())
    regexps = self.config.get('filehosters_re', [])
    filtered_urls = []
    for i, url in enumerate(urls):
        urls[i] = normalize_unicode(url)
        for regexp in regexps:
            if re.search(regexp, urls[i]):
                filtered_urls.append(urls[i])
                log.debug('Url: "%s" matched filehoster filter: %s', urls[i], regexp)
                break
        else:
            if regexps:
                log.debug(
                    'Url: "%s" does not match any of the given filehoster filters: %s',
                    urls[i],
                    str(regexps),
                )
    if regexps:
        log.debug('Using filehosters_re filters: %s', str(regexps))
        urls = filtered_urls
    else:
        log.debug('No filehoster filters configured, using all found links.')
    num_links = len(urls)
    log.verbose('Found %d links at %s.', num_links, entry['url'])
    if num_links:
        entry['urls'] = urls
        entry['url'] = urls[0]
    else:
        raise UrlRewritingError('No usable links found at %s' % entry['url'])
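# Both filter loops above lean on Python's for/else: the else branch runs only
# when no regexp matched (i.e. the inner loop finished without break). A
# minimal illustration with hypothetical hosts and filters:
import re

urls = ['http://host-a.example/file', 'http://host-b.example/file']
regexps = [r'host-a\.example']
kept = []
for url in urls:
    for regexp in regexps:
        if re.search(regexp, url):
            kept.append(url)
            break
    else:
        pass  # no pattern matched this url; it would be discarded
assert kept == ['http://host-a.example/file']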
def parse_download_page(self, url, requests):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    try:
        page = requests.get(url, headers=txheaders)
    except requests.exceptions.RequestException as e:
        msg = 'Cannot open "%s" : %s' % (url, str(e))
        log.error(msg)
        raise UrlRewritingError(msg)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(str(e))
    down_link = soup.find('a', attrs={'href': re.compile(r'down\.php\?.*')})
    if not down_link:
        raise UrlRewritingError('Unable to locate download link from url "%s"' % url)
    return 'http://bt.hliang.com/' + down_link.get('href')
def url_rewrite(self, task, entry):
    if 'url' not in entry:
        log.error("Didn't actually get a URL...")
    else:
        log.debug('Got the URL: %s', entry['url'])
    if entry['url'].startswith('https://www.torrentday.com/browse'):
        # use search
        results = self.search(task, entry)
        if not results:
            raise UrlRewritingError('No search results found')
        entry['url'] = results[0]['url']
def parse_download_page(self, page_url, requests):
    page = requests.get(page_url)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    tag_a = soup.find('a', {'class': 'dl_link'})
    if not tag_a:
        if soup.findAll(text='Connexion ?'):
            raise UrlRewritingError(
                'You are not logged in, check if your cookie for authentication is up to date'
            )
        else:
            raise UrlRewritingError(
                'You have reached your download limit per 24 hours, so I cannot get the torrent'
            )
    torrent_url = 'http://www.frenchtorrentdb.com' + tag_a.get('href') + '&js=1'
    log.debug('TORRENT URL is : %s' % torrent_url)
    return torrent_url
def parse_download_page(self, url, requests):
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    page = requests.get(url, headers=txheaders)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    vk_soup = soup.find('a', attrs={'href': re.compile(r'https:.*vk.com.*no_preview=1')})
    if not vk_soup:
        raise UrlRewritingError('Unable to locate download link from url %s' % url)
    vk_link = vk_soup.get('href')
    page = requests.get(vk_link)
    if page.status_code != 200:
        raise UrlRewritingError('File does not exist in VK')
    return page.url
def parse_download_page(self, url):
    try:
        page = requests.get(url).content
        soup = get_soup(page, 'html.parser')
        download_link = soup.findAll(href=re.compile('redirect|redirectlink'))
        download_href = download_link[0]['href']
        return download_href
    except Exception:
        raise UrlRewritingError('Unable to locate torrent from url %s' % url)
def parse_download_page(self, url, task):
    log.verbose('Descargas2020 URL: %s', url)
    try:
        page = requests.get(url)
    except requests.exceptions.RequestException as e:
        raise UrlRewritingError(e)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    torrent_id = None
    url_format = DESCARGAS2020_TORRENT_FORMAT
    torrent_id_prog = re.compile(
        r"(?:parametros\s*=\s*\n?)\s*{\s*\n(?:\s*'\w+'\s*:.*\n)+\s*'(?:torrentID|id)"
        r"'\s*:\s*'(\d+)'"
    )
    torrent_ids = soup.findAll(text=torrent_id_prog)
    if torrent_ids:
        match = torrent_id_prog.search(torrent_ids[0])
        if match:
            torrent_id = match.group(1)
    if not torrent_id:
        log.debug('torrent ID not found, searching openTorrent script')
        torrent_id_prog = re.compile(
            r'function openTorrent.*\n.*\{.*(\n.*)+window\.location\.href =\s*\"(.*\/\d+_-.*[^\/])\/?\";'
        )
        torrent_ids = soup.findAll(text=torrent_id_prog)
        if torrent_ids:
            match = torrent_id_prog.search(torrent_ids[0])
            if match:
                torrent_id = match.group(2)
                return torrent_id.replace('descargar-torrent', 'download') + '.torrent'
    if not torrent_id:
        raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)
    return url_format.format(torrent_id)
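# A way to sanity-check the 'parametros' regexp above against a fabricated
# page snippet (the script body is an assumption about the site's markup):
import re

TORRENT_ID_PROG = re.compile(
    r"(?:parametros\s*=\s*\n?)\s*{\s*\n(?:\s*'\w+'\s*:.*\n)+\s*'(?:torrentID|id)"
    r"'\s*:\s*'(\d+)'"
)
sample = """parametros = {
    'domain': 'example',
    'torrentID': '123456',
"""
match = TORRENT_ID_PROG.search(sample)
assert match and match.group(1) == '123456'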
def parse_download_page(self, url):
    if 'newpct1.com' in url:
        log.verbose('Newpct1 URL: %s', url)
        url = url.replace('newpct1.com/', 'newpct1.com/descarga-torrent/')
    else:
        log.verbose('Newpct URL: %s', url)
    try:
        page = requests.get(url)
    except requests.exceptions.RequestException as e:
        raise UrlRewritingError(e)
    try:
        soup = get_soup(page.text)
    except Exception as e:
        raise UrlRewritingError(e)
    torrent_id = None
    if 'newpct1.com' in url:
        url_format = NEWPCT1_TORRENT_FORMAT
        torrent_id_prog = re.compile(r'descargar-torrent/(.+)/')
        match = torrent_id_prog.search(soup.text)
        if match:
            torrent_id = match.group(1)
    else:
        url_format = NEWPCT_TORRENT_FORMAT
        torrent_id_prog = re.compile(
            r"(?:parametros\s*=\s*\n?)\s*{\s*\n(?:\s*'\w+'\s*:.*\n)+\s*'(?:torrentID|id)"
            r"'\s*:\s*'(\d+)'"
        )
        torrent_ids = soup.findAll(text=torrent_id_prog)
        if len(torrent_ids):
            match = torrent_id_prog.search(torrent_ids[0])
            if match:
                torrent_id = match.group(1)
    if not torrent_id:
        raise UrlRewritingError('Unable to locate torrent ID from url %s' % url)
    return url_format.format(torrent_id)
def url_rewrite(self, task, entry): if 'url' not in entry: log.error("Didn't actually get a URL...") else: log.debug("Got the URL: %s" % entry['url']) if entry['url'].startswith('https://www.torrentleech.org/torrents/browse/list/query/'): # use search results = self.search(task, entry) if not results: raise UrlRewritingError("No search results found") # TODO: Search doesn't enforce close match to title, be more picky entry['url'] = results[0]['url']
def url_rewrite(self, task, entry):
    try:
        # need to fake the user agent
        txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        page = task.requests.get(entry['url'], headers=txheaders)
        soup = get_soup(page.text)
        results = soup.find_all('a', attrs={'class': 'l'})
        if not results:
            raise UrlRewritingError('No results')
        for res in results:
            url = res.get('href')
            url = url.replace('/interstitial?url=', '')
            # generate a match regexp from the google search result title
            regexp = '.*'.join([x.contents[0] for x in res.find_all('em')])
            if re.match(regexp, entry['title']):
                log.debug('resolved, found with %s' % regexp)
                entry['url'] = url
                return
        raise UrlRewritingError('Unable to resolve')
    except UrlRewritingError:
        # don't re-wrap our own errors
        raise
    except Exception as e:
        raise UrlRewritingError(e)
def url_rewrite(self, task, entry):
    if 'url' not in entry:
        log.error("Didn't actually get a URL...")
    else:
        log.debug("Got the URL: %s" % entry['url'])
    if entry['url'].startswith(BASE_URL + '/t?'):
        # use search
        results = self.search(task, entry)
        if not results:
            raise UrlRewritingError("No search results found")
        # TODO: Search doesn't enforce close match to title, be more picky
        entry['url'] = results[0]['url']
def url_rewrite(self, task, entry):
    for name, config in self.resolves.get(task.name, {}).items():
        regexp = config['regexp_compiled']
        format = config['format']
        if regexp.search(entry['url']):
            log.debug('Regexp resolving %s with %s' % (entry['url'], name))
            # run the regexp
            entry['url'] = regexp.sub(format, entry['url'])
            if regexp.match(entry['url']):
                entry.fail('urlrewriting')
                raise UrlRewritingError(
                    'Regexp %s result should NOT continue to match!' % name
                )
            return
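# A minimal illustration of the sub-with-backreference rewriting above, using
# a hypothetical rule (regexp and format are made up):
import re

regexp = re.compile(r'example\.com/details/(\d+)')
fmt = r'example.com/download/\1.torrent'
assert (
    regexp.sub(fmt, 'http://example.com/details/4242')
    == 'http://example.com/download/4242.torrent'
)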
def url_rewrite(self, task, entry):
    soup = self._get_soup(task, entry['url'])
    # Grab links from the main post. filescdn.com and suprafiles.net
    # can't be matched reliably, so they are left out of the pattern.
    link_re = re.compile(
        r'dailyuploads\.net.*/*$|dropupload\.com.*/*$|cloudyfiles\.com.*/*$|upload4earn\.com.*/*$'
    )
    num_links = 0
    link_list = None
    blog_entry = soup.find('div', class_="box-inner-block")
    for paragraph in blog_entry.find_all('p'):
        links = paragraph.find_all('a', href=link_re)
        if len(links) > num_links:
            link_list = links
            num_links = len(links)
    if 'urls' in entry:
        urls = list(entry['urls'])
    else:
        urls = []
    if link_list is not None:
        for link in link_list:
            urls.append(normalize_unicode(link['href']))
    else:
        raise UrlRewritingError('No usable links found at %s' % entry['url'])
    num_links = len(urls)
    log.verbose('Found %d links at %s.', num_links, entry['url'])
    if num_links:
        entry['urls'] = urls
        entry['url'] = urls[0]
    else:
        raise UrlRewritingError('No usable links found at %s' % entry['url'])
def url_rewrite(self, task, entry):
    if 'url' not in entry:
        log.error("Didn't actually get a URL...")
    else:
        log.debug("Got the URL: %s" % entry['url'])
    if URL_SEARCH.match(entry['url']):
        # use search
        results = self.search(task, entry)
        if not results:
            raise UrlRewritingError("No search results found")
        # TODO: Close matching was taken out of search methods, this may need to be fixed to be more picky
        entry['url'] = results[0]['url']
    else:
        # parse download page
        entry['url'] = self.parse_download_page(entry['url'])
def url_rewrite(self, task, entry): """ Gets the download information for 1337x result """ url = entry['url'] log.info('1337x rewriting download url: %s' % url) try: page = task.requests.get(url) log.debug('requesting: %s', page.url) except RequestException as e: log.error('1337x request failed: %s', e) raise UrlRewritingError('1337x request failed: %s', e) soup = get_soup(page.content) magnet_url = str(soup.find('a', id='magnetdl').get('href')).lower() torrent_url = str(soup.find('a', id='torrentdl').get('href')).lower() entry['url'] = torrent_url entry.setdefault('urls', []).append(torrent_url) entry['urls'].append(magnet_url)