def find_by_class(self, html_data, claas_name):
    """Return the first <div> carrying the given CSS class as an HTML string.

    :param html_data: html string
    :param claas_name: string class name to search for
    :return: string of the matching <div>, or empty string if not found
    """
    parsed = BeautifulSoup(html_data, 'html.parser')
    match = parsed.find("div", class_=claas_name)
    return str(match) if match is not None else ""
def find_by_id(self, html_data, given_id):
    """
    This function get html text and return the content of specific class id
    that is given as parameter
    :param html_data: html string
    :param given_id: string
    :return: string of the content by id or empty string if id not found in the html
    """
    parsed = BeautifulSoup(html_data, 'html.parser')
    element = parsed.find(id=given_id)
    print(element)
    if element is None:
        return ""
    return str(element)
def TPB(book=None, test=False):
    """Search The Pirate Bay for a book and collect magnet-link results.

    :param book: dict holding 'searchterm', 'bookid' and optionally
                 'library' ('AudioBook', 'eBook' or 'magazine')
    :param test: when True, return only the success flag of the first fetch
    :return: tuple (results, errmsg) - results is a list of torrent dicts
    """
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/s/?")

    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0

    sterm = makeUnicode(book['searchterm'])
    page = 0
    results = []
    # comparison below is strict '<', hence the -1 on the configured minimum
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:
        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }
        searchURL = providerurl + "?%s" % urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result, 'html5lib')
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.find_all('tr')
            else:
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers

            for row in rows:
                td = row.find_all('td')
                if len(td) > 2:
                    try:
                        new_soup = BeautifulSoup(str(td[1]), 'html5lib')
                        link = new_soup.find("a")
                        magnet = link.get("href")
                        title = link.text
                        # size is embedded in the description, e.g. ", Size 1.2 MiB"
                        size = td[1].text.split(', Size ')[1].split('iB')[0]
                        size = size.replace(' ', '')
                        mult = 1
                        try:
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            # BUGFIX: seeder counts may contain thousands
                            # separators ("1,234") which made int() raise and
                            # silently zero the count; strip commas first,
                            # matching the newer TPB parser in this file
                            seeders = int(td[2].text.replace(',', ''))
                        except ValueError:
                            seeders = 0

                        if minimumseeders < seeders:
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' % (magurl, result))
                            else:
                                magnet = None
                                new_soup = BeautifulSoup(result, 'html5lib')
                                for link in new_soup.find_all('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet',
                                    'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' % (title, size))
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def _doSearch(self, search_string, show=None, max_age=0):
    """Search BinSearch for *search_string* and return a list of result dicts.

    :param search_string: query string sent to BinSearch
    :param show: unused here; kept for the shared provider signature
    :param max_age: maximum usenet age; 0 means fall back to
                    sickbeard.USENET_RETENTION (forced in if retention is unset)
    :return: list of dicts with 'id', 'title', 'url' and 'extra_check' keys
    """
    params = {"q": search_string,
              "m": "n",
              "max": 400,
              "minsize": 100,
              "adv_sort": "date",
              "adv_col": "on",
              "adv_nfo": "on",
              "adv_age": sickbeard.USENET_RETENTION}
    # if max_age is set, use it, don't allow it to be missing
    if max_age or not params['adv_age']:
        params['adv_age'] = max_age
    searchURL = self.urls["search"] % urllib.urlencode(params)
    logger.log(u"Search url: " + searchURL)
    data = self.getURL(searchURL)
    if not data:
        logger.log(u"No data returned from " + searchURL, logger.ERROR)
        return []
    res_items = []
    try:
        html = BeautifulSoup(data)
        # 'r2' is the id of BinSearch's results table
        main_table = html.find('table', attrs={'id': 'r2'})
        if not main_table:
            return []
        items = main_table.find_all('tr')
        for row in items:
            title = row.find('span', attrs={'class': 's'})
            if not title:
                # not a result row (header / spacer)
                continue
            nzb_id = row.find('input', attrs={'type': 'checkbox'})['name']
            info = row.find('span', attrs={'class': 'd'})

            # Deferred completeness/password check, stored per item and meant
            # to be evaluated later.
            # NOTE(review): 'info' is captured by closure over the loop
            # variable, so every extra_check created here reads the *last*
            # row's info span when it finally runs - looks like a
            # late-binding bug, confirm intent.
            # NOTE(review): under Python 2 'total / parts' is integer
            # division (0 or 1), and it raises ZeroDivisionError when
            # parts is 0 - verify the ratio test does what was intended.
            def extra_check(item):
                parts = re.search('available:.(?P<parts>\d+)./.(?P<total>\d+)', info.text)
                total = tryInt(parts.group('total'))
                parts = tryInt(parts.group('parts'))
                if (total / parts) < 0.95 or ((total / parts) >= 0.95 and not ('par2' in info.text.lower() or 'pa3' in info.text.lower())):
                    logger.log('Wrong: \'%s\', not complete: %s out of %s' % (item['name'], parts, total), logger.WARNING)
                    return False
                if 'requires password' in info.text.lower():
                    logger.log('Wrong: \'%s\', passworded' % (item['name']), logger.WARNING)
                    return False
                return True

            res_items.append({
                'id': nzb_id,
                'title': title.text,
                'url': self.urls['download'] % nzb_id,
                'extra_check': extra_check
            })
    except:
        # NOTE(review): bare except swallows everything, including
        # KeyboardInterrupt - consider narrowing to Exception
        logger.log('Failed to parse HTML response from BinSearch: %s' % traceback.format_exc(), logger.ERROR)
    results = []
    for curItem in res_items:
        (title, url) = self._get_title_and_url(curItem)
        # NOTE(review): curItem['extra_check'] is a function object and is
        # only tested for truthiness here (always True), never called - the
        # completeness check is effectively skipped; confirm whether this
        # should be curItem['extra_check'](curItem)
        if title and url and curItem['extra_check']:
            logger.log(u"Adding item from BinSearch to results: " + title, logger.DEBUG)
            results.append(curItem)
        else:
            logger.log(u"The HTML returned from the " + self.name + " incomplete, this result is unusable", logger.DEBUG)
    return results
def TPB(book=None, test=False):
    """Search The Pirate Bay for a book and collect magnet-link results.

    :param book: dict holding 'searchterm', 'bookid' and optionally
                 'library' ('AudioBook', 'eBook' or 'magazine')
    :param test: when True, return only the success flag of the first fetch
    :return: tuple (results, errmsg) - results is a list of torrent dicts
    """
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    providerurl = url_fix(host + "/s/?")
    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0
    sterm = makeUnicode(book['searchterm'])
    page = 0
    results = []
    # comparison below is strict '<', hence the -1 on the configured minimum
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True
    while next_page:
        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }
        searchURL = providerurl + "?%s" % urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False
        if test:
            return success
        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result, 'html5lib')
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.find_all('tr')
            else:
                rows = []
            if len(rows) > 1:
                rows = rows[1:]  # first row is headers
            for row in rows:
                td = row.find_all('td')
                if len(td) > 2:
                    try:
                        new_soup = BeautifulSoup(str(td[1]), 'html5lib')
                        link = new_soup.find("a")
                        magnet = link.get("href")
                        title = link.text
                        # size is embedded in the description, e.g. ", Size 1.2 MiB"
                        size = td[1].text.split(', Size ')[1].split('iB')[0]
                        size = size.replace(' ', '')
                        size = size_in_bytes(size)
                        try:
                            # strip thousands separators before parsing
                            seeders = int(td[2].text.replace(',', ''))
                        except ValueError:
                            seeders = 0
                        if minimumseeders < seeders:
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' % (magurl, result))
                            else:
                                magnet = None
                                new_soup = BeautifulSoup(result, 'html5lib')
                                for link in new_soup.find_all('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet',
                                    'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' % (title, size))
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False
    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def GEN(book=None, prov=None, test=False):
    """Search a libgen mirror for a book and collect direct-download results.

    :param book: dict holding 'searchterm' and 'bookid'
    :param prov: config prefix for the mirror ('GEN' when None)
    :param test: when True, return only the success flag of the first fetch
    :return: tuple (results, errmsg) - results is a list of download dicts
    """
    errmsg = ''
    provider = "libgen.io"
    if prov is None:
        prov = 'GEN'
    host = lazylibrarian.CONFIG[prov + '_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    # normalise the configured search page; only index.php / search.php are valid
    search = lazylibrarian.CONFIG[prov + '_SEARCH']
    if not search or not search.endswith('.php'):
        search = 'search.php'
    if 'index.php' not in search and 'search.php' not in search:
        search = 'search.php'
    if search[0] == '/':
        search = search[1:]

    sterm = makeUnicode(book['searchterm'])
    page = 1
    results = []
    next_page = True

    while next_page:
        if 'index.php' in search:
            params = {
                "s": book['searchterm'],
                "f_lang": "All",
                "f_columns": 0,
                "f_ext": "All"
            }
        else:
            params = {
                "view": "simple",
                "open": 0,
                "phrase": 0,
                "column": "def",
                "res": 100,
                "req": book['searchterm']
            }
        if page > 1:
            params['page'] = page

        providerurl = url_fix(host + "/%s" % search)
        searchURL = providerurl + "?%s" % urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            elif '111' in result:
                # looks like libgen has ip based access limits
                logger.error('Access forbidden. Please wait a while before trying %s again.' % provider)
                errmsg = result
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching page data from %s: %s' % (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            try:
                soup = BeautifulSoup(result, 'html5lib')
                rows = []  # default so a missing table can't leave rows unbound
                try:
                    table = soup.find_all('table')[2]  # un-named table
                    if table:
                        rows = table.find_all('tr')
                except IndexError:
                    # no results table in result page
                    rows = []
                if 'search.php' in search and len(rows) > 1:
                    rows = rows[1:]  # first row is headers

                for row in rows:
                    author = ''
                    title = ''
                    size = ''
                    extn = ''
                    link = ''
                    td = row.find_all('td')
                    if 'index.php' in search and len(td) > 3:
                        try:
                            author = formatAuthorName(td[0].text)
                            title = td[2].text
                            newsoup = BeautifulSoup(str(td[4]), 'html5lib')
                            data = newsoup.find('a')
                            link = data.get('href')
                            extn = data.text.split('(')[0]
                            size = data.text.split('(')[1].split(')')[0]
                            size = size.upper()
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen index.php results: %s' % str(e))
                    elif 'search.php' in search and len(td) > 8:
                        try:
                            author = formatAuthorName(td[1].text)
                            title = td[2].text
                            size = td[7].text.upper()
                            extn = td[8].text
                            newsoup = BeautifulSoup(str(td[2]), 'html5lib')
                            # BUGFIX: the previous code called .get('href') on
                            # the soup document itself, which always returns
                            # None so this branch never produced a link; take
                            # the href from the anchor inside the title cell
                            data = newsoup.find('a')
                            if data:
                                link = data.get('href')
                        except IndexError as e:
                            logger.debug(
                                'Error parsing libgen search.php results; %s' % str(e))

                    if not size:
                        size = 0
                    else:
                        try:
                            mult = 1
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0

                    if link and title:
                        if author:
                            title = author.strip() + ' ' + title.strip()
                        if extn:
                            title = title + '.' + extn

                        if not link.startswith('http'):
                            if "/ads.php?" in link:
                                url = url_fix(host + link)
                            else:
                                url = url_fix(host + "/ads.php?" + link)
                        else:
                            url = redirect_url(host, link)

                        bookresult, success = fetchURL(url)
                        if not success:
                            # may return 404 if no results, not really an error
                            if '404' in bookresult:
                                logger.debug(
                                    "No results found from %s for %s" % (provider, sterm))
                            else:
                                logger.debug(url)
                                logger.debug(
                                    'Error fetching link data from %s: %s' % (provider, bookresult))
                                errmsg = bookresult
                            bookresult = False
                            # BUGFIX: don't keep the intermediate ads.php page
                            # address around to be appended as a 'direct' url
                            # (the newer GEN parser in this file nulls it too)
                            url = None
                        if bookresult:
                            url = None
                            try:
                                new_soup = BeautifulSoup(bookresult, 'html5lib')
                                for link in new_soup.find_all('a'):
                                    output = link.get('href')
                                    if output:
                                        if output.startswith('http') and '/get.php' in output:
                                            url = output
                                            break
                                        elif '/get.php' in output:
                                            url = '/get.php' + output.split('/get.php')[1]
                                            break
                                        elif '/download/book' in output:
                                            url = '/download/book' + output.split('/download/book')[1]
                                            break
                                if url and not url.startswith('http'):
                                    url = url_fix(host + url)
                                else:
                                    url = redirect_url(host, url)
                            except Exception as e:
                                logger.error(
                                    '%s parsing bookresult for %s: %s' % (type(e).__name__, link, str(e)))
                                url = None

                        if url:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider + '/' + search,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'direct',
                                'priority': lazylibrarian.CONFIG[prov + '_DLPRIORITY']
                            })
                            logger.debug('Found %s, Size %s' % (title, size))
                            next_page = True
            except Exception as e:
                logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                logger.debug('%s: %s' % (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn(
                'Maximum results page search reached, still more results available'
            )
            next_page = False

    logger.debug("Found %i result%s from %s for %s" %
                 (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def GEN(book=None, prov=None, test=False):
    """Search a libgen mirror for a book and collect direct-download results.

    :param book: dict holding 'searchterm' and 'bookid'
    :param prov: config prefix for the mirror; defaults to 'GEN'
    :param test: when True, return only the success flag of the first fetch
    :return: tuple (results, errmsg) - results is a list of download dicts
    """
    errmsg = ''
    provider = "libgen.io"
    if not prov:
        prov = 'GEN'
    host = lazylibrarian.CONFIG[prov + '_HOST']
    if not host.startswith('http'):
        host = 'http://' + host
    # normalise the configured search page; only index.php / search.php are valid
    search = lazylibrarian.CONFIG[prov + '_SEARCH']
    if not search or not search.endswith('.php'):
        search = 'search.php'
    if 'index.php' not in search and 'search.php' not in search:
        search = 'search.php'
    if search[0] == '/':
        search = search[1:]
    sterm = makeUnicode(book['searchterm'])
    page = 1
    results = []
    next_page = True
    while next_page:
        if 'index.php' in search:
            params = {
                "s": book['searchterm'],
                "f_lang": "All",
                "f_columns": 0,
                "f_ext": "All"
            }
        else:
            params = {
                "view": "simple",
                "open": 0,
                "phrase": 0,
                "column": "def",
                "res": 100,
                "req": book['searchterm']
            }
        if page > 1:
            params['page'] = page
        providerurl = url_fix(host + "/%s" % search)
        searchURL = providerurl + "?%s" % urlencode(params)
        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            elif '111' in result:
                # looks like libgen has ip based access limits
                logger.error('Access forbidden. Please wait a while before trying %s again.' % provider)
                errmsg = result
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching page data from %s: %s' % (provider, result))
                errmsg = result
            result = False
        if test:
            return success
        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            try:
                soup = BeautifulSoup(result, 'html5lib')
                rows = []
                try:
                    table = soup.find_all('table', rules='rows')[-1]  # the last table with rules=rows
                    if table:
                        rows = table.find_all('tr')
                except IndexError:
                    # no results table in result page
                    rows = []
                if len(rows) > 1:
                    # skip table headers
                    rows = rows[1:]
                for row in rows:
                    author = ''
                    title = ''
                    size = ''
                    extn = ''
                    link = ''
                    td = row.find_all('td')
                    if 'index.php' in search and len(td) > 3:
                        # Foreign fiction
                        try:
                            author = formatAuthorName(td[0].text)
                            title = td[2].text
                            newsoup = BeautifulSoup(str(td[4]), 'html5lib')
                            data = newsoup.find('a')
                            if data:
                                link = data.get('href')
                                extn = td[4].text.split('(')[0].strip()
                                size = td[4].text.split('(')[1].split(')')[0]
                                size = size.upper()
                        except IndexError as e:
                            logger.debug('Error parsing libgen index.php results: %s' % str(e))
                    elif 'search.php' in search and len(td) > 8:
                        # Non-fiction
                        try:
                            author = formatAuthorName(td[1].text)
                            title = td[2].text
                            size = td[7].text.upper()
                            extn = td[8].text
                            link = ''
                            newsoup = BeautifulSoup(str(td[2]), 'html5lib')
                            # the title cell holds several anchors; the one
                            # containing 'md5' is the book details link
                            for res in newsoup.find_all('a'):
                                output = res.get('href')
                                if 'md5' in output:
                                    link = output
                                    break
                        except IndexError as e:
                            logger.debug('Error parsing libgen search.php results; %s' % str(e))
                    size = size_in_bytes(size)
                    if link and title:
                        if author:
                            title = author.strip() + ' ' + title.strip()
                        if extn:
                            title = title + '.' + extn
                        if link.startswith('http'):
                            url = redirect_url(host, link)
                        else:
                            if "/index.php?" in link:
                                link = 'md5' + link.split('md5')[1]
                            if "/ads.php?" in link:
                                url = url_fix(host + "/" + link)
                            else:
                                url = url_fix(host + "/ads.php?" + link)
                        bookresult, success = fetchURL(url)
                        if not success:
                            logger.debug('Error fetching link data from %s: %s' % (provider, bookresult))
                            logger.debug(url)
                            url = None
                        else:
                            # scan the book page for the actual download link
                            url = None
                            try:
                                new_soup = BeautifulSoup(bookresult, 'html5lib')
                                for link in new_soup.find_all('a'):
                                    output = link.get('href')
                                    if output:
                                        if output.startswith('http') and '/get.php' in output:
                                            url = output
                                            break
                                        elif '/get.php' in output:
                                            url = '/get.php' + output.split('/get.php')[1]
                                            break
                                        elif '/download/book' in output:
                                            url = '/download/book' + output.split('/download/book')[1]
                                            break
                                if url and not url.startswith('http'):
                                    url = url_fix(host + url)
                                else:
                                    url = redirect_url(host, url)
                            except Exception as e:
                                logger.error('%s parsing bookresult for %s: %s' % (type(e).__name__, link, str(e)))
                                url = None
                        if url:
                            results.append({
                                'bookid': book['bookid'],
                                'tor_prov': provider + '/' + search,
                                'tor_title': title,
                                'tor_url': url,
                                'tor_size': str(size),
                                'tor_type': 'direct',
                                'priority': lazylibrarian.CONFIG[prov + '_DLPRIORITY']
                            })
                            logger.debug('Found %s, Size %s' % (title, size))
                            next_page = True
            except Exception as e:
                logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                logger.debug('%s: %s' % (provider, traceback.format_exc()))
        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False
    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
def _doSearch(self, search_params, show=None): results = [] items = {'Season': [], 'Episode': [], 'RSS': []} for mode in search_params.keys(): for search_string in search_params[mode]: if mode == 'RSS': searchURL = self.url + 'index.php?page=torrents&active=1&category=%s' %(';'.join(self.categories[mode])) logger.log(u"PublicHD cache update URL: "+ searchURL, logger.DEBUG) else: searchURL = self.searchurl %(urllib.quote(unidecode(search_string)), ';'.join(self.categories[mode])) logger.log(u"Search string: " + searchURL, logger.DEBUG) html = self.getURL(searchURL) if not html: continue #remove unneccecary <option> lines which are slowing down BeautifulSoup optreg = re.compile( r'<option.*</option>' ) html = os.linesep.join([s for s in html.splitlines() if not optreg.search(s)]) try: soup = BeautifulSoup(html, features=["html5lib", "permissive"]) torrent_table = soup.find('table', attrs = {'id' : 'torrbg'}) torrent_rows = torrent_table.find_all('tr') if torrent_table else [] #Continue only if one Release is found if len(torrent_rows)<2: logger.log(u"The Data returned from " + self.name + " do not contains any torrent", logger.DEBUG) continue for tr in torrent_rows[1:]: try: link = self.url + tr.find(href=re.compile('page=torrent-details'))['href'] title = tr.find(lambda x: x.has_attr('title')).text.replace('_','.') url = tr.find(href=re.compile('magnet+'))['href'] seeders = int(tr.find_all('td', {'class': 'header'})[4].text) leechers = int(tr.find_all('td', {'class': 'header'})[5].text) except (AttributeError, TypeError): continue if mode != 'RSS' and seeders == 0: continue if not title or not url: continue item = title, url, link, seeders, leechers items[mode].append(item) except Exception, e: logger.log(u"Failed to parsing " + self.name + " Traceback: " + traceback.format_exc(), logger.ERROR) #For each search mode sort all the items by seeders items[mode].sort(key=lambda tup: tup[3], reverse=True) results += items[mode]