def last_entry(self):
    # The newest torrent sits in the first 'tlistrow' row; its detail link
    # carries the torrent id as a 'tid' query parameter.
    r = retry_on_fail(requests.get, self.url)
    soup = BeautifulSoup(r.text, 'lxml')
    link = soup.find('tr', class_='tlistrow').find('td', class_='tlistname').a['href']
    return int(re.search('tid=([0-9]*)', link).group(1))
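# retry_on_fail() is used throughout this module but is not defined in this
# excerpt. A minimal sketch consistent with its call sites: it is handed a
# requests function plus a URL, returns the Response on success, and returns
# something falsy once retries are exhausted (matching the `if not response`
# check in retrieve_anime() below). The retry count and delay are assumptions.
import time

import requests


def retry_on_fail(func, *args, retries=3, delay=5, **kwargs):
    """Call func(*args, **kwargs), retrying on transient network errors."""
    for _ in range(retries):
        try:
            return func(*args, **kwargs)
        except requests.exceptions.RequestException:
            time.sleep(delay)
    return None  # all retries failed; callers treat a falsy result as failure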
def baka_list(self):
    # Scrape one page of the BakaBT browse listing. Normal rows have five
    # cells; rows that appear to inherit their category from the previous
    # row have only four, and 'Alternative versions' rows are skipped.
    # The derived attributes (self.category, self.title, self.resolution,
    # ...) are presumably filled in from td1..td5 by code not shown in this
    # excerpt.
    page_index = {}
    page = self.url + 'browse.php?page=' + self.pagen
    self.last_category = ''
    print('> Index page', self.pagen, 'of', self.pagentotal)
    r = retry_on_fail(requests.get, page)
    r.encoding = 'utf-8'
    self.page = BeautifulSoup(r.text, 'lxml')
    table = self.page.find('table', class_='torrents').find('tbody').find_all('tr')
    for tds in table:
        tdx = 0
        for td in tds.find_all('td'):
            append = False
            tdx += 1
            if len(tds) == 5:
                append = True
                if tdx == 1:
                    self.td1 = td
                elif tdx == 2:
                    self.td2 = td
                elif tdx == 3:
                    self.td3 = td
                elif tdx == 4:
                    self.td4 = td
                elif tdx == 5:
                    self.td5 = td
            elif len(tds) == 4 and 'Alternative versions' not in tds.text:
                # Four-cell rows are shifted by one: their first cell is td2.
                append = True
                if tdx == 1:
                    self.td2 = td
                elif tdx == 2:
                    self.td3 = td
                elif tdx == 3:
                    self.td4 = td
                elif tdx == 4:
                    self.td5 = td
            if append and ((len(tds) == 5 and tdx == 5)
                           or (len(tds) == 4 and tdx == 4)):
                # Last cell of the row reached: record the torrent.
                baka_id = self.baka_url_id
                page_index[baka_id] = {
                    'baka_url_id': self.baka_url_id,
                    'baka_url': self.baka_url,
                    'category': (self.category if len(tds) == 5
                                 else self.last_category),
                    'title_orig': self.title_orig,
                    'title': self.title,
                    'resolution': self.resolution,
                    'sb': str(self.sb),
                    'cb': str(self.cb),
                    'tags': str(self.tags),
                    'added': self.added,
                    'size': self.size,
                    'sld': str(self.sld),
                }
    return page_index
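# For reference, each page_index entry produced above maps a BakaBT id to a
# flat dict of strings. A hedged illustration -- the keys come from the code
# above, but every value shown here is invented:
#
# page_index['12345'] = {
#     'baka_url_id': '12345',
#     'baka_url': 'https://bakabt.me/torrent/12345/',  # hypothetical URL
#     'category': 'Anime Series',
#     'title_orig': '...', 'title': '...', 'resolution': '1080p',
#     'sb': '...', 'cb': '...', 'tags': "['action']",
#     'added': '...', 'size': '...', 'sld': '...',
# }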
def __init__(self, baka_url, baka_title):
    self.baka_url = baka_url
    self.title = baka_title
    self.exists = True
    r = retry_on_fail(requests.get, self.baka_url)
    r.encoding = 'utf-8'
    self.page = BeautifulSoup(r.text, 'lxml')
def magnet(self):
    # download_url ends in '&magnet=1', so the tracker is assumed to answer
    # with a redirect to a 'magnet:' URI; requests.head() does not follow
    # redirects by default, which leaves that URI in the Location header.
    try:
        r = retry_on_fail(requests.head, self.download_url)
        magnet_uri = r.headers.get('Location', '')
        if 'magnet' not in magnet_uri:
            print('Aliased torrent, skipping...')
            return None
        return magnet_uri
    except Exception:
        return None
def last_entry(url):
    # The highest page number linked from the pager is the last page of the
    # browse listing.
    baka_url = url + 'browse.php'
    r = retry_on_fail(requests.get, baka_url)
    r.encoding = 'utf-8'
    page = BeautifulSoup(r.text, 'lxml')
    pager = page.find('div', class_='pager')
    pages = []
    for link in pager.find_all('a', href=True):
        # Keep only the trailing digits of each pager href, if any.
        page_no = re.sub('.*?([0-9]*)$', r'\1', link['href'])
        if page_no:
            pages.append(int(page_no))
    # Compare numerically, not lexicographically ('9' would outrank '10').
    return max(pages)
def last_entry(self):
    # MAL's 'Just Added' listing (anime.php?o=9) has the newest entry in the
    # first data row; its id is the fifth path segment of the detail URL
    # (https://myanimelist.net/anime/<id>/<title>).
    try:
        r = retry_on_fail(requests.get, self.url + 'anime.php?o=9')
        soup = BeautifulSoup(r.text, 'lxml')
        seasonal = soup.find('div', class_='js-categories-seasonal')
        link = seasonal.find_all('tr')[1].find('td').a['href']
        if 'myanimelist.net/anime' in link:
            return int(link.split('/')[4])
        sys.exit('Failed to retrieve last_entry')
    except Exception:
        sys.exit('Failed to retrieve last_entry')
def torrent(self):
    # The download link on the detail page is relative, so rebuild an
    # absolute URL from the scheme and host of the page we came from.
    for link in self.page.find_all('a', class_='download_link', href=True):
        if '.torrent' in link['href']:
            parsed_uri = urlparse(self.baka_url)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            self.full_baka = domain + link['href']
            r = retry_on_fail(requests.get, self.full_baka)
            if r.content:
                return r.content
        elif link['href'] == '#':
            # Placeholder link: no torrent available.
            return None
        else:
            print('no link')
            return None
def __init__(self, nyaa, nyaa_id):
    self.info_url = '{}{}'.format(nyaa.info_url, nyaa_id)
    self.download_url = '{}{}&magnet=1'.format(nyaa.dl_url, nyaa_id)
    self.nyaa_id = str(nyaa_id)
    r = retry_on_fail(requests.get, self.info_url)
    r.encoding = 'utf-8'
    self.page = BeautifulSoup(r.text, 'lxml')
    # Nyaa serves an error page (rather than an HTTP 404) for missing or
    # deleted torrents, so detect those cases from the page text.
    content = self.page.find('div', class_='content').text
    if 'The torrent you are looking for does not appear to be in the database' in content:
        self.exists = False
    elif 'The torrent you are looking for has been deleted' in content:
        self.exists = False
    else:
        self.exists = True
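# A hedged usage sketch tying the nyaa-side pieces together; `NyaaTorrent`
# (a name for the class above) and the `nyaa` settings object are
# hypothetical -- only the attributes and methods shown in this excerpt
# (exists, nyaa_id, magnet()) are taken from the code:
#
# entry = NyaaTorrent(nyaa, 123456)   # hypothetical class name and id
# if entry.exists:
#     link = entry.magnet()           # magnet URI string, or None
#     if link:
#         print('magnet for', entry.nyaa_id, '->', link)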
def retrieve_anime(id_ref=1, requester=request_passthrough):
    """Return the metadata for a particular show.

    Args:
        id_ref (Optional(int)): Internal show identifier.
        requester (Optional(requests-like)): HTTP request maker.
            This allows us to control/limit/mock requests.

    Return:
        An HTTP status code (int) if we failed to download the page,
        otherwise a tuple of two dicts (retrieval information, anime
        information).

        The retrieval information will include the keys:
            success (bool): Was *all* the information retrieved?
                (Some keys from the anime information may be missing
                otherwise.)
            scraper_retrieved_at (datetime): When the request was completed.
            id_ref (int): id_ref of this anime.

        The anime information will include the keys:
            See tests/mal_scraper/test_anime.py::test_download_first
    """
    url = get_url_from_id_ref(id_ref)
    # response = requester.get(url, headers={'User-agent': 'test'})
    # (custom user agent to avoid 429 Too Many Requests errors)
    response = retry_on_fail(requests.get, url)
    if not response:
        return 404
    if not response.ok:
        return response.status_code

    soup = BeautifulSoup(response.content, 'html.parser')
    success, info = _process_soup(soup)
    if not success:
        logger.warning('Failed to properly process the page "%s".', url)

    retrieval_info = {
        'success': success,
        'scraper_retrieved_at': datetime.utcnow(),
        'id_ref': id_ref,
    }
    return (retrieval_info, info)
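# A hedged usage sketch for retrieve_anime(): it returns an int status code
# on failure and a (retrieval_info, anime_info) tuple on success, so callers
# branch on the type of the result:
#
# result = retrieve_anime(id_ref=1)
# if isinstance(result, int):
#     print('Download failed with HTTP status', result)
# else:
#     retrieval_info, anime_info = result
#     if retrieval_info['success']:
#         print('Retrieved at', retrieval_info['scraper_retrieved_at'])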