def get_info_from_page(self):
    # Hand-maintained exceptions override the regular URL pattern.
    if self.current_url in self.exception_dictionary:
        return PageReturn(self.exception_dictionary[self.current_url], self.current_url)
    next_url = self.url_pattern.format(self.current_page + 1)
    req = requests.get(next_url)
    if req.ok:
        return PageReturn(next_url, self.current_url)
    else:
        return PageReturn(None, self.current_url)
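# PageReturn itself is not shown in this excerpt. A minimal sketch, assuming it
# is a plain two-field container holding the next page's URL (None on the last
# page) and an identifier for the current page (its title or URL); the field
# names below are guesses, not the project's actual definition.
from collections import namedtuple

PageReturn = namedtuple('PageReturn', ['next_link', 'current_title'])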
def test_mcninja_next(self):
    expected_results = \
        {"http://drmcninja.com/archives/comic/5p52/":
             PageReturn("http://drmcninja.com/archives/comic/7p1/",
                        "D.A.R.E To Resist Ninja Drugs and Ninja Violence Part 2 p52"),
         "http://drmcninja.com/archives/comic/11p16/":
             PageReturn("http://drmcninja.com/archives/comic/11p17/", "Punch Dracula p16"),
         "http://drmcninja.com/archives/comic/11p56/":
             PageReturn("http://drmcninja.com/archives/comic/12p1/", "Punch Dracula p56"),
         "http://drmcninja.com/archives/comic/33p147/":
             PageReturn(None, "The End: Part 2 p147")}
    self.next_method_asserts(webcomic_list.mcninja_info, expected_results)
def test_smbc_next(self):
    req = requests.get("https://www.smbc-comics.com/comic/2002-09-05")
    soup = BeautifulSoup(req.text, "html.parser")
    last_page_url = soup.find('a', class_='last').get('href')
    req = requests.get(last_page_url)
    soup = BeautifulSoup(req.text, "html.parser")
    trim_length = len("Saturday Morning Breakfast Cereal - ")
    last_page_title = soup.find('title').string[trim_length:]
    expected_results = \
        {"https://www.smbc-comics.com/comic/2002-09-05":
             PageReturn("https://www.smbc-comics.com/comic/2002-09-07", "2002-09-05"),
         "https://www.smbc-comics.com/comic/the-truth":
             PageReturn("https://www.smbc-comics.com/comic/statistical-flowers-for-algernon",
                        "The Truth"),
         last_page_url: PageReturn(None, last_page_title)}
    self.next_method_asserts(webcomic_list.smbc_info, expected_results)
def test_cucumber_quest_next(self):
    req = requests.get("http://cucumber.gigidigi.com/cq/page-1/")
    soup = BeautifulSoup(req.text, "html.parser")
    last_page_url = soup.find('a', class_='last-webcomic-link').get('href')
    req = requests.get(last_page_url)
    soup = BeautifulSoup(req.text, "html.parser")
    last_page_title = soup.find('header', class_='post-header').find('h1').string
    expected_results = \
        {"http://cucumber.gigidigi.com/cq/page-1/":
             PageReturn("http://cucumber.gigidigi.com/cq/page-2/", "page 1"),
         "http://cucumber.gigidigi.com/cq/page-77/":
             PageReturn("http://cucumber.gigidigi.com/cq/bonus/", "page 77"),
         last_page_url: PageReturn(None, last_page_title)}
    self.next_method_asserts(webcomic_list.cucumber_quest_info, expected_results)
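# next_method_asserts is defined elsewhere in the test class. A plausible
# sketch, assuming it fetches each key URL over the network, parses it, runs
# the supplied info function, and compares the result against the expected
# PageReturn. It is written here as a standalone function for illustration;
# in the project it would be a method on the test case class, and the real
# helper may differ.
import requests
from bs4 import BeautifulSoup

def next_method_asserts(self, info_function, expected_results):
    for url, expected in expected_results.items():
        req = requests.get(url)
        soup = BeautifulSoup(req.text, "html.parser")
        self.assertEqual(info_function(soup), expected)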
def cucumber_quest_info(soup):
    current_title = soup.find('header', class_='post-header').find('h1').string
    next_link_button = soup.find('a', class_='next-webcomic-link')
    # On the last page the "next" link points back at the current comic and
    # carries the 'current-webcomic' class, so there is no page after this one.
    if 'current-webcomic' in next_link_button['class']:
        next_link = None
    else:
        next_link = next_link_button.get('href')
    return PageReturn(next_link, current_title)
def smbc_info(soup):
    # Page titles look like "Saturday Morning Breakfast Cereal - <comic title>";
    # strip the site prefix and keep only the comic's own title.
    trim_length = len("Saturday Morning Breakfast Cereal - ")
    current_title = soup.find('title').string[trim_length:]
    next_link_button = soup.find('a', class_='next')
    if next_link_button is None:
        next_link = None  # no "next" link on the most recent comic
    else:
        next_link = next_link_button.get('href')
    return PageReturn(next_link, current_title)
def mcninja_info(soup, current_title=None):
    if current_title is None:
        current_series = soup.find('select', id='series_select').find(
            'option', selected=True).string
        current_page = soup.find('select', id='page_select').find(
            'option', selected=True).string
        current_title = '{} p{}'.format(current_series, current_page)
    next_link = soup.find('link', rel='next')
    if next_link:
        next_url = next_link.get('href')
        if '/archives/comic/' in next_url:
            # We found the next page
            return PageReturn(next_url, current_title)
        else:
            # This next url is just a news post, fetch it recursively to find
            # the real next comic page
            req = requests.get(next_url)
            soup = BeautifulSoup(req.text, "html.parser")
            return mcninja_info(soup, current_title)
    else:
        # The last page
        return PageReturn(None, current_title)
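# Example use of mcninja_info against a live page (URL taken from the test
# above); this needs network access and assumes the site's markup is unchanged.
import requests
from bs4 import BeautifulSoup

req = requests.get("http://drmcninja.com/archives/comic/5p52/")
soup = BeautifulSoup(req.text, "html.parser")
print(mcninja_info(soup))  # expected to point at .../archives/comic/7p1/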
def get_info_from_page(self):
    if self.current_page == 0:
        # Nothing has been fetched yet; start at the comic's first page.
        return PageReturn(self.first_page_url, None)
    req = requests.get(self.current_url)
    soup = BeautifulSoup(req.text, "html.parser")
    return self.page_info_function(soup)
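# A sketch of how a crawler might drive get_info_from_page, assuming the class
# tracks current_page/current_url as shown above; walk_pages and the next_link
# field name (from the PageReturn sketch earlier) are illustrative only, not
# part of the original code.
def walk_pages(self):
    while True:
        result = self.get_info_from_page()
        if result.next_link is None:
            break  # reached the last known page
        self.current_url = result.next_link
        self.current_page += 1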
def mcninja_next(soup):
    # Guard against pages with no "next" anchor (e.g. the last page).
    next_link_button = soup.find('a', class_='next')
    next_link = next_link_button.get('href') if next_link_button else None
    return PageReturn(next_link, None)