def download(self, chapter: Chapter) -> int:
    """Download every page image of *chapter* by following the site's
    'next' navigation link until it leaves the chapter.

    Returns the number of pages collected on the chapter.
    Raises ConnectionError when a page request does not return HTTP 200.
    """
    start_url = chapter.url
    url = start_url
    chapter.clear_state()
    while True:
        response = requests.get(url)
        if response.status_code != 200:
            raise ConnectionError(
                _(F'Could not connect with {start_url} site, status code: {response.status_code}'))
        tree = html.fromstring(response.content)
        image_src = str(tree.xpath(self.re_download_path)[0])
        chapter.add_page(requests.get(image_src, stream=True).content)
        nav_next = str(tree.xpath(self.re_download_next_path)[0])
        if nav_next.startswith('/'):
            nav_next = urljoin(self.base_url, nav_next)
        # A next-link that still contains the chapter URL points at another
        # page of the same chapter; anything else is the following chapter.
        if start_url not in nav_next:
            break
        url = nav_next
    return chapter.number_of_pages()
def crawl_detail(self, manga: Manga) -> None:
    """Scrape the manga detail page and add one Chapter per matched link.

    Raises ConnectionError when the detail page does not return HTTP 200.
    """
    start_url = manga.url
    response = requests.get(start_url)
    if response.status_code != 200:
        raise ConnectionError(_(F'Could not connect with {start_url} site, status code: {response.status_code}'))
    tree = html.fromstring(response.content)
    for link in tree.xpath(self.re_chapter_path):
        # anchor text becomes the chapter title (tabs flattened to spaces)
        chapter = Chapter(manga, str(link.xpath('text()')[0]).strip().replace('\t', ' '))
        chapter.url = urljoin(self.base_url, str(link.xpath('@href')[0]))
        manga.add_chapter(chapter)
def download(self, chapter: Chapter) -> int:
    """Render the chapter page with Selenium and download every image
    matched by ``self.re_download_path``.

    Returns the number of pages collected on the chapter.
    Any failure is re-raised as ConnectionError.
    """
    start_url = chapter.url
    try:
        with SeleniumDriver() as driver:
            driver.get(start_url)
            wait_for_page(driver, self.re_download_path)
            chapter.clear_state()
            page_source = driver.find_element_by_xpath('//*').get_attribute('outerHTML')
            document = html.fromstring(page_source)
            for img in document.xpath(self.re_download_path):
                src = str(img.xpath('@src')[0])
                chapter.add_page(requests.get(src, stream=True).content)
            return chapter.number_of_pages()
    except Exception as e:
        raise ConnectionError(_(F'Could not connect with {start_url} site, error message: {e}'))
def crawl_detail(self, manga: Manga) -> None:
    """Render the manga detail page with Selenium and add one Chapter per
    link matched by ``self.re_chapter_path``.

    Any failure is re-raised as ConnectionError.
    """
    start_url = manga.url
    try:
        with SeleniumDriver() as driver:
            driver.get(start_url)
            wait_for_page(driver, self.re_chapter_path)
            page_source = driver.find_element_by_xpath('//*').get_attribute('outerHTML')
            document = html.fromstring(page_source)
            # crawl for manga chapters
            for anchor in document.xpath(self.re_chapter_path):
                chapter = Chapter(manga, str(anchor.xpath('text()')[0]).strip().replace('\t', ' '))
                chapter.url = urljoin(self.base_url, str(anchor.xpath('@href')[0]))
                manga.add_chapter(chapter)
    except Exception as e:
        raise ConnectionError(_(F'Could not connect with {start_url} site, error message: {e}'))
def test_make_pdf2(self):
    """Make pdf from pages in memory - simulated by manually added pages."""
    directory_name = os.path.dirname(__file__)
    dummy_manga_site = MangaSite('test_site')
    dummy_manga = Manga('test_manga', 'test_manga_url', dummy_manga_site)
    dummy_chapter = Chapter(dummy_manga, 'test_chapter_title')
    self.assertEqual(dummy_chapter.number_of_pages(), 0)
    source_images_dir = os.path.join(directory_name, 'images')
    test_tmp_dir = os.path.join(directory_name, RESULTS_DIRECTORY)
    # Remove the temporary results directory even when an assertion fails
    # (previously the cleanup only ran if every assertion passed).
    self.addCleanup(shutil.rmtree, test_tmp_dir, ignore_errors=True)
    for file_name in os.listdir(source_images_dir):
        file_path = os.path.join(source_images_dir, file_name)
        if os.path.isfile(file_path):
            with open(file_path, 'rb') as f:
                dummy_chapter.add_page(f.read())
    result, path_to_pdf = dummy_chapter.make_pdf(test_tmp_dir)
    self.assertTrue(result)
    # all 4 fixture images were added as in-memory pages
    self.assertEqual(dummy_chapter.number_of_pages(), 4)
    self.assertTrue(os.path.isfile(path_to_pdf))
    self.assertGreater(os.path.getsize(path_to_pdf), 0)
    os.unlink(path_to_pdf)
def test_make_pdf1(self):
    """Make pdf from previously downloaded images - simulated on copied files."""
    directory_name = os.path.dirname(__file__)
    dummy_manga_site = MangaSite('test_site')
    dummy_manga = Manga('test_manga', 'test_manga_url', dummy_manga_site)
    dummy_chapter = Chapter(dummy_manga, 'test_chapter_title')
    self.assertEqual(dummy_chapter.number_of_pages(), 0)
    source_images_dir = os.path.join(directory_name, 'images')
    test_tmp_dir = os.path.join(directory_name, RESULTS_DIRECTORY)
    # Remove the temporary results directory even when an assertion fails
    # (previously the cleanup only ran if every assertion passed).
    self.addCleanup(shutil.rmtree, test_tmp_dir, ignore_errors=True)
    images_dir = dummy_chapter.get_download_path(test_tmp_dir)
    os.makedirs(images_dir, exist_ok=True)
    for file_name in os.listdir(source_images_dir):
        file_path = os.path.join(source_images_dir, file_name)
        if os.path.isfile(file_path):
            shutil.copy(file_path, images_dir)
    result, path_to_pdf = dummy_chapter.make_pdf(test_tmp_dir)
    self.assertTrue(result)
    # pages live on disk, not in memory, so the in-memory count stays 0
    self.assertEqual(dummy_chapter.number_of_pages(), 0)
    self.assertTrue(os.path.isfile(path_to_pdf))
    self.assertGreater(os.path.getsize(path_to_pdf), 0)
    os.unlink(path_to_pdf)
def download(self, chapter: Chapter) -> int:
    """Download all pages of *chapter* from a site that serves numbered
    image URLs (e.g. ``.../name-001.jpg``, ``.../name-002.jpg``...).

    The first page URL is scraped from the chapter page, then the
    remaining page URLs are synthesised by incrementing the zero-padded
    counter at the end of that URL.

    Returns the number of pages collected on the chapter.
    Raises ConnectionError when the chapter page is not HTTP 200.
    """
    start_url = chapter.url
    url = start_url
    chapter.clear_state()
    # one Session reuses the TCP connection across all page downloads
    with requests.Session() as s:
        response = s.get(url)
        if response.status_code == 200:
            tree = html.fromstring(response.content)
            # page count = number of <option> entries in the page-select dropdown
            pages_count = len(tree.xpath('/html/body/div[2]/div[4]/div/div/span/select[2]/option'))
            # URL of the first page image, e.g. '.../name-001.jpg'
            page_url_start = str(tree.xpath(self.re_download_path)[0])
            # split off the file extension, e.g. '.jpg'
            image_ext = page_url_start[page_url_start.rfind('.'):]
            page_url_start = page_url_start[:page_url_start.rfind('.')]
            # zero-pad width of the page counter, taken from the last
            # '-'-separated segment (e.g. '001' -> 3); order matters:
            # the extension must already be stripped at this point
            digits_for_page_number = len(page_url_start.split('-')[-1])
            # strip the counter segment, leaving the common URL prefix
            page_url_start = page_url_start[:page_url_start.rfind('-')]
            for page_number in range(1, 1 + pages_count):
                # rebuild each page URL with the counter padded to the same width
                image = s.get(F'{page_url_start}-{page_number:0{digits_for_page_number}}{image_ext}', stream=True).content
                chapter.add_page(image)
        else:
            raise ConnectionError(
                _(F'Could not connect with {start_url} site, status code: {response.status_code}'))
    return chapter.number_of_pages()
def test_manga_1(self):
    """Exercise Manga state flags, chapter list handling and serialisation."""
    dummy_manga_site = MangaSite('test_site')
    dummy_manga1 = Manga('test_manga', 'test_manga_url', dummy_manga_site)
    self.assertEqual(dummy_manga1.title, 'test_manga')
    dummy_manga1.downloaded = True
    self.assertTrue(dummy_manga1.downloaded)
    dummy_chapter1 = Chapter(dummy_manga1)
    dummy_manga1.add_chapter(dummy_chapter1)
    self.assertGreater(len(dummy_manga1.chapters), 0)
    dummy_manga1.clear_state()
    self.assertFalse(dummy_manga1.downloaded)
    self.assertEqual(len(dummy_manga1.chapters), 0)
    # get_download_path must fail on a cleared manga.  The original
    # try/except pattern silently passed when no exception was raised
    # at all; assertRaises makes the expectation explicit.
    with self.assertRaises(AttributeError) as ctx:
        dummy_manga1.get_download_path(os.getcwd())
    self.assertIn("NoneType", str(ctx.exception))
    dump = dummy_manga1.dump()
    self.assertIsInstance(dump, bytes)
    self.assertGreater(len(dump), 0)
def crawl_chapter(self, chapter: Chapter):
    """Download *chapter* via the crawler registered for the current site.

    Silently does nothing when no crawler exists for the site name.
    """
    crawler = self.__get_crawler(self.cwd_site.site_name)
    if crawler:
        crawler.download(chapter)
        chapter.set_downloaded(True)
def test_chapter_1(self):
    """Exercise Chapter construction, state flags and serialisation."""
    dummy_manga_site = MangaSite('test_site')
    dummy_manga = Manga('test_manga', 'test_manga_url', dummy_manga_site)
    dummy_chapter1 = Chapter(dummy_manga)
    self.assertEqual(dummy_chapter1.number_of_pages(), 0)
    self.assertIsNone(dummy_chapter1.title)
    dummy_chapter2 = Chapter(dummy_manga, 'test_chapter_title')
    self.assertEqual(dummy_chapter2.title, 'test_chapter_title')
    dummy_chapter2.set_downloaded(True)
    self.assertTrue(dummy_chapter2.downloaded)
    self.assertTrue(dummy_chapter2.in_memory)
    dummy_chapter2.clear_state()
    self.assertFalse(dummy_chapter2.downloaded)
    self.assertFalse(dummy_chapter2.in_memory)
    # get_download_path must fail for a cleared chapter.  The original
    # try/except pattern silently passed when no exception was raised
    # at all; assertRaises makes the expectation explicit.
    with self.assertRaises(AttributeError) as ctx:
        dummy_chapter2.get_download_path(os.getcwd())
    self.assertIn("NoneType", str(ctx.exception))
    dump = dummy_chapter2.dump()
    self.assertIsInstance(dump, bytes)
    self.assertGreater(len(dump), 0)