def crawl_detail(self, manga: Manga) -> None:
    """Populate *manga* with its chapters scraped from the manga's detail page.

    Fetches ``manga.url`` over HTTP, extracts every chapter link matched by
    ``self.re_chapter_path`` and appends a :class:`Chapter` per link.

    :param manga: manga whose detail page is crawled; mutated in place.
    :raises ConnectionError: when the HTTP status code is not 200.
    """
    start_url = manga.url
    # Bound the request so a stalled server cannot hang the crawler forever.
    response = requests.get(start_url, timeout=30)
    if response.status_code == 200:
        tree = html.fromstring(response.content)
        for element in tree.xpath(self.re_chapter_path):
            # Normalize the title: strip edge whitespace, collapse tabs to spaces.
            title = str(element.xpath('text()')[0]).strip().replace('\t', ' ')
            # Chapter hrefs may be relative; resolve them against the site root.
            url = urljoin(self.base_url, str(element.xpath('@href')[0]))
            chapter = Chapter(manga, title)
            chapter.url = url
            manga.add_chapter(chapter)
    else:
        raise ConnectionError(_(F'Could not connect with {start_url} site, status code: {response.status_code}'))
def test_make_pdf2(self):
    """ Make pdf from pages in memory - simulated by manually added pages """
    directory_name = os.path.dirname(__file__)
    dummy_manga_site = MangaSite('test_site')
    dummy_manga = Manga('test_manga', 'test_manga_url', dummy_manga_site)
    dummy_chapter = Chapter(dummy_manga, 'test_chapter_title')
    self.assertTrue(dummy_chapter.number_of_pages() == 0)
    source_images_dir = os.path.join(directory_name, 'images')
    test_tmp_dir = os.path.join(directory_name, RESULTS_DIRECTORY)
    # sorted(): os.listdir order is filesystem-dependent, so without it the
    # pages would be added to the chapter in a nondeterministic order.
    for file in sorted(os.listdir(source_images_dir)):
        file_path = os.path.join(source_images_dir, file)
        if os.path.isfile(file_path):
            with open(file_path, 'rb') as f:
                dummy_chapter.add_page(f.read())
    result, path_to_pdf = dummy_chapter.make_pdf(test_tmp_dir)
    self.assertTrue(result)
    # The images fixture directory is expected to hold exactly 4 files.
    self.assertTrue(dummy_chapter.number_of_pages() == 4)
    self.assertTrue(os.path.isfile(path_to_pdf))
    self.assertTrue(os.path.getsize(path_to_pdf) > 0)
    # Clean up both the produced PDF and the temporary results directory.
    os.unlink(path_to_pdf)
    shutil.rmtree(test_tmp_dir, ignore_errors=True)
def test_make_pdf1(self):
    """ Make pdf from previously downloaded images - simulated on copied files """
    base_dir = os.path.dirname(__file__)
    site = MangaSite('test_site')
    manga = Manga('test_manga', 'test_manga_url', site)
    chapter = Chapter(manga, 'test_chapter_title')
    # A fresh chapter starts with no in-memory pages.
    self.assertTrue(chapter.number_of_pages() == 0)
    fixtures_dir = os.path.join(base_dir, 'images')
    tmp_dir = os.path.join(base_dir, RESULTS_DIRECTORY)
    download_dir = chapter.get_download_path(tmp_dir)
    os.makedirs(download_dir, exist_ok=True)
    # Simulate a prior download by copying the fixture images on disk.
    for entry in os.listdir(fixtures_dir):
        source = os.path.join(fixtures_dir, entry)
        if not os.path.isfile(source):
            continue
        shutil.copy(source, download_dir)
    ok, pdf_path = chapter.make_pdf(tmp_dir)
    self.assertTrue(ok)
    # Pages came from disk, not memory, so the in-memory count stays zero.
    self.assertTrue(chapter.number_of_pages() == 0)
    self.assertTrue(os.path.isfile(pdf_path))
    self.assertTrue(os.path.getsize(pdf_path) > 0)
    # Remove the generated PDF and the scratch directory.
    os.unlink(pdf_path)
    shutil.rmtree(tmp_dir, ignore_errors=True)
def crawl_detail(self, manga: Manga) -> None:
    """Collect the chapters of *manga* using a Selenium-driven browser.

    Loads ``manga.url``, waits until ``self.re_chapter_path`` matches, then
    parses the rendered page and adds a :class:`Chapter` per matching link.

    :param manga: manga whose detail page is crawled; mutated in place.
    :raises ConnectionError: wrapping any failure that occurs while crawling.
    """
    start_url = manga.url
    try:
        with SeleniumDriver() as driver:
            driver.get(start_url)
            wait_for_page(driver, self.re_chapter_path)
            # Grab the fully rendered document and hand it to lxml.
            rendered = driver.find_element_by_xpath('//*').get_attribute('outerHTML')
            document = html.fromstring(rendered)
            # crawl for manga chapters
            for link in document.xpath(self.re_chapter_path):
                chapter_title = str(link.xpath('text()')[0]).strip().replace('\t', ' ')
                chapter = Chapter(manga, chapter_title)
                chapter.url = urljoin(self.base_url, str(link.xpath('@href')[0]))
                manga.add_chapter(chapter)
    except Exception as e:
        raise ConnectionError(_(F'Could not connect with {start_url} site, error message: {e}'))
def test_manga_1(self):
    """Exercise Manga state: flags, chapters, clear_state, path errors, dump."""
    dummy_manga_site = MangaSite('test_site')
    dummy_manga1 = Manga('test_manga', 'test_manga_url', dummy_manga_site)
    self.assertTrue(dummy_manga1.title == 'test_manga')
    dummy_manga1.downloaded = True
    self.assertTrue(dummy_manga1.downloaded)
    dummy_chapter1 = Chapter(dummy_manga1)
    dummy_manga1.add_chapter(dummy_chapter1)
    self.assertTrue(len(dummy_manga1.chapters) > 0)
    # clear_state must reset both the downloaded flag and the chapter list.
    dummy_manga1.clear_state()
    self.assertFalse(dummy_manga1.downloaded)
    self.assertTrue(len(dummy_manga1.chapters) == 0)
    # The original try/except passed silently when NO exception was raised;
    # assertRaises makes the expected failure explicit and mandatory.
    with self.assertRaises(AttributeError) as ctx:
        dummy_manga1.get_download_path(os.getcwd())
    self.assertTrue("NoneType" in str(ctx.exception))
    dump = dummy_manga1.dump()
    self.assertIsInstance(dump, bytes)
    self.assertTrue(len(dump) > 0)
def test_manga_site_1(self):
    """Exercise MangaSite construction, manga registry, clear_state and dump."""
    # A site built without a name reports None for it.
    unnamed_site = MangaSite()
    self.assertIsNone(unnamed_site.site_name)
    named_site = MangaSite('test_site')
    manga = Manga('test_manga', 'test_manga_url', named_site)
    named_site.add_manga(manga)
    self.assertTrue(len(named_site.mangas) > 0)
    # clear_state must empty the manga registry.
    named_site.clear_state()
    self.assertTrue(len(named_site.mangas) == 0)
    # dump() serializes the site to a non-empty bytes payload.
    serialized = named_site.dump()
    self.assertIsInstance(serialized, bytes)
    self.assertTrue(len(serialized) > 0)
def crawl_index(self, manga_site: MangaSite) -> None:
    """Build the manga index for *manga_site* from the site's index page.

    Fetches the index URL, extracts every entry matched by
    ``self.re_index_path`` and registers a :class:`Manga` per entry.

    :param manga_site: site object to populate; mutated in place.
    :raises ConnectionError: when the HTTP status code is not 200.
    """
    start_url = urljoin(self.base_url, self.manga_index)
    # Bound the request so a stalled server cannot hang the crawler forever.
    response = requests.get(start_url, timeout=30)
    if response.status_code == 200:
        manga_site.url = self.base_url
        tree = html.fromstring(response.content)
        for element in tree.xpath(self.re_index_path):
            # Normalize the title: strip edge whitespace, collapse tabs to spaces.
            title = str(element.xpath('text()')[0]).strip().replace('\t', ' ')
            # Index hrefs may be relative; resolve them against the site root.
            url = urljoin(self.base_url, str(element.xpath('@href')[0]))
            manga = Manga(title, url, manga_site)
            manga_site.add_manga(manga)
    else:
        raise ConnectionError(
            _(F'Could not connect with {start_url} site, status code: {response.status_code}'))
def test_chapter_1(self):
    """Exercise Chapter flags, clear_state, download-path errors and dump."""
    dummy_manga_site = MangaSite('test_site')
    dummy_manga = Manga('test_manga', 'test_manga_url', dummy_manga_site)
    dummy_chapter1 = Chapter(dummy_manga)
    self.assertTrue(dummy_chapter1.number_of_pages() == 0)
    self.assertIsNone(dummy_chapter1.title)
    dummy_chapter2 = Chapter(dummy_manga, 'test_chapter_title')
    self.assertTrue(dummy_chapter2.title == 'test_chapter_title')
    # set_downloaded(True) also marks the chapter as held in memory.
    dummy_chapter2.set_downloaded(True)
    self.assertTrue(dummy_chapter2.downloaded)
    self.assertTrue(dummy_chapter2.in_memory)
    # clear_state must reset both flags.
    dummy_chapter2.clear_state()
    self.assertFalse(dummy_chapter2.downloaded)
    self.assertFalse(dummy_chapter2.in_memory)
    # The original try/except passed silently when NO exception was raised;
    # assertRaises makes the expected failure explicit and mandatory.
    with self.assertRaises(AttributeError) as ctx:
        dummy_chapter2.get_download_path(os.getcwd())
    self.assertTrue("NoneType" in str(ctx.exception))
    dump = dummy_chapter2.dump()
    self.assertIsInstance(dump, bytes)
    self.assertTrue(len(dump) > 0)
def crawl_index(self, manga_site: MangaSite) -> None:
    """Build the manga index for *manga_site*, following 'next page' links.

    Drives a Selenium browser through every index page: each rendered page is
    parsed with lxml, every entry matched by ``self.re_index_path`` becomes a
    :class:`Manga`, and crawling continues while a pagination link whose text
    contains 'next' is present.

    :param manga_site: site object to populate; mutated in place.
    :raises ConnectionError: wrapping any failure that occurs while crawling.
    """
    start_url = urljoin(self.base_url, self.manga_index)
    try:
        with SeleniumDriver() as driver:
            collected_all_pages = False
            driver.get(start_url)
            wait_for_page(driver, self.re_index_path)
            manga_site.url = self.base_url
            while not collected_all_pages:
                content = driver.find_element_by_xpath('//*').get_attribute('outerHTML')
                tree = html.fromstring(content)
                for element in tree.xpath(self.re_index_path):
                    title = str(element.xpath('text()')[0]).strip().replace('\t', ' ')
                    url = urljoin(self.base_url, str(element.xpath('@href')[0]))
                    manga = Manga(title, url, manga_site)
                    manga_site.add_manga(manga)
                # Case-insensitive match against a constant: the original
                # recomputed 'Next'.lower() for every candidate link.
                for element2 in tree.xpath(self.re_index_next_page):
                    if 'next' in element2.xpath('text()')[0].lower():
                        driver.get(urljoin(self.base_url, element2.xpath('@href')[0]))
                        break
                else:
                    # No pagination link found: the whole index is collected.
                    collected_all_pages = True
    except Exception as e:
        raise ConnectionError(_(F'Could not connect with {start_url} site, error message: {e}'))