def test_no_redirect(self):
    cache = TestCache()
    cache.put('https://www.example.com/blog', self.NON_INDEX)
    ebook = Ebook('https://www.example.com/blog', 10, cache)
    try:
        ebook.assemble()
        self.fail('Expected ebook.assemble to throw')
    except ListingNotFoundError as expected:
        self.assertEqual('No listing filter found', expected.message)
class EbookTest(unittest.TestCase):

    def setUp(self):
        path_to_text = os.path.join(THIS_DIR, "test_data", "zen_en.txt")
        self.ebook = Ebook(path_to_text)

    def test_1_read(self):
        self.ebook.read()
        self.assertIsInstance(self.ebook.content, str)

    def test_2_split_book_into_sentences(self):
        self.ebook.split_into_sentences()
        self.assertIsInstance(self.ebook.sentences, list)
def make_ebook(self):
    """Make an ebook object by iterating through each chapter URL,
    converting it into a chapter object, and appending it to the
    ebook's chapter list."""
    ebook = Ebook(self.title)
    for link in self.get_chapter_links():
        chapter = self.makeChapter(link)
        ebook.chapters.append(chapter)
    return ebook
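# A minimal sketch of what makeChapter might look like; hypothetical, since
# the snippet above only shows the call site. It assumes requests and
# BeautifulSoup, plus the Chapter class used by the download() snippets below.
import requests
from bs4 import BeautifulSoup

def makeChapter(self, link):
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    chapter = Chapter(soup.title.get_text())  # name the chapter after the page title
    chapter.set_content(str(soup.body))       # keep the raw body HTML as content
    return chapter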
def read_docs_folder(root):
    extensions = [
        'pdf', 'mobi', 'prc', 'txt', 'tpz', 'azw1', 'azw', 'manga',
        'azw2', 'azw3'
    ]
    file_paths = []
    for extension in extensions:
        file_paths += glob.glob(root + "/documents/**/*.{}".format(extension),
                                recursive=True)
    output = []
    for file_path in file_paths:
        e = Ebook(file_path)
        file_properties = {}
        file_properties["path"] = file_path
        file_properties["name"] = os.path.basename(file_path)
        file_properties["canonic_filename"] = e.path
        file_properties["hash"] = e.hash
        file_properties["asin"] = e.asin
        file_properties["processed"] = False
        output += [file_properties]
    return output
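# Example usage, as a sketch: the root path is an assumption; on a Kindle the
# documents folder typically sits under the USB mount point.
import json

docs = read_docs_folder("/mnt/us")
print(json.dumps(docs, indent=2))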
class TestSequenceFunctions(unittest.TestCase):

    def setUp(self):
        self.book = Ebook()

    def test_1_line(self):
        text = "one line of text"
        self.assertEqual(self.book.escape_text_to_html(text), text)

    def test_2_line(self):
        text = "one line of text\nsecond line"
        self.assertEqual(self.book.escape_text_to_html(text), text)

    def test_unicode(self):
        text = u"§"
        self.assertEqual(self.book.escape_text_to_html(text), "&#167;")

    def test_greater_than_and_less_than(self):
        text = "0 < 1 and 2 > 1"
        self.assertEqual(self.book.escape_text_to_html(text), "0 < 1 and 2 > 1")

    def test_html_tags(self):
        text = "<b>bold</b> plain text"
        self.assertEqual(self.book.escape_text_to_html(text), text)

    def test_more_html_tags(self):
        text = "<i>bold</i> plain text"
        self.assertEqual(self.book.escape_text_to_html(text), text)
        text = "<b>bold</b> ̀ plain text"
        self.assertEqual(self.book.escape_text_to_html(text), text)

    def test_escaped_chars_are_ignored(self):
        text = r"\&"
        self.assertEqual(self.book.escape_text_to_html(text), "&amp;")
        text = r"\<b\> <b> \<b\> \<b>"
        self.assertEqual(self.book.escape_text_to_html(text), "<b> <b> <b> <b>")
def create_collections_by_filesystem(root):
    all_dirs = glob.glob(root + "/documents/**/*/", recursive=True)
    toplevel_dirs = glob.glob(root + "/documents/*/")
    if all_dirs != toplevel_dirs:
        print("only a single level of nesting is currently supported")
        exit()
    collections = {}
    for directory in toplevel_dirs:
        # normpath to remove the trailing slash
        directory_name = os.path.basename(os.path.normpath(directory))
        collection_name = directory_name + "@en-US"
        collections[collection_name] = {"items": [], "lastaccess": 0}
        filepaths = glob.glob(directory + "/*")
        for filepath in filepaths:
            e = Ebook(filepath)
            collections[collection_name]["items"] += [e.fileident()]
    return collections
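# Example usage, as a sketch: assumes the standard Kindle layout, where the
# collections database lives at system/collections.json under the USB root.
import json
import os

root = "/mnt/us"  # assumed mount point
collections = create_collections_by_filesystem(root)
with open(os.path.join(root, "system", "collections.json"), "w") as f:
    json.dump(collections, f)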
def __init__(self, path, source_language, target_language, engine="Google"):
    self.source_language = source_language
    self.target_language = target_language
    self.ebook = Ebook(path)
    self.set_out_path()
    self.set_counter_path()
    self.set_start_point()
    self.translator = Translator(source_language, target_language, engine)
def run(self):
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL to download")
    parser.add_argument("--limit", type=int, default=0,
                        help="Max number of articles to download")
    parser.add_argument('--clean_cache', action='store_true')
    args = parser.parse_args()
    if not args.url:
        parser.print_help()
        exit(1)
    url = self._sanitize_url(args.url)
    if args.clean_cache:
        Cache(url).clean()
        return
    ebook = Ebook(url, args.limit, Cache(url))
    try:
        ebook.assemble()
        print("Wrote %s to %s" % (ebook.get_title(), ebook.get_filename()))
    except base.FilterNotFoundError as e:
        print("""
ERROR: Blook could not figure out how to parse {url}.

To add support for downloading this blog, please create an issue at
https://github.com/kchodorow/blook/issues with the following title:

    {msg} for {url}

Blook created a file called 'unparsable.html' in this directory, which
contains the HTML it didn't recognize. Please attach it to the GitHub issue.
""".format(url=url, msg=e.message))
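# Hypothetical wiring for the CLI above; the class that owns run() is not
# shown in this snippet, so the name App is assumed purely for illustration:
if __name__ == '__main__':
    App().run()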
def test_assemble(self):
    cache = TestCache()
    cache.put('https://www.example.com', self.INDEX)
    ebook = Ebook('https://www.example.com', 2, cache)
    ebook.assemble()
    self.assertEqual('Nhl', ebook.get_title())
    self.assertEqual('Nhl.epub', ebook.get_filename())
def rename_files(root):
    all_files = glob.glob(root + "/documents/**/*.*", recursive=True)
    for filepath in all_files:
        ext = os.path.splitext(filepath)[1]
        directory = os.path.dirname(filepath)
        e = Ebook(filepath)
        if e.author:
            new_filename = u"[{}]-{}".format(e.author, e.title)
        else:
            new_filename = e.title
        # make the filename appropriate for a FAT filesystem
        for char in ' /\\*?"\':|!':
            new_filename = new_filename.replace(char, "_")
        new_path = os.path.join(directory, new_filename + ext)
        os.rename(filepath, new_path)
def download():
    ebook = Ebook(title)
    for link in links:
        print(link)
        chapter_name = link.get_text()
        page = requests.get(urllib.parse.urljoin(BASE_URL, link['href']))
        soup = BeautifulSoup(page.content, 'html.parser', from_encoding="gb18030")
        # create chapter header
        chapter = Chapter(chapter_name)
        pageLink = soup.find('p', class_='pageLink')
        sub_pages = None
        if pageLink:
            sub_pages = pageLink.find_all('a', href=True)
            pageLink.extract()
        c = get_content(soup)
        # if there are sub pages, fetch each one and append its content
        if sub_pages:
            hrefs = [l['href'] for l in sub_pages]
            hrefs = list(set(hrefs))
            hrefs.sort()
            print(hrefs)
            for href in hrefs:
                sub_page = requests.get(urllib.parse.urljoin(BASE_URL, href))
                soup_link = BeautifulSoup(sub_page.content, 'html.parser',
                                          from_encoding="gb18030")
                sub_cont = get_content(soup_link)
                c.append(sub_cont)
        chapter.set_content(str(c))
        ebook.add_chapter(chapter)
    ebook.save()
def download():
    ebook = Ebook(title)
    for link in links:
        print(link)
        chapter_name = link.get_text()
        page = requests.get(urllib.parse.urljoin(BASE_URL, link['href']))
        soup = BeautifulSoup(page.content, 'html.parser', from_encoding="gb18030")
        # create chapter header
        chapter = Chapter(chapter_name)
        chapter_content = get_content(soup)
        chapter.set_content(chapter_content)
        ebook.add_chapter(chapter)
    ebook.save()
from ebook import Ebook

k = Ebook('Lalka', 'Bolesław Prus', 668)
k.open()
k.next_page()
k.next_page()
k.next_page()
k.next_page()
k.show_status()
k.close()
from ebook import Ebook

url = input('Enter the table-of-contents URL to download (https://www.zwdu.com/): ')
book = Ebook(url)
book.run()
import argparse

# imports reconstructed; module names assumed from the surrounding snippets
from ebook import Ebook
from translator import Translator

# (reconstructed head: the original snippet starts mid-argument, so the
# parser creation and the '--input' option name are inferred from args.input)
parser = argparse.ArgumentParser()
parser.add_argument('--input',
                    metavar='N', type=str, nargs='+',
                    help='path to infile txt')
parser.add_argument('--lang',
                    metavar='N', type=str, nargs='+',
                    help='path outfile html')
args = parser.parse_args()
assert args.input and args.lang, parser.description
path = args.input[0]
text = Ebook(path).sentences
translator = Translator(args.lang[0])
total = len(text)  # used by translate() below for progress reporting
enumerated_sentences = [{
    "position": position,
    "sentence": sentence
} for position, sentence in enumerate(text)]

def translate(sen_dict):
    global total
    translated = translator.translate(sen_dict["sentence"])
    print(round(sen_dict["position"] / total * 100, 2), "%")
    return {
        "position": sen_dict["position"],
        "original": sen_dict["sentence"],
        "translated": translated
    }
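# One straightforward way to apply translate() to every sentence. The original
# script is truncated at this point; a plain list comprehension is shown here,
# though the per-sentence progress printing hints the author may have used a
# worker pool instead.
results = [translate(d) for d in enumerated_sentences]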