def run(self):
    """Parse command-line arguments and assemble an ebook from the given URL.

    Supports ``--limit`` to cap the number of downloaded articles and
    ``--clean_cache`` to wipe the cache for the URL instead of downloading.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="URL to download")
    parser.add_argument("--limit", type=int, default=0,
                        help="Max number of articles to download")
    parser.add_argument('--clean_cache', action='store_true')
    args = parser.parse_args()
    if not args.url:
        parser.print_help()
        exit(1)
    url = self._sanitize_url(args.url)
    if args.clean_cache:
        Cache(url).clean()
        return
    ebook = Ebook(url, args.limit, Cache(url))
    try:
        ebook.assemble()
        print("Wrote %s to %s" % (ebook.get_title(), ebook.get_filename()))
    # Fixed: `except X, e` is a SyntaxError in Python 3, and Py3 exceptions
    # have no `.message` attribute — bind with `as` and format str(e).
    except base.FilterNotFoundError as e:
        print("""
ERROR: Blook could not figure out how to parse {url}.
To add support for downloading this blog, please create an issue at
https://github.com/kchodorow/blook/issues with the following title:

{msg} for {url}

Blook created a file called 'unparsable.html' in this directory, which contains
the HTML it didn't recognize. Please attach it to the GitHub issue.
""".format(url=url, msg=str(e)))
def test_redirect(self):
    """Landing on a non-index page should redirect to the /blog listing."""
    fake_cache = TestCache()
    fake_cache.put('https://www.example.com', self.NON_INDEX)
    fake_cache.put('https://www.example.com/blog', self.INDEX)
    book = Ebook('https://www.example.com', 10, fake_cache)
    book.assemble()
    self.assertEqual('Nhl', book.get_title())
    self.assertEqual('Nhl.epub', book.get_filename())
def test_no_redirect(self):
    """assemble() must raise ListingNotFoundError when no listing exists."""
    cache = TestCache()
    cache.put('https://www.example.com/blog', self.NON_INDEX)
    ebook = Ebook('https://www.example.com/blog', 10, cache)
    try:
        ebook.assemble()
        self.fail('Expected ebook.assemble to throw')
    # Fixed: `except X, expected` is a SyntaxError in Python 3 and
    # `.message` no longer exists — use `as` binding and str(e).
    except ListingNotFoundError as expected:
        self.assertEqual('No listing filter found', str(expected))
def __init__(self, path, source_language, target_language, engine="Google"):
    """Set up a book-translation session.

    Args:
        path: filesystem path to the ebook to translate.
        source_language: language of the original text.
        target_language: language to translate into.
        engine: translation backend name (default "Google").
    """
    self.source_language = source_language
    self.target_language = target_language
    self.ebook = Ebook(path)
    # Derive output path, progress-counter path, and resume point
    # (order may matter — each presumably builds on prior state; see helpers).
    self.set_out_path()
    self.set_counter_path()
    self.set_start_point()
    self.translator = Translator(source_language, target_language, engine)
def download():
    """Download every chapter linked from the index page and save the ebook.

    Relies on module-level `title`, `links`, and `BASE_URL`. Chapters split
    across multiple sub-pages are stitched back together in href order.
    """
    ebook = Ebook(title)
    for link in links:
        print(link)
        chapter_name = link.get_text()
        page = requests.get(urllib.parse.urljoin(BASE_URL, link['href']))
        soup = BeautifulSoup(page.content, 'html.parser', from_encoding="gb18030")
        # Create chapter header.
        chapter = Chapter(chapter_name)
        # The pagination block lists the sub-pages; strip it out of the content.
        page_link = soup.find('p', class_='pageLink')
        sub_pages = None
        if page_link:
            sub_pages = page_link.find_all('a', href=True)
            page_link.extract()
    
        content = get_content(soup)
        # If there are sub-pages, append each one's content in order.
        if sub_pages:
            # Fixed: deduplicate and sort in one expression instead of
            # list(set(...)) followed by an in-place sort; also renamed the
            # ambiguous loop variable `l`.
            hrefs = sorted({anchor['href'] for anchor in sub_pages})
            print(hrefs)
            for href in hrefs:
                sub_page = requests.get(urllib.parse.urljoin(BASE_URL, href))
                sub_soup = BeautifulSoup(sub_page.content, 'html.parser',
                                         from_encoding="gb18030")
                content.append(get_content(sub_soup))
        chapter.set_content(str(content))
        ebook.add_chapter(chapter)
    ebook.save()
def create_collections_by_filesystem(root):
    """Build a collections dict from the top-level directories under
    ``root/documents``.

    Each directory becomes one ``<name>@en-US`` collection whose items are
    the file idents of the ebooks inside it. Only a single level of nesting
    is supported; the process exits non-zero otherwise.
    """
    all_dirs = glob.glob(root + "/documents/**/*/", recursive=True)
    toplevel_dirs = glob.glob(root + "/documents/*/")
    if all_dirs != toplevel_dirs:
        print("only single level of nesting supported currently")
        # Fixed: bare exit() reported success (status 0) on this error path.
        exit(1)
    collections = {}
    for directory in toplevel_dirs:
        # normpath to remove trailing slash
        directory_name = os.path.basename(os.path.normpath(directory))
        collection_name = directory_name + "@en-US"
        items = []
        for filepath in glob.glob(directory + "/*"):
            items.append(Ebook(filepath).fileident())
        collections[collection_name] = {"items": items, "lastaccess": 0}
    return collections
def download():
    """Fetch each linked chapter page and assemble the chapters into an ebook."""
    book = Ebook(title)
    for anchor in links:
        print(anchor)
        response = requests.get(urllib.parse.urljoin(BASE_URL, anchor['href']))
        parsed = BeautifulSoup(response.content, 'html.parser',
                               from_encoding="gb18030")
        # One chapter per index link, titled with the link text.
        chapter = Chapter(anchor.get_text())
        chapter.set_content(get_content(parsed))
        book.add_chapter(chapter)
    book.save()
def rename_files(root):
    """Rename every ebook under ``root/documents`` to ``[Author]-Title.ext``
    (or just ``Title.ext`` when no author is set), sanitized for FAT
    filesystems.
    """
    # One C-level translate pass replaces ten chained .replace() calls.
    fat_unsafe = str.maketrans({c: "_" for c in " /\\*?\"':|!"})
    all_files = glob.glob(root + "/documents/**/*.*", recursive=True)
    for filepath in all_files:
        ext = os.path.splitext(filepath)[1]
        directory = os.path.dirname(filepath)
        e = Ebook(filepath)
        if e.author:
            new_filename = u"[{}]-{}".format(e.author, e.title)
        else:
            new_filename = e.title
        # Make the filename appropriate for a FAT filesystem.
        new_filename = new_filename.translate(fat_unsafe)
        new_path = os.path.join(directory, new_filename + ext)
        os.rename(filepath, new_path)
# Demo: open "Lalka" by Bolesław Prus (668 pages), advance four pages,
# report reading progress, and close the reader.
from ebook import Ebook

reader = Ebook('Lalka', 'Bolesław Prus', 668)
reader.open()
for _ in range(4):
    reader.next_page()
reader.show_status()
reader.close()
def setUp(self):
    """Load the English test fixture into a fresh Ebook before each test."""
    fixture = os.path.join(THIS_DIR, "test_data", "zen_en.txt")
    self.ebook = Ebook(fixture)
def setUp(self):
    """Create a fresh, empty Ebook before each test."""
    self.book = Ebook()
metavar='N', type=str, nargs='+', help='path to infile txt') parser.add_argument('--lang', metavar='N', type=str, nargs='+', help='path outfile html') args = parser.parse_args() assert args.input and args.lang, parser.description path = args.input[0] text = Ebook(path).sentences translator = Translator(args.lang[0]) enumerated_sentences = [{ "position": position, "sentence": sentence } for position, sentence in enumerate(text)] def translate(sen_dict): global total translated = translator.translate(sen_dict["sentence"]) print(round(sen_dict["position"] / total * 100, 2), r"%") return { "position": sen_dict["position"], "original": sen_dict["sentence"], "translated": translated
# Interactive entry point: prompt for a zwdu.com table-of-contents URL
# and download the whole book.
from ebook import Ebook

target_url = input('请输入需要下载的目录链接(https://www.zwdu.com/): ')
Ebook(target_url).run()