def download_news(request, url, newsname):
    """Download the news page at *url* as an EPUB and record it for the user.

    :param request: Django request object; used to resolve the user token
        via ``get_ACCESS_TOKEN``.
    :param url: URL of the news page to convert.
    :param newsname: Name of the news item; used as the EPUB title and as
        part of the on-disk filename.
    :return: A rendered ``test6.html`` response carrying a status message.
    """
    print('进入新闻页下载的处理函数')
    from_user_name = get_ACCESS_TOKEN(request)

    # newsname may arrive as bytes in some call paths (the original called
    # .decode() unconditionally) — normalise to str for the EPUB title.
    title = newsname.decode() if isinstance(newsname, bytes) else newsname
    epub = pypub.Epub(title)
    epub.add_chapter(pypub.create_chapter_from_url(url))
    epub.create_epub('download/' + from_user_name)

    # Renamed from `list` to avoid shadowing the builtin.
    existing = User.objects.filter(username=from_user_name, newsname=newsname)
    print(existing)
    if len(existing) != 0:
        data = '该新闻已被保存,请选择其它新闻。'
        return render(request, 'test6.html', {'data': data})

    path = 'download' + '/' + from_user_name + '/' + newsname + '.epub'
    print(path)
    # EPUB files are zip archives: read as binary.  The original opened the
    # file in text mode and then called str.decode('latin-1'), which raises
    # AttributeError on Python 3; hashlib.md5 needs raw bytes anyway.
    with open(path, 'rb') as f:
        news_data = f.read()
    m1 = hashlib.md5()
    m1.update(news_data)
    token = m1.hexdigest()
    User.objects.create(username=from_user_name, newsname=newsname,
                        status=False, md5=token)
    data = '该新闻页保存成功'
    return render(request, 'test6.html', {'data': data})
def open_file(singlelined=()):
    """Bundle every .txt file in the CWD (except patterns.txt) into an EPUB.

    :param singlelined: names of files whose text is one solid wall with no
        paragraph breaks (e.g. some protected Google Drive docs); these are
        parsed by looking for punctuation at the end of sentences.
    :return: The EPUB filename, derived from the first chapter file.
    """
    del_file()
    # Chapter files, in numeric order, skipping the pattern definitions.
    chapter_files = sorted(
        (name for name in os.listdir(os.getcwd())
         if name.endswith(".txt") and name != "patterns.txt"),
        key=extract_num)

    book = pypub.Epub(chapter_files[0])
    for name in chapter_files:
        # singleline=True switches the parser to punctuation-based splitting.
        parse_file(name, singleline=name in singlelined)
        # Each processed .txt file becomes one chapter.
        create_epub_ch(book, name)

    book.create_epub(os.getcwd())
    return chapter_files[0] + ".epub"
def main(args):
    """Scrape a multi-part story page into a single EPUB.

    :param args: parsed CLI namespace with ``url``, ``debug`` and ``output``.
    """
    url = args.url
    log.info("Using URL: {}".format(url))
    response = requests.get(url)
    content = response.content
    if args.debug:
        # Cache the raw page next to the script so repeated debug runs do
        # not hammer the server.  The original wrote `with (name, 'w+')` —
        # a tuple, not a file object — which raised AttributeError, and the
        # read branch replaced `response` with a str it later dereferenced
        # as `.content`.  Work on a plain `content` bytes value instead.
        cache_name = url.split('/')[-1]
        if not os.path.exists(cache_name):
            with open(cache_name, 'wb') as f:
                f.write(content)
        else:
            with open(cache_name, 'rb') as f:
                content = f.read()
    soup = BeautifulSoup(content, 'html5lib')
    title = soup.find('h1').text
    log.info("Title is: {}".format(title))
    # First <h5> looks like "by <name>"; strip the nbsp and the "by " prefix.
    author = soup.find_all('h5')[0].text.strip().replace(u'\xa0', u' ')[3:]
    log.info("Author is: {}".format(author))
    # Chapter headers; the final alert div is not a chapter, so drop it.
    chapter_titles = [
        x.text.strip() for x in soup.findAll(
            'div', attrs={"class": "alert alert-info xy_alertheader"})
    ][:-1]
    for e in chapter_titles:
        # Log the str directly: str(e.encode('utf-8')) would print b'...'
        log.info("Found Chapter: {}".format(e))
    log.info("Number of chapters found: {}".format(len(chapter_titles)))
    # Plain formatting — str(title.encode('utf-8')) embedded "b'...'" in the
    # book name on Python 3.
    title_string = "{} - by {}".format(title.strip(), author)
    log.info('Book name is: {}'.format(title_string))
    epub = pypub.Epub(title_string, creator=author)
    for num, z in enumerate(
            soup.findAll('div', attrs={"class": "xy_partbg p-4"}), start=0):
        try:
            assert chapter_titles[num]
            # TODO: URL is no longer correct, need to find method to pull from page
            log.info('Adding: {}#Part_{}'.format(url, num))
            if num == 0:
                # The first part carries an overlay drop-cap letter; keep
                # only the inner text container so it isn't duplicated.
                body = str(z.find('div', attrs={'class': 'xy_overlaytext'}))
            else:
                body = str(z)
            c = pypub.create_chapter_from_string(
                "<h1>Part {}</h1>".format(num + 1) + body,
                title=chapter_titles[num])
            epub.add_chapter(c)
        except ValueError as e:
            raise ValueError(e)
        except IndexError:
            # More part divs than chapter headers: ignore the extras.
            pass
    if args.output:
        output = os.path.expanduser(args.output)
    else:
        output = os.getcwd()
    # The original passed title_string.replace(" ", " ") — a no-op replace
    # of a space with a space — dropped here.
    epub.create_epub(output, epub_name=title_string)
def collect(self, path: str, no_cache: bool = False) -> str:
    """
    Scrapes a novel to a given output directory.

    :param path: The output directory.
    :param no_cache: Whether to force rescraping of cached chapters.
    :return: The name of the EPUB file.
    """
    epub = pypub.Epub(
        self.metadata["title"],
        creator=", ".join(self.metadata["authors"] + self.metadata["artists"]),
        language=", ".join(self.metadata["languages"]),
        rights=self.metadata["licensed"],
        publisher=", ".join(self.metadata["original_publishers"] +
                            self.metadata["english_publishers"]))

    # Group chapters by translator so that, when several translators cover
    # the same chapter name, the most prolific translator's version wins.
    translator_dict = defaultdict(list)
    for i, p in enumerate(self.chapter_list):
        translator_dict[p.translator].append((i, p))
    chapter_data = []
    seen_names = set()
    for t in sorted(translator_dict.values(), key=len, reverse=True):
        for i, p in t:
            if p.name not in seen_names:
                chapter_data.append((i, p))
                seen_names.add(p.name)
    # Restore original chapter order after the per-translator selection.
    chapter_data.sort(key=lambda x: x[0])

    with ThreadPool(THREAD_COUNT) as p:
        pages = p.map(lambda x: x[1].get(no_cache=no_cache), chapter_data)
    for c in pages:
        epub.add_chapter(c)

    # TODO: Submit PR to pypub fork and replace this atrocious workaround
    # pypub opens its output files without an explicit encoding; force
    # UTF-8 by temporarily swapping the builtin open().
    old_open = open

    def new_open(*args, **kwargs):
        utf8_open = partial(old_open, encoding="utf-8")
        try:
            return utf8_open(*args, **kwargs)
        except ValueError:
            # Binary modes reject an encoding argument; fall back unchanged.
            return old_open(*args, **kwargs)

    builtins.open = new_open
    try:
        epub.create_epub(path)
    finally:
        # Always restore the real open(), even if EPUB creation raises —
        # the original left the monkeypatch in place on failure.
        builtins.open = old_open
    return f"{epub.title}.epub"
def _create_epub_single(files, output, title):
    """Build one EPUB called *title* in *output* from the given chapter files.

    Each file becomes a chapter whose title is the filename without its
    extension.  Creator/publisher are anonymous; rights carry a timestamp.
    """
    import pypub

    rights = now()  # rights field is stamped with the current time
    print('Creating epub "%s" include %s chapters' % (title, len(files)))
    book = pypub.Epub(title,
                      creator="Anonymous",
                      language='cn',
                      rights=rights,
                      publisher='Anonymous')
    for path in files:
        chapter_title = os.path.splitext(os.path.basename(path))[0]
        book.add_chapter(pypub.create_chapter_from_file(path, chapter_title))
    book.create_epub(output, epub_name=title)
from bs4 import BeautifulSoup
import os
import pypub

# Book assembled incrementally by the functions below.
epub = pypub.Epub('Site Reliability Engineering')


def setup_toc():
    """Walk the local SRE-book index page and add each linked chapter."""
    soup = BeautifulSoup(open('./html/index.html'), 'html.parser')
    links = soup.select('.content a ')
    for link in links:
        print(link['href'])
        add_chapter_file(link['href'], link.get_text())
    epub.create_epub(os.path.abspath('./build'))


def add_chapter_file(href, title):
    """Load one chapter's local HTML file and append it to the EPUB.

    :param href: site-style link (``/sre/book/...``) mapped to ``html/...``.
    :param title: chapter title shown in the table of contents.
    """
    file_path = href.replace('/sre/book/', 'html/')
    with open(file_path, 'r') as f:
        contents = f.read()
    chapter_soup = BeautifulSoup(contents, 'html.parser').select_one('.content')
    # Rewrite intra-book links to be relative.  BeautifulSoup has no
    # `select_all` (the original raised AttributeError), and `link.href`
    # reads a child tag, not the attribute — use item access instead.
    for link in chapter_soup.select('a'):
        if link.has_attr('href'):
            link['href'] = link['href'].replace('/sre/book/chapters/', '')
    # The original referenced an undefined `chapter_html`; serialize the
    # pruned soup instead.
    chapter = pypub.create_chapter_from_string(
        str(chapter_soup), url=None, title=title)
    epub.add_chapter(chapter)
# CLI entry point (interactive variant).  Depending on flags on the global
# `args` namespace it either: scrapes everything (--print_date debug /
# default), converts one user-supplied URL at a time into a PDF (pdfkit) or
# EPUB (pypub) under --one with an interactive retry/skip/quit loop on
# KeyboardInterrupt, follows an RSS feed (--feed / default non-all mode via
# download()), or fetches a single year/month (--single).
# NOTE(review): relies on many module globals (args, epub_dir, temp_dir_ext,
# scrape, download, process_url, import_pypub, rm_tmp_files) not visible
# here — TODO confirm against the rest of the file.  Source arrived with
# newlines collapsed; code kept byte-identical below.
def main(): global epub_dir if args.url: url = args.url else: url = input('URL: ').strip() url = process_url(url) #if url.endswith('.html'): #no point do like that for -f and it will need .html for -a, so don't do this # url = "/".join(url.split('/')[:-1]) parsed_uri = urlparse(url) netloc = '{uri.netloc}/'.format(uri=parsed_uri) d_name = slugify(unicode(netloc)) if args.pdf: if (not args.one) and (not os.path.isdir(d_name)): os.makedirs(d_name) ext = '.pdf' else: ext = '.epub' if args.print_date: print('Debugging\n') scrape(url, d_name, ext) elif args.one: d_name = d_name.strip() if args.pdf: fname = d_name + ext else: #.epub will auto suffix fname = d_name + ext fpath = os.path.join(os.getcwd(), fname) while os.path.exists(fpath): fname = d_name + '_' + str(int(time.time())) + ext fpath = os.path.join(os.getcwd(), fname) try: if args.pdf: print('Create single pdf: ' + fpath) pdfkit.from_url(url, fpath) else: import_pypub() tmp_dir = d_name + temp_dir_ext my_epub = pypub.Epub(fname[:-5], epub_dir=tmp_dir) print('Create single epub: ' + fpath) while True: try: print('\n[' + datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S") + '] Trying url: ' + url) epub_dir = os.path.join(os.getcwd(), tmp_dir) try: my_chapter = pypub.create_chapter_from_url(url) my_epub.add_chapter(my_chapter) my_epub.create_epub(os.getcwd()) rm_tmp_files() except ValueError as ve: #https://pikachu.com is an invalid url or no network connection print(ve) reply = input( '\nPaste next <url> OR type \'n\' to exit: ' ).strip() if (reply and reply[0].lower() != 'n'): url = process_url(reply) else: break except IOError as ioe: #should allow next url if requests.get() in pypub's chapter.py timeout print("\nIOError but still allow goto next chapter", ioe) except KeyboardInterrupt: #If you paste all links in once, then this need some time to trigger, but then next url only able to run one url since all the rest url get flush after KeyboardInterrupt, you can just find by url in link page and then 
copy/paste the remaining urls. reply = input( '\n[' + datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S") + '] [r]etry OR [s]kip to next url OR [q]uit ? [r/s/q] ' ).strip() #or ctrl+c again also can exit if reply: if reply == 's': reply = input( '\nPaste next <url> OR type \'n\' to exit: ' ).strip() if (reply and reply[0].lower() != 'n'): url = process_url(reply) else: break elif reply == 'q': break #else #continue/retry except IOError as ioe: print("IOError --one: ", ioe) elif not args.all: print('Download in rss feed mode') if args.feed: url = args.feed #else: shouldn't do like that, it should depends on later scrape the rss link in webpage, or else https://blog.mozilla.org/security/ not working # url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri) + 'feeds/posts/default?start-index=1&max-results=25' while url: url = download(url, url, d_name, ext) elif args.single: print('Download single year/month in website mode') download(url, url, d_name, ext) else: print('Download all in website mode') scrape(url, d_name, ext) print("\nDone")
# Core fetch-one-page-of-posts routine shared by the RSS-feed and
# blog-archive (--all) modes.  In --all mode it scrapes Blogger's
# getTitles widget endpoint; otherwise it parses the feed with feedparser,
# auto-discovering the rss/atom link from the HTML when the first parse
# yields no entries.  For each entry it builds an HTML fragment `h`
# (author/date/title header + summary + media images) and emits either a
# PDF via pdfkit or an EPUB chapter via pypub, de-duplicating output
# filenames with timestamps.  Returns the next page's URL ('' when done) —
# the caller loops on that value in RSS mode.
# NOTE(review): mutates module globals (download_once, init_url_once,
# img_css_style, my_epub, epub_dir) and mixes Python 2/3 branches via
# sys.version_info; statement order is load-bearing, so the code is kept
# byte-identical below (source arrived with newlines collapsed).
def download(url, h, d_name, ext): global download_once global init_url_once global img_css_style global my_epub global epub_dir if not args.pdf: import_pypub() #e.g. 'https://diannaoxiaobai.blogspot.com/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=https://diannaoxiaobai.blogspot.com/2018/' visit_link = url orig_url = url if args.all: y_url = url + "/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=" + h print("Scraping year... " + y_url) try: r = urlopen(y_url).read() except HTTPError as he: print( '\nNote that -a -s only allow if url has /year/[month] format, pls check your url\n' ) clean_up() os._exit(1) if sys.version_info[0] >= 3: r = r.decode('utf-8') t = r.split("'title'") t = t[1:] else: url = process_rss_link(url) print("Scraping rss feed... " + url) r = feedparser.parse( url ) #, request_headers={'User-Agent': UA, 'Referer': url}) #I noticed https://blog.mozilla.org/security/feed/1 (/1 non exist) is working in feedparser, lolr #print(r.headers) t = r['entries'] #if (not t) or ("link" not in r['feed'].keys()): #if got entries then whe need retry ? no need check link if (not init_url_once) and ( not t ): #'User does not have permission to read this blog.' of rss feed come here init_url_once = True #parsed_url = urlparse(url) #if not '{uri.netloc}'.format(uri=parsed_url).endswith('wordpress.com'): try: print("Try to scrape rss feed url automatically ... " + orig_url) ##r = urlopen(orig_url).read() #https://medium.com/bugbountywriteup got check UA if urllib2 UA then not authorized if sys.version_info[0] >= 3: req = urllib.request.Request(orig_url, data=None, headers={'User-Agent': UA}) r = urllib.request.urlopen(req).read() else: req = urllib2.Request(orig_url, headers={'User-Agent': UA}) r = urllib2.urlopen(req).read() except Exception as e: print(e) print( "Request webpage failed, please check your network OR authorized to access that url." 
) clean_up() os._exit( 1 ) #don't use sys.exit(-1) if don't want to traceback to main() to print exception soup = BeautifulSoup(r, "lxml") data = soup.findAll('link', attrs={'type': 'application/rss+xml'}) if not data: #https://github.com/RSS-Bridge/rss-bridge/issues/566 only has atom data = soup.findAll('link', attrs={'type': 'application/atom+xml'}) if not data: data = soup.findAll('a', attrs={'href': '/rss/' }) #https://blog.google/products/ if data: url = data[0].get("href") url = process_rss_link(url) if url.startswith( '/' ): #http://sectools.org/tag/sploits/ only has href="/feed/" parsed_orig_uri = urlparse(orig_url) url = '{uri.scheme}://{uri.netloc}'.format( uri=parsed_orig_uri) + url print("Scraping rss feed one more time ... " + url) r = feedparser.parse(url) t = r['entries'] if not t: t = [] else: t = [] else: #unlike blogspot, wordpress always got t, so need set true here init_url_once = True parsed_url = urlparse(url) is_wordpress = '{uri.netloc}'.format( uri=parsed_url).endswith('wordpress.com') if not is_wordpress: #only check next if 1st check is False, or lese 2nd check override 1st result try: if 'keys' in dir(r): is_wordpress = r.get('feed', {}).get( 'generator', '').startswith('https://wordpress.org/') except Exception as e: print('parse generator error', e) if is_wordpress and t: #increment paged only if current page got entries, i.e. t #parsed_keys = urlparse.parse.parse_qs(parsed_url.query) #my python 2 don't have parse_qs if 'paged=' in parsed_url.query: wp_paged_v = int( parsed_url.query[parsed_url.query.rindex('paged=') + len('paged='):]) #uri.path default prefix with '/' if not empty, so don't set '/' after netloc or else keep increase '////...' 
in each page url = '{uri.scheme}://{uri.netloc}{uri.path}?'.format( uri=parsed_url) + parsed_url.query.replace( 'paged=' + str(wp_paged_v), 'paged=' + str(wp_paged_v + 1)) else: url = '' print('no next') elif ("keys" in dir(r)) and ('link' in r['feed'].keys()): l = r['feed']['links'] if l: got_next = False for ll in l: if ll['rel'] == 'next': #if ll['href'] != url: #don't have next link is same case to test url = ll['href'] got_next = True break if not got_next: url = '' else: url = '' elif not t: #no need care if next page rss index suddenly change and no content case url = '' print_rss_err() count = 0 for tt in t: count += 1 title_raw = '' title_is_link = False if not args.all: #e.g. parser.parse('2012-12-22T08:36:46.043-08:00').strftime('%B %d, %Y, %H:%M %p') h = '' #https://github.com/RSS-Bridge/rss-bridge/commits/master.atom only has 'updated' post_date = tt.get('published', tt.get('updated', '')) t_date = '' try: if args.locale: if sys.version_info[0] >= 3: t_date = parse_locale(post_date) else: t_date = parse_locale(post_date).decode('utf-8') else: t_date = date_parser.parse(post_date).strftime( '%B %d, %Y, %H:%M %p') except ValueError: #Unknown string format, e.g. 
https://www.xul.fr/en-xml-rss.html got random date format such as 'Wed, 29 Jul 09 15:56:54 0200' t_date = post_date for feed_links in tt['links']: if feed_links['rel'] == 'alternate': visit_link = feed_links['href'] title_raw = tt['title'].strip() title_pad = title_raw + ' ' if (not args.pdf) or ( not tt['title'] ): #epub got problem copy link from text, so epub always shows link tt['title'] = visit_link title_is_link = True if args.pdf: #pdf with img css causes image not appear at all img_css_style = '' author = tt.get('author_detail', {}).get('name') if not author: author = tt.get('site_name', '') #https://blog.google/rss/ h = '<div><small>' + author + ' ' + t_date + '<br/><i>' + title_pad + '<a style="text-decoration:none;color:black" href="' + visit_link + '">' + tt[ 'title'] + '</a></i></small><br/><br/></div>' + img_css_style #<hr style="border-top: 1px solid #000000; background: transparent;"> media_content = '' try: if 'media_content' in tt: #wordpress/blog.google got list of images with link, e.g. darrentcy.wordpress.com for tm in tt['media_content']: #pitfall: python 3 dict no has_key() attr if ('medium' in tm) and (tm['medium'] == 'image') and 'url' in tm: media_content += '<img src="' + tm['url'] + '" >' #media_content += '<img style="display: block; max-height: 100%; max-width: 100%" src="' + tm['url'] + '" >' #[UPDATE] shouldn't do like that, since thumbnails of feeds normally duplicated with feed without media_content #... which seems act as single thumbnail on webpage scraping metadata usage only. #... and seems like https://gigaom.com/feed/ thumbnail is not showing in webpage. 
#elif 'media_thumbnail' in tt: #https://gigaom.com/feed/ only has thumbnail # for tm in tt['media_thumbnail']: # if 'url' in tm: # media_content += '<img src="' + tm['url'] + '" >' except Exception as e: print(e) print('parse media error') #pdfkit need specific charset, epub seems no need if args.pdf: #just now got 1 post shows blank but got div in feed, then noticed it's white color font, lol h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt[ 'summary'].replace( '<div class="separator"', '<div class="separator" align="center" ' ) + media_content + '</div></body>' #h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt['summary'].replace('<br /><br /><br />', '<br />') + media_content + '</div></body>' else: #epub can't set body/head #h_soup = BeautifulSoup(tt['summary'], "lxml") #for pre in h_soup.find_all('pre'): # print("pre: ", pre) #h = h + '<div align="center">' + tt['summary'].replace('<div class="separator"', '<div class="separator" align="center" ') + media_content + "</div>" #no need do replace anymore since the align center should control by global <div> h = h + '<div align="center">' + tt['summary'].replace( '<br /><br /><br />', '<br />') + media_content + "</div>" #h = h + '<div align="center">' + tt['summary'] + media_content + "</div>" #h = h + tt['summary'] + media_content title = tt['title'] t_url = visit_link else: field = tt.split("'") title = field[1] title_raw = title.strip() t_url = field[5] print('\ntitle: ' + title_raw) print('link: ' + t_url) if args.pdf: print('Download html as PDF, please be patient...' + str(count) + '/' + str(len(t))) else: print('Download html as EPUB, please be patient...' 
+ str(count) + '/' + str(len(t))) if args.pdf: if title_is_link: #else just leave slash with empty title = '/'.join(title.split('/')[-3:]) if sys.version_info[0] >= 3: fname = os.path.join(d_name, slugify(unicode(title))) else: print(title) try: title = title.decode('utf-8') except: pass #print('calm down, is normal decode error') title = replacer(title) #fname = os.path.join( d_name, slugify(title.decode('utf-8'))) fname = os.path.join(d_name, slugify(title)) else: #no point do set fname based on title since epub is single file only with multiple chapters fname = d_name fpath = os.path.join(os.getcwd(), fname) if args.pdf: check_path = os.path.join(fpath + ext) else: check_path = fpath[:-1] + ext if (not download_once) and os.path.exists(check_path): if args.pdf: fpath = fpath + '_' + str(int(time.time())) + ext else: fname = fname[:-1] + ' ' + str(int( time.time())) #pypub truncated _, so can't use '_' else: if args.pdf: fpath += ext else: fpath = fpath[:-1] + ext fname = fname[:-1] if args.pdf: print("file path: " + fpath) #pdf = weasyprint.HTML(t_url).write_pdf() #file( d_name + "/" + slugify(unicode(title)) + ".pdf", 'w' ).write(pdf) if args.all: try: pdfkit.from_url(t_url, fpath) except IOError as ioe: print("pdfkit IOError") else: try: #https://security.googleblog.com/2013/10/dont-mess-with-my-browser.html site can't open in kchmviewer bcoz of this #, which you direct unzip .EPUB and open that xhtml will got error #-f 'https://security.googleblog.com/feeds/posts/default?start-index=179&max-results=1' direct jump to desired index to test #rf: https://www.w3.org/wiki/Common_HTML_entities_used_for_typography #narrow down OEBPS/toc.nc by removing list of items, then download by index+repack+<open_in_web_browser_OR_kchmviewer> above to know which portion of items trigger the xml error #got case toc.nc itself contains '&' which must replace with `&` h = replacer(h) pdfkit.from_string(h, fpath) except IOError as ioe: print('Exception IOError: ' + repr(ioe)) else: 
if not download_once: download_once = True print("file path: " + fpath) if os.path.exists(fname + temp_dir_ext): print( fname + temp_dir_ext + " already exists, please move/backup that direcory to another place manually. Abort" ) #to not blindly replace file os._exit(1) tmp_dir = fname + temp_dir_ext my_epub = pypub.Epub(fname, epub_dir=tmp_dir) epub_dir = os.path.join(os.getcwd(), tmp_dir) print("epub_dir: " + epub_dir) if title_raw: try: title = title.decode('utf-8') except: pass try: #fixed -as http://miniechung1998.blogspot.com/2012/12/xd-xd.html title_raw = title_raw.decode('utf-8') except: pass title_raw = replacer(title_raw).replace('&', '&').replace( '<', '<' ).replace( '>', '>' ) #unlike content, title can replace '&'(no space) like that since & may no space #, if content do like that will got no image, got visible text ...etc if args.all: if title_raw: my_chapter = pypub.create_chapter_from_url(title=title_raw, url=t_url) else: #no choice like that and better not set with t_url, use other editor if kchmviewer error, should unlikely happen though my_chapter = pypub.create_chapter_from_url(t_url) #print(my_chapter.content) #my_chapter.content = replacer(my_chapter.content) my_chapter.title = replacer(my_chapter.title) #sigil viewer will warning and auto convert for you, e.g. /<img> become </>, replace <!DOCTYPE html> to <?xml version="1.0" encoding="utf-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">, Add <title></title> ...etc, this is normal and shouldn't have extra work to do, while kchmviewer able to render it without error. #try: # my_chapter.content = my_chapter.content.decode('utf-8') #except: # pass #print("decode content err") # # The correct way to replace, you can't direct `my_chapter.content = 'xxx'` and expect it take effect ! 
#my_chapter._content_tree = BeautifulSoup(my_chapter.content, 'html.parser') try: my_chapter.title = my_chapter.title.decode('utf-8') except: #-a http://cuhkt48.blogspot.com/2016/07/blog-post.html pass #print("decode title err") else: #h = replacer(h) #'https://www.blogger.com/feeds/1176949257541686127/posts/default?start-index=251&max-results=25' -> https://security.googleblog.com/2009/03/reducing-xss-by-way-of-automatic.html got <prev> and body, so don't blindly unescape all #might need filter by pre and allow other to replace, need to test more to know got error or not without replace if title_raw: my_chapter = pypub.create_chapter_from_string( h, title=title_raw, url=t_url) else: my_chapter = pypub.create_chapter_from_string( h, title='/'.join(title.split('/')[-3:]), url=t_url) #print(my_chapter.content) #my_chapter = pypub.create_chapter_from_string(r['entries'][0]['summary'].replace('<div class="separator"', '<div class="separator" align="center" ')) my_epub.add_chapter(my_chapter) my_epub.create_epub(os.getcwd()) rm_tmp_files() return url #return value used for rss feed mode only
def generate_epub(name, category="", start=None, end=None):
    """Scrape highscalability.com posts into an EPUB.

    Walks the first three list pages of *category*, keeps posts published
    in ``[start, end)`` (ISO ``YYYY-MM-DD`` strings; ``start`` inclusive,
    ``end`` exclusive), fetches each post's full page, and writes an EPUB
    named *name* into the current working directory, oldest post first.
    """
    # Keyed by (iso_date, title) so same-day posts don't overwrite each
    # other (the original keyed on the date alone); sorting the items still
    # yields chronological order.
    posts = {}
    for page in range(1, 4):
        sleep_for_a_while()
        r = requests.get(
            'http://highscalability.com/blog/category/%s?currentPage=%s'
            % (category, page))
        soup = BeautifulSoup(r.text, 'html.parser')
        for post in soup.select(".journal-entry"):
            post_link = post.select(".journal-entry-text h2 a")[0]
            post_date = post.select(".journal-entry-float-date")[0]

            # Collect the HREF.
            # Note: the link on the list page points at the full post,
            # which we fetch below for the complete text.
            href = post_link.attrs['href']
            if not href.startswith("http://highscalability.com"):
                href = "http://highscalability.com%s" % href

            # Collect the title
            title = post_link.get_text()
            if not title:
                print("Fail to find the title: %s" % post)

            # Parse the date.  Ex: "December 16, 2016" — the original
            # pattern "%b%d%Y" cannot match that text and raised
            # ValueError; "%B %d, %Y" matches the documented example.
            date_text = post_date.get_text()
            conv = time.strptime(date_text, "%B %d, %Y")
            date_en = time.strftime("%Y-%m-%d", conv)  # Ex: 2016-12-16
            print(date_en)

            # Filter according to the dates
            if start and date_en < start:
                continue
            if end and date_en >= end:
                continue
            print("Processing post %s (%s)" % (title, date_en))

            # List pages contain only the beginning of the posts.
            # We need to retrieve each post page to get the full text.
            sleep_for_a_while()
            r = requests.get(href)
            if r.status_code != 200:
                print("Error: Unable to retrieve blog post content: %s"
                      % r.status_code)
                break
            post_soup = BeautifulSoup(r.text, 'html.parser')
            content = post_soup.select(".journal-entry-text")[0]

            posts[(date_en, title)] = {
                "date": date_text,
                "title": title,
                "content": str(content),
            }

    # Sort the posts starting from the oldest
    ordered_posts = collections.OrderedDict(sorted(posts.items()))

    # Generate the target file
    epub = pypub.Epub(name)
    print("Creating the epub...")
    # dict.items(): .iteritems() does not exist on Python 3, and this
    # code already uses the Python 3 print() function.
    for key, post in ordered_posts.items():
        print("Adding post %s" % post["title"])
        c = pypub.create_chapter_from_string(post["content"],
                                             title=post["title"])
        epub.add_chapter(c)
        sleep_for_a_while()
    print("Ending epub generation")
    epub.create_epub(os.getcwd())
# Web-serial batch downloader fragment: loads a format template
# (serialinfo.json) and a serial list (webseriallist.json), then for each
# serial builds a pypub.Epub titled "<title> by <author>" and walks its
# chapters, skipping any chapter URL listed in exclusionlist.txt.
# NOTE(review): this chunk is truncated — the per-chapter `while True`
# loop continues past the visible source, and `pypub` is used without a
# visible import (presumably imported earlier in the file — confirm).
# Source arrived with newlines collapsed; code kept byte-identical below.
import os import json with open("serialinfo.json") as json_file: json_format = json.load(json_file) with open("webseriallist.json") as json_file: json_info = json.load(json_file) # Loop through the web serials in webseriallist.json for serials in json_info["webserials"]: title = serials["serialtitle"] author = serials["serialauthor"] chapterurl = serials["serialurl"] my_epub = pypub.Epub(title + ' by ' + author) print('Downloading ' + title + ' by ' + author) # Loop through the chapters in a web serial while True: # Check if chapter is excluded exclude = False with open("exclusionlist.txt", "r") as exclusionlist: while True: line = exclusionlist.readline() if not line: break if chapterurl == line.strip(): exclude = True
# CLI entry point (second variant of the interactive scraper's main).
# Differs from the earlier variant by: a --javascript-delay option passed
# to pdfkit (args.js_delay, seconds -> ms), a commented-out trace.Trace
# debugging harness, `my_chapter.title = my_chapter.html_title` to keep
# "&"-containing titles readable in kchmviewer, and EOFError handling so
# piping a file of URLs via stdin (-1 < urls.txt) terminates cleanly.
# NOTE(review): relies on module globals (args, epub_dir, temp_dir_ext,
# scrape, download, process_url, import_pypub, rm_tmp_files) not visible
# here.  Intricate interactive retry loop — code kept byte-identical
# below (source arrived with newlines collapsed).
def main(): global epub_dir if args.url: url = args.url else: url = input('URL: ').strip() url = process_url(url) #if url.endswith('.html'): #no point do like that for -f and it will need .html for -a, so don't do this # url = "/".join(url.split('/')[:-1]) parsed_uri = urlparse(url) netloc = '{uri.netloc}/'.format(uri=parsed_uri) d_name = slugify(unicode(netloc)) if args.pdf: if (not args.one) and (not os.path.isdir(d_name)): os.makedirs(d_name) ext = '.pdf' else: ext = '.epub' if args.print_date: print('Debugging\n') scrape(url, d_name, ext) elif args.one: d_name = d_name.strip() if args.pdf: fname = d_name + ext else: #.epub will auto suffix fname = d_name + ext fpath = os.path.join(os.getcwd(), fname) while os.path.exists(fpath): fname = d_name + '_' + str(int(time.time())) + ext fpath = os.path.join(os.getcwd(), fname) try: if args.pdf: # [further:0] 'https://thehackernews.com/2019/09/phpmyadmin-csrf-exploit.html' # ... nid -1 -p, can't simply -1 print('Create single pdf: ' + fpath) # test case(need default 3 seconds): https://www.quora.com/Why-does-the-loopback-interface-on-my-computer-has-65536-as-the-MTU-while-other-interfaces-has-1500-as-the-MTU pdfkit.from_url( url, fpath, options={'--javascript-delay': args.js_delay * 1000}) else: import_pypub() tmp_dir = d_name + temp_dir_ext my_epub = pypub.Epub(fname[:-5], epub_dir=tmp_dir) print('Create single epub: ' + fpath) while True: try: print('\n[' + datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S") + '] Trying url: ' + url) epub_dir = os.path.join(os.getcwd(), tmp_dir) try: ''' import trace #print("sys path: ", sys.prefix, sys.exec_prefix) tracer = trace.Trace( trace=1, #ignoredirs=[sys.prefix, sys.exec_prefix] ) ignoredirs=[ '/usr/lib/python3/', '/usr/lib/python3.6/', '/usr/lib/python3.8/', '/home/xiaobai/.local/lib/python3.6/site-packages/lxml/', ], ignoremods=[ 'version', 'pyparsing', 'six', '_tokenizer', 'serialize', 'exceptions', 'request' , '_inputstream', 'etree', 'html5parser', '_structures', 
'specifier', 'specifiers', 'serializer' , '_utils', '_compat' , '_htmlparser', 'element', 'dammit', 'universaldetector', 'codingstatemachine', 'utf8prober' , 'enums', 'mbcsgroupprober', 'charsetgroupprober', 'charsetprober', 'latin1prober' , 'charsetgroupprober', 'sbcharsetprober', 'hebrewprober', 'euctwprober', 'mbcharsetprober' , 'chardistribution', 'sbcsgroupprober', 'jpcntx', 'sjisprober', 'big5prober', 'cp949prober' , 'euckrprober', 'gb2312prober', 'eucjpprober', 'timeout', 'pyopenssl', 'SSL', 'poolmanager' , 'connectionpool', 'response', '_collections', 'core', 'intranges', 'binding', '_oid', 'x509' , 'decode_asn1', 'utils', 'extensions', 'general_name', 'cookies', 'models', 'structures' , '_internal_utils', 'sessions', 'adapters', 'hooks', 'retry' , 'connection', 'api', 'url', 'ssl_' , 'wait', 'crypto', '_util', 'backend', 'makefile' ] #count=1) ) ''' #my_chapter = tracer.runfunc(pypub.create_chapter_from_url, url) my_chapter = pypub.create_chapter_from_url(url) # To replace title contains "&"" to "&" , or else will not able open in kchmviewer # Test case: https://blog.semmle.com/semmle-discovers-severe-vulnerability-ghostscript-postscript-pdf/ my_chapter.title = my_chapter.html_title my_epub.add_chapter(my_chapter) my_epub.create_epub(os.getcwd()) rm_tmp_files() except ValueError as ve: #https://pikachu.com is an invalid url or no network connection traceback.print_exc() print(ve) try: reply = input( '\nPaste next <url> OR type \'n\' to exit: ' ).strip() except EOFError: #when use -1 and < list_of_lines_file, last line will raise EOFError break if (reply and reply[0].lower() != 'n'): url = process_url(reply) else: break except IOError as ioe: #should allow next url if requests.get() in pypub's chapter.py timeout print("\nIOError but still allow goto next chapter", ioe) except KeyboardInterrupt: #If you paste all links in once, then this need some time to trigger, but then next url only able to run one url since all the rest url get flush after 
KeyboardInterrupt, you can just find by url in link page and then copy/paste the remaining urls. reply = input( '\n[' + datetime.datetime.now().strftime( "%Y-%m-%d %H:%M:%S") + '] [r]etry OR [s]kip to next url OR [q]uit ? [r/s/q] ' ).strip() #or ctrl+c again also can exit if reply: if reply == 's': reply = input( '\nPaste next <url> OR type \'n\' to exit: ' ).strip() if (reply and reply[0].lower() != 'n'): url = process_url(reply) else: break elif reply == 'q': break #else #continue/retry #except Exception, ex: # print('single global ex: ' + ex) except IOError as ioe: print("IOError --one: ", ioe) elif not args.all: print('Download in rss feed mode') if args.feed: url = args.feed #else: shouldn't do like that, it should depends on later scrape the rss link in webpage, or else https://blog.mozilla.org/security/ not working # url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri) + 'feeds/posts/default?start-index=1&max-results=25' while url: url = download(url, url, d_name, ext) elif args.single: print('Download single year/month in website mode') download(url, url, d_name, ext) else: print('Download all in website mode') scrape(url, d_name, ext) print("\nDone")
import pypub

# Chapter source pages, added in this order.  FBReader appears twice
# because the original script deliberately added it as both the first and
# the last chapter.
CHAPTER_URLS = [
    'https://en.wikipedia.org/wiki/FBReader',
    'https://en.wikipedia.org/wiki/PocketBook_eReader',
    'https://en.wikipedia.org/wiki/Smashwords',
    'https://en.wikipedia.org/wiki/Raster_graphics',
    'https://en.wikipedia.org/wiki/FBReader',
]

my_first_epub = pypub.Epub('My Second Epub')
# Loop instead of the original's five copy-pasted fetch/add pairs.
for chapter_url in CHAPTER_URLS:
    my_first_epub.add_chapter(pypub.create_chapter_from_url(chapter_url))
#my_first_chapter1 = pypub.create_chapter_from_url('https://en.wikipedia.org/wiki/EPUB')
#my_first_epub.add_chapter(my_first_chapter1)
my_first_epub.create_epub('D:/')