def download_news(request, url, newsname):
    """Save the page at *url* as an EPUB for the current user and record it.

    The page is fetched with pypub and written below
    ``download/<FromUserName>/``. If no ``User`` row exists yet for this
    user/news pair, the MD5 of the generated EPUB is stored in a new row.

    Args:
        request: incoming request; passed to ``get_ACCESS_TOKEN`` to obtain
            the user name and to ``render``.
        url: address of the news page to convert.
        newsname: title of the news item; used as the EPUB title and as the
            file name (``<newsname>.epub``).

    Returns:
        The rendered ``test6.html`` response carrying a status message.
    """
    print('进入新闻页下载的处理函数')
    FromUserName = get_ACCESS_TOKEN(request)

    # The title may arrive as bytes; only decode when it actually is bytes so
    # an already-decoded title does not blow up on ``.decode``.
    book_title = newsname.decode() if isinstance(newsname, bytes) else newsname

    # NOTE(review): the EPUB is (re)generated before the duplicate check, as
    # in the original flow; moving the check first would save a download but
    # changes when the file on disk is refreshed.
    my_first_epub = pypub.Epub(book_title)
    my_first_chapter = pypub.create_chapter_from_url(url)
    my_first_epub.add_chapter(my_first_chapter)
    my_first_epub.create_epub('download/' + FromUserName)

    saved_rows = User.objects.filter(username=FromUserName, newsname=newsname)
    print(saved_rows)
    if len(saved_rows) != 0:
        data = '该新闻已被保存,请选择其它新闻。'
        return render(request, 'test6.html', {'data': data})

    path = 'download' + '/' + FromUserName + '/' + book_title + '.epub'
    print(path)
    # An EPUB is a zip archive: read it as raw bytes and hash those bytes.
    # Text mode plus a latin-1 round trip either fails outright or corrupts
    # the digest input.
    with open(path, 'rb') as f:
        token = hashlib.md5(f.read()).hexdigest()
    User.objects.create(username=FromUserName,
                        newsname=newsname,
                        status=False,
                        md5=token)
    data = '该新闻页保存成功'
    return render(request, 'test6.html', {'data': data})
def main():
    """Resolve the target URL and dispatch to the mode selected in ``args``.

    Modes, in the order they are checked:
      * ``args.print_date``: debugging run of ``scrape``.
      * ``args.one``: save single pages. PDF mode converts one URL with
        pdfkit; EPUB mode keeps prompting for further URLs and appends each
        as a chapter of the same book.
      * not ``args.all``: RSS feed mode; ``download`` is called repeatedly
        with the URL it returns until that URL is empty.
      * ``args.single`` (only reached when ``args.all`` is set): one
        ``download`` call for a year/month archive.
      * otherwise: ``scrape`` the whole site.

    Sets the module-level ``epub_dir`` while building a single EPUB.
    """
    global epub_dir
    if args.url:
        url = args.url
    else:
        url = input('URL: ').strip()
    url = process_url(url)
    # Do not strip a trailing '.html' segment here: it is pointless for -f
    # and -a needs the '.html'.
    parsed_uri = urlparse(url)
    netloc = '{uri.netloc}/'.format(uri=parsed_uri)
    d_name = slugify(unicode(netloc))
    if args.pdf:
        if (not args.one) and (not os.path.isdir(d_name)):
            os.makedirs(d_name)
        ext = '.pdf'
    else:
        ext = '.epub'
    if args.print_date:
        print('Debugging\n')
        scrape(url, d_name, ext)
    elif args.one:
        d_name = d_name.strip()
        # Same name for both formats; pypub adds '.epub' itself, which is why
        # the suffix is sliced off again when the Epub object is created.
        fname = d_name + ext
        fpath = os.path.join(os.getcwd(), fname)
        # Never overwrite an existing file: append a timestamp until unique.
        while os.path.exists(fpath):
            fname = d_name + '_' + str(int(time.time())) + ext
            fpath = os.path.join(os.getcwd(), fname)
        try:
            if args.pdf:
                print('Create single pdf: ' + fpath)
                pdfkit.from_url(url, fpath)
            else:
                import_pypub()
                tmp_dir = d_name + temp_dir_ext
                my_epub = pypub.Epub(fname[:-5], epub_dir=tmp_dir)
                print('Create single epub: ' + fpath)
                while True:
                    try:
                        print('\n[' + datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + '] Trying url: ' + url)
                        epub_dir = os.path.join(os.getcwd(), tmp_dir)
                        try:
                            my_chapter = pypub.create_chapter_from_url(url)
                            my_epub.add_chapter(my_chapter)
                            my_epub.create_epub(os.getcwd())
                            rm_tmp_files()
                        except ValueError as ve:
                            # e.g. https://pikachu.com is an invalid url, or
                            # there is no network connection
                            print(ve)
                        try:
                            reply = input(
                                '\nPaste next <url> OR type \'n\' to exit: '
                            ).strip()
                        except EOFError:
                            # stdin exhausted (e.g. URLs piped from a file):
                            # nothing more to process
                            break
                        if reply and reply[0].lower() != 'n':
                            url = process_url(reply)
                        else:
                            break
                    except IOError as ioe:
                        # e.g. a requests.get() timeout inside pypub; loop
                        # again instead of aborting the book
                        print("\nIOError but still allow goto next chapter",
                              ioe)
                    except KeyboardInterrupt:
                        # When many links were pasted at once this can take a
                        # while to trigger, and the pending pasted input is
                        # flushed afterwards, so the remaining urls must be
                        # pasted again. A second Ctrl+C at the prompt exits.
                        try:
                            choice = input(
                                '\n[' + datetime.datetime.now().strftime(
                                    "%Y-%m-%d %H:%M:%S") +
                                '] [r]etry OR [s]kip to next url OR [q]uit ? [r/s/q] '
                            ).strip()
                            next_url = ''
                            if choice == 's':
                                next_url = input(
                                    '\nPaste next <url> OR type \'n\' to exit: '
                                ).strip()
                        except EOFError:
                            # No interactive input left to answer the prompt.
                            break
                        if choice == 's':
                            if next_url and next_url[0].lower() != 'n':
                                url = process_url(next_url)
                            else:
                                break
                        elif choice == 'q':
                            break
                        # any other answer: retry the same url
        except IOError as ioe:
            print("IOError --one: ", ioe)
    elif not args.all:
        print('Download in rss feed mode')
        if args.feed:
            url = args.feed
        # No default Blogger feed URL is built here: the feed link is scraped
        # from the page later, otherwise sites such as
        # https://blog.mozilla.org/security/ would not work.
        while url:
            url = download(url, url, d_name, ext)
    elif args.single:
        print('Download single year/month in website mode')
        download(url, url, d_name, ext)
    else:
        print('Download all in website mode')
        scrape(url, d_name, ext)
    print("\nDone")
def download(url, h, d_name, ext):
    """Fetch the posts listed at *url* and save them as PDFs or EPUB chapters.

    With ``args.all`` set, the Blogger archive widget listing for *url* is
    requested and split into title/link records. Otherwise *url* is parsed as
    a feed with feedparser; if the first feed yields no entries, the page is
    fetched once and searched for an RSS/Atom link, which is then parsed
    instead. In feed mode the URL of the next feed page is also worked out.

    Each record is then written either as its own PDF (``args.pdf``) or added
    as a chapter to the shared module-level ``my_epub``, which is re-written
    after every chapter.

    Args:
        url: archive page URL (website mode) or feed URL (feed mode).
        h: in website mode, appended as the ``path=`` value of the archive
            query; in feed mode it is overwritten per entry with the HTML
            built for that entry.
        d_name: output directory name for PDFs / base name for the EPUB.
        ext: extension appended to output paths.

    Returns:
        ``url``. In feed mode this is the next page to fetch, or ``''`` when
        there is none; ``main`` loops on it.

    Reads and writes the globals ``download_once``, ``init_url_once``,
    ``img_css_style``, ``my_epub`` and ``epub_dir``. Calls ``os._exit(1)``
    on unrecoverable request errors.

    NOTE(review): this function was recovered from a whitespace-collapsed
    source; block nesting was inferred from the control-flow keywords and
    should be confirmed against the upstream file.
    """
    global download_once
    global init_url_once
    global img_css_style
    global my_epub
    global epub_dir
    if not args.pdf:
        import_pypub()
    # e.g. 'https://diannaoxiaobai.blogspot.com/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=https://diannaoxiaobai.blogspot.com/2018/'
    visit_link = url
    orig_url = url
    if args.all:
        y_url = url + "/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=" + h
        print("Scraping year... " + y_url)
        try:
            r = urlopen(y_url).read()
        except HTTPError as he:
            print(
                '\nNote that -a -s only allow if url has /year/[month] format, pls check your url\n'
            )
            clean_up()
            os._exit(1)
        if sys.version_info[0] >= 3:
            r = r.decode('utf-8')
        # Each record of the widget response starts with a 'title' key.
        t = r.split("'title'")
        t = t[1:]
    else:
        url = process_rss_link(url)
        print("Scraping rss feed... " + url)
        # Custom request headers (User-Agent/Referer) were tried here and are
        # disabled. Observed: https://blog.mozilla.org/security/feed/1
        # (the /1 does not exist) still parses in feedparser.
        r = feedparser.parse(url)
        #print(r.headers)
        t = r['entries']
        # Only fall back to scraping the page when there are no entries; an
        # earlier variant also checked for a missing feed 'link'.
        # A feed answering 'User does not have permission to read this blog.'
        # ends up in this branch.
        if (not init_url_once) and (not t):
            init_url_once = True
            try:
                print("Try to scrape rss feed url automatically ... " +
                      orig_url)
                # A plain urlopen() is not used: https://medium.com/bugbountywriteup
                # checks the User-Agent and rejects the urllib2 default.
                if sys.version_info[0] >= 3:
                    req = urllib.request.Request(orig_url,
                                                 data=None,
                                                 headers={'User-Agent': UA})
                    r = urllib.request.urlopen(req).read()
                else:
                    req = urllib2.Request(orig_url,
                                          headers={'User-Agent': UA})
                    r = urllib2.urlopen(req).read()
            except Exception as e:
                print(e)
                print(
                    "Request webpage failed, please check your network OR authorized to access that url."
                )
                clean_up()
                # os._exit rather than sys.exit(-1), so the exit does not
                # propagate to main() and print an exception there
                os._exit(1)
            soup = BeautifulSoup(r, "lxml")
            data = soup.findAll('link', attrs={'type': 'application/rss+xml'})
            if not data:
                # https://github.com/RSS-Bridge/rss-bridge/issues/566 only has atom
                data = soup.findAll('link',
                                    attrs={'type': 'application/atom+xml'})
            if not data:
                # https://blog.google/products/
                data = soup.findAll('a', attrs={'href': '/rss/'})
            if data:
                url = data[0].get("href")
                url = process_rss_link(url)
                if url.startswith('/'):
                    # http://sectools.org/tag/sploits/ only has href="/feed/"
                    parsed_orig_uri = urlparse(orig_url)
                    url = '{uri.scheme}://{uri.netloc}'.format(
                        uri=parsed_orig_uri) + url
                print("Scraping rss feed one more time ... " + url)
                r = feedparser.parse(url)
                t = r['entries']
                if not t:
                    t = []
            else:
                # No feed link found; note r still holds the raw page here,
                # which is why later code checks 'keys' in dir(r).
                t = []
        else:
            # Unlike blogspot, wordpress always returns entries, so the flag
            # has to be set here as well.
            init_url_once = True
        # NOTE(review): the next-page computation below is placed inside the
        # feed branch; confirm against upstream that it is not meant to run
        # in website mode too.
        parsed_url = urlparse(url)
        is_wordpress = '{uri.netloc}'.format(
            uri=parsed_url).endswith('wordpress.com')
        if not is_wordpress:
            # Only run the second check when the first is False, otherwise it
            # would override the first result.
            try:
                if 'keys' in dir(r):
                    is_wordpress = r.get('feed', {}).get(
                        'generator', '').startswith('https://wordpress.org/')
            except Exception as e:
                print('parse generator error', e)
        if is_wordpress and t:
            # Advance 'paged' only when the current page had entries.
            # (parse_qs was not available in the author's Python 2, hence the
            # manual query handling.)
            if 'paged=' in parsed_url.query:
                # NOTE(review): int() takes everything after the last
                # 'paged=', so this assumes 'paged' is the final parameter.
                wp_paged_v = int(
                    parsed_url.query[parsed_url.query.rindex('paged=') +
                                     len('paged='):])
                # uri.path already starts with '/' when non-empty, so no '/'
                # is added after netloc (it would otherwise grow to '////...'
                # page after page).
                url = '{uri.scheme}://{uri.netloc}{uri.path}?'.format(
                    uri=parsed_url) + parsed_url.query.replace(
                        'paged=' + str(wp_paged_v),
                        'paged=' + str(wp_paged_v + 1))
            else:
                url = ''
                print('no next')
        # NOTE(review): the condition tests for 'link' but the body reads
        # 'links'; confirm both keys are always present together.
        elif ("keys" in dir(r)) and ('link' in r['feed'].keys()):
            l = r['feed']['links']
            if l:
                got_next = False
                for ll in l:
                    if ll['rel'] == 'next':
                        # A next link equal to the current url is not
                        # special-cased (no test case for it).
                        url = ll['href']
                        got_next = True
                        break
                if not got_next:
                    url = ''
            else:
                url = ''
        elif not t:
            # A next page whose index suddenly changed and has no content is
            # not handled specially.
            url = ''
            print_rss_err()
    count = 0
    for tt in t:
        count += 1
        title_raw = ''
        title_is_link = False
        if not args.all:
            # Feed mode: tt is a feedparser entry.
            # e.g. parser.parse('2012-12-22T08:36:46.043-08:00').strftime('%B %d, %Y, %H:%M %p')
            h = ''
            # https://github.com/RSS-Bridge/rss-bridge/commits/master.atom only has 'updated'
            post_date = tt.get('published', tt.get('updated', ''))
            t_date = ''
            try:
                if args.locale:
                    if sys.version_info[0] >= 3:
                        t_date = parse_locale(post_date)
                    else:
                        t_date = parse_locale(post_date).decode('utf-8')
                else:
                    t_date = date_parser.parse(post_date).strftime(
                        '%B %d, %Y, %H:%M %p')
            except ValueError:
                # Unknown string format, e.g. https://www.xul.fr/en-xml-rss.html
                # has odd dates such as 'Wed, 29 Jul 09 15:56:54 0200';
                # show the raw value instead.
                t_date = post_date
            for feed_links in tt['links']:
                if feed_links['rel'] == 'alternate':
                    visit_link = feed_links['href']
            title_raw = tt['title'].strip()
            title_pad = title_raw + ' '
            if (not args.pdf) or (not tt['title']):
                # Copying a link from text is a problem in EPUB readers, so
                # the EPUB always shows the link itself as the anchor text.
                tt['title'] = visit_link
                title_is_link = True
            if args.pdf:
                # With the image CSS, images do not appear at all in the PDF.
                img_css_style = ''
            author = tt.get('author_detail', {}).get('name')
            if not author:
                author = tt.get('site_name', '')  # https://blog.google/rss/
            # Header block: author, date, title and a link back to the post.
            h = '<div><small>' + author + ' ' + t_date + '<br/><i>' + title_pad + '<a style="text-decoration:none;color:black" href="' + visit_link + '">' + tt[
                'title'] + '</a></i></small><br/><br/></div>' + img_css_style
            media_content = ''
            try:
                if 'media_content' in tt:
                    # wordpress/blog.google carry a list of images with
                    # links, e.g. darrentcy.wordpress.com
                    for tm in tt['media_content']:
                        # 'in' is used because Python 3 dicts have no has_key()
                        if ('medium' in tm) and (tm['medium']
                                                 == 'image') and 'url' in tm:
                            media_content += '<img src="' + tm['url'] + '" >'
                # 'media_thumbnail' (e.g. https://gigaom.com/feed/) is
                # deliberately not used: feed thumbnails normally duplicate
                # an image already in the entry and appear to be metadata
                # only.
            except Exception as e:
                print(e)
                print('parse media error')
            # pdfkit needs an explicit charset; EPUB appears not to.
            if args.pdf:
                h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt[
                    'summary'].replace(
                        '<div class="separator"',
                        '<div class="separator" align="center" '
                    ) + media_content + '</div></body>'
            else:
                # EPUB chapters cannot set <head>/<body>. The separator
                # replacement is not needed here because the enclosing <div>
                # already centers the content.
                h = h + '<div align="center">' + tt['summary'].replace(
                    '<br /><br /><br />', '<br />') + media_content + "</div>"
            title = tt['title']
            t_url = visit_link
        else:
            # Website mode: tt is a fragment of the archive widget response;
            # the title and the url are taken from its quote-separated fields.
            field = tt.split("'")
            title = field[1]
            title_raw = title.strip()
            t_url = field[5]
        print('\ntitle: ' + title_raw)
        print('link: ' + t_url)
        if args.pdf:
            print('Download html as PDF, please be patient...' + str(count) +
                  '/' + str(len(t)))
        else:
            print('Download html as EPUB, please be patient...' + str(count) +
                  '/' + str(len(t)))
        if args.pdf:
            if title_is_link:
                # Keep only the last three path segments of the link;
                # otherwise the slashes would be left with an empty name.
                title = '/'.join(title.split('/')[-3:])
            if sys.version_info[0] >= 3:
                fname = os.path.join(d_name, slugify(unicode(title)))
            else:
                print(title)
                try:
                    title = title.decode('utf-8')
                except:
                    pass  # a decode error here is expected and harmless
                title = replacer(title)
                fname = os.path.join(d_name, slugify(title))
        else:
            # An EPUB is one file with many chapters, so its name does not
            # depend on the post title.
            fname = d_name
        fpath = os.path.join(os.getcwd(), fname)
        # NOTE(review): the [:-1] slices below drop the last character of
        # d_name — presumably a trailing separator produced in main();
        # confirm.
        if args.pdf:
            check_path = os.path.join(fpath + ext)
        else:
            check_path = fpath[:-1] + ext
        if (not download_once) and os.path.exists(check_path):
            # Target exists before the first write: add a timestamp.
            if args.pdf:
                fpath = fpath + '_' + str(int(time.time())) + ext
            else:
                # pypub truncates '_', so a space is used as the separator
                fname = fname[:-1] + ' ' + str(int(time.time()))
        else:
            if args.pdf:
                fpath += ext
            else:
                fpath = fpath[:-1] + ext
                fname = fname[:-1]
        if args.pdf:
            print("file path: " + fpath)
            if args.all:
                try:
                    pdfkit.from_url(t_url, fpath)
                except IOError as ioe:
                    print("pdfkit IOError")
            else:
                try:
                    # Unescaped entities break the generated XHTML, e.g.
                    # https://security.googleblog.com/2013/10/dont-mess-with-my-browser.html
                    # could not be opened in kchmviewer. To reproduce, jump
                    # to an index with
                    # -f 'https://security.googleblog.com/feeds/posts/default?start-index=179&max-results=1'.
                    # rf: https://www.w3.org/wiki/Common_HTML_entities_used_for_typography
                    h = replacer(h)
                    pdfkit.from_string(h, fpath)
                except IOError as ioe:
                    print('Exception IOError: ' + repr(ioe))
        else:
            if not download_once:
                # First chapter: create the shared Epub object.
                download_once = True
                print("file path: " + fpath)
                if os.path.exists(fname + temp_dir_ext):
                    # refuse to blindly replace an existing temp directory
                    print(
                        fname + temp_dir_ext +
                        " already exists, please move/backup that direcory to another place manually. Abort"
                    )
                    os._exit(1)
                tmp_dir = fname + temp_dir_ext
                my_epub = pypub.Epub(fname, epub_dir=tmp_dir)
                epub_dir = os.path.join(os.getcwd(), tmp_dir)
                print("epub_dir: " + epub_dir)
            if title_raw:
                try:
                    title = title.decode('utf-8')
                except:
                    pass
                try:
                    # fixed -as http://miniechung1998.blogspot.com/2012/12/xd-xd.html
                    title_raw = title_raw.decode('utf-8')
                except:
                    pass
                # NOTE(review): as written, these three replace() calls map
                # each character to itself and do nothing. The surrounding
                # comments talk about escaping '&' in titles, so the entity
                # forms were probably lost when this source was extracted;
                # confirm against upstream before relying on title escaping.
                title_raw = replacer(title_raw).replace('&', '&').replace(
                    '<', '<').replace('>', '>')
            if args.all:
                if title_raw:
                    my_chapter = pypub.create_chapter_from_url(title=title_raw,
                                                               url=t_url)
                else:
                    # No title available; t_url is deliberately not used as
                    # the title.
                    my_chapter = pypub.create_chapter_from_url(t_url)
                my_chapter.title = replacer(my_chapter.title)
                # Assigning my_chapter.content directly does not take effect;
                # the chapter's _content_tree would have to be rebuilt, so
                # the content is left untouched here.
                try:
                    my_chapter.title = my_chapter.title.decode('utf-8')
                except:
                    # -a http://cuhkt48.blogspot.com/2016/07/blog-post.html
                    pass
            else:
                # h is not passed through replacer() here: some posts contain
                # markup inside their body that must not be blindly
                # unescaped, e.g.
                # https://security.googleblog.com/2009/03/reducing-xss-by-way-of-automatic.html
                if title_raw:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title=title_raw, url=t_url)
                else:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title='/'.join(title.split('/')[-3:]), url=t_url)
            my_epub.add_chapter(my_chapter)
            # The book is re-written after every chapter.
            my_epub.create_epub(os.getcwd())
            rm_tmp_files()
    return url  # return value used for rss feed mode only
def main():
    """Resolve the target URL and dispatch to the mode selected in ``args``.

    Modes, in the order they are checked:
      * ``args.print_date``: debugging run of ``scrape``.
      * ``args.one``: save single pages. PDF mode converts one URL with
        pdfkit (honouring ``args.js_delay``); EPUB mode keeps prompting for
        further URLs and appends each as a chapter of the same book.
      * not ``args.all``: RSS feed mode; ``download`` is called repeatedly
        with the URL it returns until that URL is empty.
      * ``args.single`` (only reached when ``args.all`` is set): one
        ``download`` call for a year/month archive.
      * otherwise: ``scrape`` the whole site.

    Sets the module-level ``epub_dir`` while building a single EPUB.
    """
    global epub_dir
    if args.url:
        url = args.url
    else:
        url = input('URL: ').strip()
    url = process_url(url)
    # Do not strip a trailing '.html' segment here: it is pointless for -f
    # and -a needs the '.html'.
    parsed_uri = urlparse(url)
    netloc = '{uri.netloc}/'.format(uri=parsed_uri)
    d_name = slugify(unicode(netloc))
    if args.pdf:
        if (not args.one) and (not os.path.isdir(d_name)):
            os.makedirs(d_name)
        ext = '.pdf'
    else:
        ext = '.epub'
    if args.print_date:
        print('Debugging\n')
        scrape(url, d_name, ext)
    elif args.one:
        d_name = d_name.strip()
        # Same name for both formats; pypub adds '.epub' itself, which is why
        # the suffix is sliced off again when the Epub object is created.
        fname = d_name + ext
        fpath = os.path.join(os.getcwd(), fname)
        # Never overwrite an existing file: append a timestamp until unique.
        while os.path.exists(fpath):
            fname = d_name + '_' + str(int(time.time())) + ext
            fpath = os.path.join(os.getcwd(), fname)
        try:
            if args.pdf:
                # Author's note: pages such as
                # 'https://thehackernews.com/2019/09/phpmyadmin-csrf-exploit.html'
                # need -1 together with -p; plain -1 is not enough.
                print('Create single pdf: ' + fpath)
                # The JavaScript delay is passed in milliseconds. Test case
                # that needs the default 3 seconds:
                # https://www.quora.com/Why-does-the-loopback-interface-on-my-computer-has-65536-as-the-MTU-while-other-interfaces-has-1500-as-the-MTU
                pdfkit.from_url(
                    url,
                    fpath,
                    options={'--javascript-delay': args.js_delay * 1000})
            else:
                import_pypub()
                tmp_dir = d_name + temp_dir_ext
                my_epub = pypub.Epub(fname[:-5], epub_dir=tmp_dir)
                print('Create single epub: ' + fpath)
                while True:
                    try:
                        print('\n[' + datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + '] Trying url: ' + url)
                        epub_dir = os.path.join(os.getcwd(), tmp_dir)
                        try:
                            # Debugging aid: to trace pypub, run this call
                            # through trace.Trace(trace=1, ignoredirs=[...],
                            # ignoremods=[...]).runfunc(
                            #     pypub.create_chapter_from_url, url)
                            my_chapter = pypub.create_chapter_from_url(url)
                            # Use the page's html_title as the chapter title;
                            # per the author, titles containing '&' otherwise
                            # fail to open in kchmviewer. Test case:
                            # https://blog.semmle.com/semmle-discovers-severe-vulnerability-ghostscript-postscript-pdf/
                            my_chapter.title = my_chapter.html_title
                            my_epub.add_chapter(my_chapter)
                            my_epub.create_epub(os.getcwd())
                            rm_tmp_files()
                        except ValueError as ve:
                            # e.g. https://pikachu.com is an invalid url, or
                            # there is no network connection
                            traceback.print_exc()
                            print(ve)
                        try:
                            reply = input(
                                '\nPaste next <url> OR type \'n\' to exit: '
                            ).strip()
                        except EOFError:
                            # with -1 and stdin redirected from a file of
                            # urls, reading past the last line raises EOFError
                            break
                        if reply and reply[0].lower() != 'n':
                            url = process_url(reply)
                        else:
                            break
                    except IOError as ioe:
                        # e.g. a requests.get() timeout inside pypub; loop
                        # again instead of aborting the book
                        print("\nIOError but still allow goto next chapter",
                              ioe)
                    except KeyboardInterrupt:
                        # When many links were pasted at once this can take a
                        # while to trigger, and the pending pasted input is
                        # flushed afterwards, so the remaining urls must be
                        # pasted again. A second Ctrl+C at the prompt exits.
                        try:
                            choice = input(
                                '\n[' + datetime.datetime.now().strftime(
                                    "%Y-%m-%d %H:%M:%S") +
                                '] [r]etry OR [s]kip to next url OR [q]uit ? [r/s/q] '
                            ).strip()
                            next_url = ''
                            if choice == 's':
                                next_url = input(
                                    '\nPaste next <url> OR type \'n\' to exit: '
                                ).strip()
                        except EOFError:
                            # Same exhausted-stdin case as the normal prompt:
                            # nobody can answer, so stop instead of crashing.
                            break
                        if choice == 's':
                            if next_url and next_url[0].lower() != 'n':
                                url = process_url(next_url)
                            else:
                                break
                        elif choice == 'q':
                            break
                        # any other answer: retry the same url
        except IOError as ioe:
            print("IOError --one: ", ioe)
    elif not args.all:
        print('Download in rss feed mode')
        if args.feed:
            url = args.feed
        # No default Blogger feed URL is built here: the feed link is scraped
        # from the page later, otherwise sites such as
        # https://blog.mozilla.org/security/ would not work.
        while url:
            url = download(url, url, d_name, ext)
    elif args.single:
        print('Download single year/month in website mode')
        download(url, url, d_name, ext)
    else:
        print('Download all in website mode')
        scrape(url, d_name, ext)
    print("\nDone")
import pypub

# Wikipedia articles bundled into the book, in chapter order. The FBReader
# article is listed twice (first and last), as in the original script; the
# EPUB article (https://en.wikipedia.org/wiki/EPUB) is left out.
CHAPTER_URLS = (
    'https://en.wikipedia.org/wiki/FBReader',
    'https://en.wikipedia.org/wiki/PocketBook_eReader',
    'https://en.wikipedia.org/wiki/Smashwords',
    'https://en.wikipedia.org/wiki/Raster_graphics',
    'https://en.wikipedia.org/wiki/FBReader',
)

my_first_epub = pypub.Epub('My Second Epub')
# Fetch each page and append it as a chapter before moving to the next one.
for chapter_url in CHAPTER_URLS:
    my_first_epub.add_chapter(pypub.create_chapter_from_url(chapter_url))
# Write the finished book to the root of drive D:.
my_first_epub.create_epub('D:/')