def main(args):
    url = args.url
    log.info("Using URL: {}".format(url))
    response = requests.get(url)
    content = response.content
    if args.debug:
        # Cache the downloaded page under its last path segment so repeated
        # debug runs can re-use the local copy instead of hitting the network.
        filename = url.split('/')[-1]
        if not os.path.exists(filename):
            with open(filename, 'wb') as f:
                f.write(content)
        else:
            with open(filename, 'rb') as f:
                content = f.read()
    soup = BeautifulSoup(content, 'html5lib')
    title = soup.find('h1').text
    log.info("Title is: {}".format(title))
    author = soup.find_all('h5')[0].text.strip().replace(u'\xa0', u' ')[3:]
    log.info("Author is: {}".format(author))
    # chapter_titles = soup.findAll('h5', attrs={"class": "modal-title"})[1:]
    chapter_titles = [
        x.text.strip() for x in soup.findAll(
            'div', attrs={"class": "alert alert-info xy_alertheader"})
    ][:-1]
    for e in chapter_titles:
        log.info("Found Chapter: {}".format(e))
    log.info("Number of chapters found: {}".format(len(chapter_titles)))
    title_string = "{} - by {}".format(title.strip(), author)
    log.info('Book name is: {}'.format(title_string))
    epub = pypub.Epub(title_string, creator=author)
    for num, z in enumerate(
            soup.findAll('div', attrs={"class": "xy_partbg p-4"}), start=0):
        try:
            assert chapter_titles[num]
            # TODO: URL is no longer correct, need to find method to pull from page
            log.info('Adding: {}#Part_{}'.format(url, num))
            if num == 0:
                # This is needed to remove the overlay letter from the page
                c = pypub.create_chapter_from_string(
                    "<h1>Part {}</h1>".format(num + 1) +
                    str(z.find('div', attrs={'class': 'xy_overlaytext'})),
                    title=chapter_titles[num])
            else:
                c = pypub.create_chapter_from_string(
                    "<h1>Part {}</h1>".format(num + 1) + str(z),
                    title=chapter_titles[num])
            epub.add_chapter(c)
            del c
        except ValueError as e:
            raise ValueError(e)
        except IndexError:
            pass
    if args.output:
        output = os.path.expanduser(args.output)
    else:
        output = os.getcwd()
    epub.create_epub(output, epub_name=title_string.replace(u'\xa0', u' '))
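# main() above only reads args.url, args.debug and args.output. A minimal sketch
# of the command-line wiring it assumes could look like the following; the parser
# description and help strings are illustrative, not taken from the original script.
import argparse
import logging

log = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(description="Scrape a story page into an EPUB")
    parser.add_argument("url", help="page to scrape")
    parser.add_argument("--debug", action="store_true",
                        help="cache the downloaded page locally and re-use it")
    parser.add_argument("--output", help="directory to write the .epub into")
    return parser.parse_args()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main(parse_args())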
def add_chapter_file(href, title):
    file_path = href.replace('/sre/book/', 'html/')
    with open(file_path, 'r') as f:
        contents = f.read()
    chapter_soup = BeautifulSoup(contents, 'html.parser')
    chapter_html = chapter_soup.select_one('.content').prettify("utf-8")
    chapter = pypub.create_chapter_from_string(chapter_html, url=None, title=title)
    epub.add_chapter(chapter)
def add_chapter_file(href, title):
    file_path = href.replace('/sre/book/', 'html/')
    with open(file_path, 'r') as f:
        contents = f.read()
    chapter_soup = BeautifulSoup(contents, 'html.parser')
    chapter_soup = chapter_soup.select_one('.content')
    # Rewrite chapter-to-chapter links so they resolve inside the EPUB
    # instead of pointing back at the /sre/book/chapters/ site paths.
    for link in chapter_soup.find_all('a', href=True):
        link['href'] = link['href'].replace('/sre/book/chapters/', '')
    chapter_html = chapter_soup.prettify("utf-8")
    chapter = pypub.create_chapter_from_string(
        chapter_html, url=None, title=title)
    epub.add_chapter(chapter)
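# Both add_chapter_file() variants rely on a module-level `epub` object. A minimal
# driver sketch is shown below; the book title and the chapter hrefs/titles are
# placeholders (hypothetical), only the pypub calls mirror the snippets above.
import pypub

epub = pypub.Epub('SRE Book (local mirror)')  # hypothetical title

toc = [
    # hypothetical table of contents entries: (site href, chapter title)
    ('/sre/book/chapters/introduction.html', 'Introduction'),
    ('/sre/book/chapters/embracing-risk.html', 'Embracing Risk'),
]

for href, title in toc:
    add_chapter_file(href, title)

epub.create_epub('.')  # write the .epub into the current directory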
def _scrape(self) -> None:
    """
    Downloads the page data and scrapes text.

    :return: None.
    """
    soup = Scraper().scrape(self.link)
    # Strip images and scripts before handing the markup to pypub.
    for i in soup.find_all("img"):
        i.decompose()
    for s in soup.find_all("script"):
        s.decompose()
    # TODO: use some subset of these tags for cleaning up end matter
    # for exclude in ["comment", "button", "tag",
    #                 "related", "share", "footer"]:
    #     for section in soup.find_all(id=re.compile(f".*{exclude}.*")):
    #         section.decompose()
    #     for section in soup.find_all(class_=re.compile(f".*{exclude}.*")):
    #         section.decompose()
    # Serialize the cleaned tree back to an HTML string for pypub.
    data = str(soup)
    self.contents = pypub.create_chapter_from_string(data, title=self.name)
def create_epub_ch(epub, file):
    # pypub only accepts HTML, so collect the .txt file text and put it into HTML.
    message = """<html>
    <head></head>
    <body>%s</body>
    </html>"""
    added_text = []
    # Read from the cleaned files (txt files ending in "_fixed.txt").
    with open(os.path.join("output", file + "_fixed.txt"), "r",
              encoding='utf-8') as f:
        for line in f.read().split('\n'):
            if line:
                text_adding = "<p>" + line + "</p>"
                added_text.append(text_adding)
    message = message % "".join(added_text)
    # Create the chapter.
    chapter = pypub.create_chapter_from_string(
        message, url=None, title=str("%g" % extract_num(file)))
    epub.add_chapter(chapter)
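# A minimal driver sketch for create_epub_ch(): iterate over the cleaned
# "*_fixed.txt" files in output/ in chapter order, then write the book.
# The book title is a placeholder, and sorting by extract_num assumes the same
# filename-to-number helper the function above already uses.
import os
import pypub

epub = pypub.Epub('Converted Text Chapters')  # hypothetical title

stems = sorted(
    (f[:-len("_fixed.txt")] for f in os.listdir("output") if f.endswith("_fixed.txt")),
    key=extract_num,  # numeric chapter order, reusing the helper from above
)

for stem in stems:
    create_epub_ch(epub, stem)

epub.create_epub(os.getcwd())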
def download(url, h, d_name, ext):
    global download_once
    global init_url_once
    global img_css_style
    global my_epub
    global epub_dir
    if not args.pdf:
        import_pypub()
    #e.g. 'https://diannaoxiaobai.blogspot.com/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=https://diannaoxiaobai.blogspot.com/2018/'
    visit_link = url
    orig_url = url
    if args.all:
        y_url = url + "/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=" + h
        print("Scraping year... " + y_url)
        try:
            r = urlopen(y_url).read()
        except HTTPError as he:
            print('\nNote that -a -s only allow if url has /year/[month] format, pls check your url\n')
            clean_up()
            os._exit(1)
        if sys.version_info[0] >= 3:
            r = r.decode('utf-8')
        t = r.split("'title'")
        t = t[1:]
    else:
        url = process_rss_link(url)
        print("Scraping rss feed... " + url)
        r = feedparser.parse(url)  #, request_headers={'User-Agent': UA, 'Referer': url})
        #I noticed https://blog.mozilla.org/security/feed/1 (/1 non exist) is working in feedparser, lolr
        #print(r.headers)
        t = r['entries']
        #if (not t) or ("link" not in r['feed'].keys()): #if got entries then why need retry? no need check link
        if (not init_url_once) and (not t):
            #'User does not have permission to read this blog.' of rss feed come here
            init_url_once = True
            #parsed_url = urlparse(url)
            #if not '{uri.netloc}'.format(uri=parsed_url).endswith('wordpress.com'):
            try:
                print("Try to scrape rss feed url automatically ... " + orig_url)
                ##r = urlopen(orig_url).read() #https://medium.com/bugbountywriteup got check UA, if urllib2 UA then not authorized
                if sys.version_info[0] >= 3:
                    req = urllib.request.Request(orig_url, data=None,
                                                 headers={'User-Agent': UA})
                    r = urllib.request.urlopen(req).read()
                else:
                    req = urllib2.Request(orig_url, headers={'User-Agent': UA})
                    r = urllib2.urlopen(req).read()
            except Exception as e:
                print(e)
                print("Request webpage failed, please check your network OR authorized to access that url.")
                clean_up()
                #don't use sys.exit(-1) if don't want to traceback to main() to print exception
                os._exit(1)
            soup = BeautifulSoup(r, "lxml")
            data = soup.findAll('link', attrs={'type': 'application/rss+xml'})
            if not data:
                #https://github.com/RSS-Bridge/rss-bridge/issues/566 only has atom
                data = soup.findAll('link', attrs={'type': 'application/atom+xml'})
            if not data:
                data = soup.findAll('a', attrs={'href': '/rss/'})  #https://blog.google/products/
            if data:
                url = data[0].get("href")
                url = process_rss_link(url)
                if url.startswith('/'):
                    #http://sectools.org/tag/sploits/ only has href="/feed/"
                    parsed_orig_uri = urlparse(orig_url)
                    url = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_orig_uri) + url
                print("Scraping rss feed one more time ... " + url)
                r = feedparser.parse(url)
                t = r['entries']
                if not t:
                    t = []
            else:
                t = []
        else:
            #unlike blogspot, wordpress always got t, so need set true here
            init_url_once = True
        parsed_url = urlparse(url)
        is_wordpress = '{uri.netloc}'.format(uri=parsed_url).endswith('wordpress.com')
        if not is_wordpress:
            #only check next if 1st check is False, or else 2nd check override 1st result
            try:
                if 'keys' in dir(r):
                    is_wordpress = r.get('feed', {}).get(
                        'generator', '').startswith('https://wordpress.org/')
            except Exception as e:
                print('parse generator error', e)
        if is_wordpress and t:
            #increment paged only if current page got entries, i.e. t
            #parsed_keys = urlparse.parse.parse_qs(parsed_url.query) #my python 2 don't have parse_qs
            if 'paged=' in parsed_url.query:
                wp_paged_v = int(parsed_url.query[parsed_url.query.rindex('paged=') + len('paged='):])
                #uri.path default prefix with '/' if not empty, so don't set '/' after netloc
                #or else keep increase '////...' in each page
                url = '{uri.scheme}://{uri.netloc}{uri.path}?'.format(uri=parsed_url) \
                    + parsed_url.query.replace('paged=' + str(wp_paged_v),
                                               'paged=' + str(wp_paged_v + 1))
            else:
                url = ''
                print('no next')
        elif ("keys" in dir(r)) and ('link' in r['feed'].keys()):
            l = r['feed']['links']
            if l:
                got_next = False
                for ll in l:
                    if ll['rel'] == 'next':
                        #if ll['href'] != url: #don't have next link is same case to test
                        url = ll['href']
                        got_next = True
                        break
                if not got_next:
                    url = ''
            else:
                url = ''
        elif not t:
            #no need care if next page rss index suddenly change and no content case
            url = ''
            print_rss_err()
    count = 0
    for tt in t:
        count += 1
        title_raw = ''
        title_is_link = False
        if not args.all:
            #e.g. parser.parse('2012-12-22T08:36:46.043-08:00').strftime('%B %d, %Y, %H:%M %p')
            h = ''
            #https://github.com/RSS-Bridge/rss-bridge/commits/master.atom only has 'updated'
            post_date = tt.get('published', tt.get('updated', ''))
            t_date = ''
            try:
                if args.locale:
                    if sys.version_info[0] >= 3:
                        t_date = parse_locale(post_date)
                    else:
                        t_date = parse_locale(post_date).decode('utf-8')
                else:
                    t_date = date_parser.parse(post_date).strftime('%B %d, %Y, %H:%M %p')
            except ValueError:
                #Unknown string format, e.g. https://www.xul.fr/en-xml-rss.html got random date format
                #such as 'Wed, 29 Jul 09 15:56:54 0200'
                t_date = post_date
            for feed_links in tt['links']:
                if feed_links['rel'] == 'alternate':
                    visit_link = feed_links['href']
            title_raw = tt['title'].strip()
            title_pad = title_raw + ' '
            if (not args.pdf) or (not tt['title']):
                #epub got problem copy link from text, so epub always shows link
                tt['title'] = visit_link
                title_is_link = True
            if args.pdf:
                #pdf with img css causes image not appear at all
                img_css_style = ''
            author = tt.get('author_detail', {}).get('name')
            if not author:
                author = tt.get('site_name', '')  #https://blog.google/rss/
            h = ('<div><small>' + author + ' ' + t_date + '<br/><i>' + title_pad
                 + '<a style="text-decoration:none;color:black" href="' + visit_link + '">'
                 + tt['title'] + '</a></i></small><br/><br/></div>' + img_css_style)
            #<hr style="border-top: 1px solid #000000; background: transparent;">
            media_content = ''
            try:
                if 'media_content' in tt:
                    #wordpress/blog.google got list of images with link, e.g. darrentcy.wordpress.com
                    for tm in tt['media_content']:
                        #pitfall: python 3 dict no has_key() attr
                        if ('medium' in tm) and (tm['medium'] == 'image') and 'url' in tm:
                            media_content += '<img src="' + tm['url'] + '" >'
                            #media_content += '<img style="display: block; max-height: 100%; max-width: 100%" src="' + tm['url'] + '" >'
                            #[UPDATE] shouldn't do like that, since thumbnails of feeds normally duplicated with feed without media_content
                            #... which seems act as single thumbnail on webpage scraping metadata usage only.
                            #... and seems like https://gigaom.com/feed/ thumbnail is not showing in webpage.
                #elif 'media_thumbnail' in tt: #https://gigaom.com/feed/ only has thumbnail
                #    for tm in tt['media_thumbnail']:
                #        if 'url' in tm:
                #            media_content += '<img src="' + tm['url'] + '" >'
            except Exception as e:
                print(e)
                print('parse media error')
            #pdfkit need specific charset, epub seems no need
            if args.pdf:
                #just now got 1 post shows blank but got div in feed, then noticed it's white color font, lol
                h = ('<head><meta charset="UTF-8"></head><body><div align="center">' + h
                     + tt['summary'].replace('<div class="separator"',
                                             '<div class="separator" align="center" ')
                     + media_content + '</div></body>')
                #h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt['summary'].replace('<br /><br /><br />', '<br />') + media_content + '</div></body>'
            else:
                #epub can't set body/head
                #h_soup = BeautifulSoup(tt['summary'], "lxml")
                #for pre in h_soup.find_all('pre'):
                #    print("pre: ", pre)
                #h = h + '<div align="center">' + tt['summary'].replace('<div class="separator"', '<div class="separator" align="center" ') + media_content + "</div>"
                #no need do replace anymore since the align center should control by global <div>
                h = (h + '<div align="center">'
                     + tt['summary'].replace('<br /><br /><br />', '<br />')
                     + media_content + "</div>")
                #h = h + '<div align="center">' + tt['summary'] + media_content + "</div>"
                #h = h + tt['summary'] + media_content
            title = tt['title']
            t_url = visit_link
        else:
            field = tt.split("'")
            title = field[1]
            title_raw = title.strip()
            t_url = field[5]
        print('\ntitle: ' + title_raw)
        print('link: ' + t_url)
        if args.pdf:
            print('Download html as PDF, please be patient...' + str(count) + '/' + str(len(t)))
        else:
            print('Download html as EPUB, please be patient...' + str(count) + '/' + str(len(t)))
        if args.pdf:
            if title_is_link:
                #else just leave slash with empty
                title = '/'.join(title.split('/')[-3:])
            if sys.version_info[0] >= 3:
                fname = os.path.join(d_name, slugify(unicode(title)))
            else:
                print(title)
                try:
                    title = title.decode('utf-8')
                except:
                    pass  #print('calm down, is normal decode error')
                title = replacer(title)
                #fname = os.path.join(d_name, slugify(title.decode('utf-8')))
                fname = os.path.join(d_name, slugify(title))
        else:
            #no point do set fname based on title since epub is single file only with multiple chapters
            fname = d_name
        fpath = os.path.join(os.getcwd(), fname)
        if args.pdf:
            check_path = os.path.join(fpath + ext)
        else:
            check_path = fpath[:-1] + ext
        if (not download_once) and os.path.exists(check_path):
            if args.pdf:
                fpath = fpath + '_' + str(int(time.time())) + ext
            else:
                fname = fname[:-1] + ' ' + str(int(time.time()))  #pypub truncated _, so can't use '_'
        else:
            if args.pdf:
                fpath += ext
            else:
                fpath = fpath[:-1] + ext
                fname = fname[:-1]
        if args.pdf:
            print("file path: " + fpath)
            #pdf = weasyprint.HTML(t_url).write_pdf()
            #file( d_name + "/" + slugify(unicode(title)) + ".pdf", 'w' ).write(pdf)
            if args.all:
                try:
                    pdfkit.from_url(t_url, fpath)
                except IOError as ioe:
                    print("pdfkit IOError")
            else:
                try:
                    #https://security.googleblog.com/2013/10/dont-mess-with-my-browser.html site can't open in kchmviewer bcoz of this
                    #, which you direct unzip .EPUB and open that xhtml will got error
                    #-f 'https://security.googleblog.com/feeds/posts/default?start-index=179&max-results=1' direct jump to desired index to test
                    #rf: https://www.w3.org/wiki/Common_HTML_entities_used_for_typography
                    #narrow down OEBPS/toc.nc by removing list of items, then download by index+repack+<open_in_web_browser_OR_kchmviewer> above to know which portion of items trigger the xml error
                    #got case toc.nc itself contains '&' which must replace with '&amp;'
                    h = replacer(h)
                    pdfkit.from_string(h, fpath)
                except IOError as ioe:
                    print('Exception IOError: ' + repr(ioe))
        else:
            if not download_once:
                download_once = True
                print("file path: " + fpath)
                if os.path.exists(fname + temp_dir_ext):
                    #to not blindly replace file
                    print(fname + temp_dir_ext + " already exists, please move/backup that directory to another place manually. Abort")
                    os._exit(1)
                tmp_dir = fname + temp_dir_ext
                my_epub = pypub.Epub(fname, epub_dir=tmp_dir)
                epub_dir = os.path.join(os.getcwd(), tmp_dir)
                print("epub_dir: " + epub_dir)
            if title_raw:
                try:
                    title = title.decode('utf-8')
                except:
                    pass
                try:
                    #fixed -as http://miniechung1998.blogspot.com/2012/12/xd-xd.html
                    title_raw = title_raw.decode('utf-8')
                except:
                    pass
                title_raw = replacer(title_raw).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                #unlike content, title can replace '&'(no space) like that since & may no space
                #, if content do like that will got no image, got visible text ...etc
            if args.all:
                if title_raw:
                    my_chapter = pypub.create_chapter_from_url(title=title_raw, url=t_url)
                else:
                    #no choice like that and better not set with t_url, use other editor if kchmviewer error, should unlikely happen though
                    my_chapter = pypub.create_chapter_from_url(t_url)
                #print(my_chapter.content)
                #my_chapter.content = replacer(my_chapter.content)
                my_chapter.title = replacer(my_chapter.title)
                #sigil viewer will warning and auto convert for you, e.g. /<img> become </>, replace <!DOCTYPE html> to <?xml version="1.0" encoding="utf-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">, Add <title></title> ...etc, this is normal and shouldn't have extra work to do, while kchmviewer able to render it without error.
                #try:
                #    my_chapter.content = my_chapter.content.decode('utf-8')
                #except:
                #    pass #print("decode content err")
                #
                # The correct way to replace, you can't direct `my_chapter.content = 'xxx'` and expect it take effect!
                #my_chapter._content_tree = BeautifulSoup(my_chapter.content, 'html.parser')
                try:
                    my_chapter.title = my_chapter.title.decode('utf-8')
                except:
                    #-a http://cuhkt48.blogspot.com/2016/07/blog-post.html
                    pass  #print("decode title err")
            else:
                #h = replacer(h)
                #'https://www.blogger.com/feeds/1176949257541686127/posts/default?start-index=251&max-results=25' -> https://security.googleblog.com/2009/03/reducing-xss-by-way-of-automatic.html got <prev> and body, so don't blindly unescape all
                #might need filter by pre and allow other to replace, need to test more to know got error or not without replace
                if title_raw:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title=title_raw, url=t_url)
                else:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title='/'.join(title.split('/')[-3:]), url=t_url)
            #print(my_chapter.content)
            #my_chapter = pypub.create_chapter_from_string(r['entries'][0]['summary'].replace('<div class="separator"', '<div class="separator" align="center" '))
            my_epub.add_chapter(my_chapter)
            my_epub.create_epub(os.getcwd())
            rm_tmp_files()
    return url  #return value used for rss feed mode only
def generate_epub(name, category="", start=None, end=None):
    """ Main method. """
    # Collect posts so they can be sorted afterwards
    posts = {}
    for page in range(1, 4):
        sleep_for_a_while()
        r = requests.get(
            'http://highscalability.com/blog/category/%s?currentPage=%s' % (category, page))
        html_doc = r.text
        soup = BeautifulSoup(html_doc, 'html.parser')
        for post in soup.select(".journal-entry"):
            #print(post)
            post_link = post.select(".journal-entry-text h2 a")[0]
            post_date = post.select(".journal-entry-float-date")[0]

            # Collect the HREF
            # Note: the link is useless because the list page contains the full post text.
            href = post_link.attrs['href']
            if not href.startswith("http://highscalability.com"):
                href = "http://highscalability.com%s" % href

            # Collect the title
            title = post_link.get_text()
            if not title:
                print("Failed to find the title: %s" % post)

            # Collect and parse the date
            date_text = post_date.get_text()  # Ex: December 16, 2016
            conv = time.strptime(date_text, "%B %d, %Y")
            date_en = time.strftime("%Y-%m-%d", conv)  # Ex: 2016-12-16
            print(date_en)

            # Filter according to the dates
            if start and date_en < start:
                continue
            if end and date_en >= end:
                continue
            print("Processing post %s (%s)" % (title, date_en))

            # Collect the content.
            # List pages contain only the beginning of the posts,
            # so we need to retrieve each post page to get the full text.
            sleep_for_a_while()
            r = requests.get(href)
            if r.status_code != 200:
                print("Error: Unable to retrieve blog post content: %s" % r.status_code)
                break
            post_doc = r.text
            post_soup = BeautifulSoup(post_doc, 'html.parser')
            content = post_soup.select(".journal-entry-text")[0]
            content_text = u"%s" % (str(content))

            # Posts are traversed in reverse order
            posts[date_en] = {
                "date": date_text,
                "title": title,
                "content": content_text
            }

    # Sort the posts starting from the oldest
    ordered_posts = collections.OrderedDict(sorted(posts.items()))

    # Generate the target file
    epub = pypub.Epub(name)
    print("Creating the epub...")
    for date_en, post in ordered_posts.items():
        print("Adding post %s" % post["title"])
        c = pypub.create_chapter_from_string(post["content"], title=post["title"])
        epub.add_chapter(c)
        sleep_for_a_while()
    print("Ending epub generation")
    epub.create_epub(os.getcwd())
for content in json_format["content"]:
    if results.find('div', class_=content) is not None:
        chapterText = results.find('div', class_=content)
        break
finalChapterText = ""
chapterTitle = chapterTitle.text.strip()
chapterTitle = chapterTitle.replace(u'\xa0', ' ')
print(chapterTitle)
# Keep only paragraphs that do not contain links (navigation, ads, etc.).
for paragraphs in chapterText:
    if paragraphs.find('a') is None:
        finalChapterText += str(paragraphs)
my_chapter = pypub.create_chapter_from_string(finalChapterText, url=None,
                                              title=chapterTitle)
my_epub.add_chapter(my_chapter)

# Find the link to the next chapter
foundNextChapter = False
for a in soup.find_all('a', href=True):
    tag = a.text
    tag = tag.replace(u'\xa0', '')
    tag = tag.replace(' ', '')
    for listitem in json_format["nextchapter"]:
        if tag.startswith(listitem):
            if chapterurl == a['href'].strip():
                chapterurl = input(
                    "The listed next page is the same as the current. "
                    "Please enter the URL for the correct next page: ")