Example #1
def main(args):
    url = args.url
    log.info("Using URL: {}".format(url))
    response = requests.get(url)
    content = response.content
    if args.debug:
        # Cache the raw page locally so repeated debug runs can skip the download
        cache_file = url.split('/')[-1]
        if not os.path.exists(cache_file):
            with open(cache_file, 'wb') as f:
                f.write(content)
        else:
            with open(cache_file, 'rb') as f:
                content = f.read()
    soup = BeautifulSoup(content, 'html5lib')
    title = soup.find('h1').text
    log.info("Title is: {}".format(title))
    author = soup.find_all('h5')[0].text.strip().replace(u'\xa0', u' ')[3:]
    log.info("Author is: {}".format(author))
    # chapter_titles = soup.findAll('h5', attrs={"class": "modal-title"})[1:]
    chapter_titles = [
        x.text.strip() for x in soup.findAll(
            'div', attrs={"class": "alert alert-info xy_alertheader"})
    ][:-1]
    for e in chapter_titles:
        log.info("Found Chapter: {}".format(str(e.encode('utf-8'))))
    log.info("Number of chapters found: {}".format(len(chapter_titles)))
    title_string = "{} - by {}".format(title.strip(), author)
    log.info('Book name is: {}'.format(title_string))
    epub = pypub.Epub(title_string, creator=author)
    for num, z in enumerate(soup.findAll('div',
                                         attrs={"class": "xy_partbg p-4"}),
                            start=0):
        try:
            assert chapter_titles[num]
            # TODO: URL is no longer correct, need to find method to pull from page
            log.info('Adding: {}#Part_{}'.format(url, num))
            if num == 0:
                # This is needed to remove the overlay letter from the page
                c = pypub.create_chapter_from_string(
                    "<h1>Part {}</h1>".format(num + 1) +
                    str(z.find('div', attrs={'class': 'xy_overlaytext'})),
                    title=chapter_titles[num])
            else:
                c = pypub.create_chapter_from_string(
                    "<h1>Part {}</h1>".format(num + 1) + str(z),
                    title=chapter_titles[num])
            epub.add_chapter(c)
            del c
        except ValueError:
            raise
        except IndexError:
            pass
    output = None
    if args.output:
        output = os.path.expanduser(args.output)
    else:
        output = os.getcwd()
    epub.create_epub(output, epub_name=title_string.replace("  ", " "))
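These examples all follow the same pypub flow: build an Epub, create chapters from HTML strings or URLs, add them, and write the file. A minimal, self-contained sketch of that flow; the book title, chapter HTML, and output directory below are placeholders, not taken from the example above:

import pypub

# Placeholder title/creator; Example #1 derives these from the scraped page.
epub = pypub.Epub('My Book', creator='Unknown Author')
chapter = pypub.create_chapter_from_string('<h1>Part 1</h1><p>Hello.</p>',
                                           title='Part 1')
epub.add_chapter(chapter)
epub.create_epub('.')  # writes the .epub into the current directory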
Example #2
def add_chapter_file(href, title):
    file_path = href.replace('/sre/book/', 'html/')

    with open(file_path, 'r') as f:
        contents = f.read()
        chapter_soup = BeautifulSoup(contents, 'html.parser')
        chapter_html = chapter_soup.select_one('.content').prettify("utf-8")
        chapter = pypub.create_chapter_from_string(chapter_html,
                                                   url=None,
                                                   title=title)
        epub.add_chapter(chapter)
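A hypothetical driver for add_chapter_file above, assuming a module-level pypub.Epub named epub as the snippet does; the book title and the (href, title) pairs are made-up placeholders:

import os
import pypub

epub = pypub.Epub('SRE Book')  # placeholder title
# Hypothetical chapter list; the real one would come from the book's index page.
for href, title in [('/sre/book/chapters/foreword.html', 'Foreword'),
                    ('/sre/book/chapters/preface.html', 'Preface')]:
    add_chapter_file(href, title)
epub.create_epub(os.getcwd())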
Example #3
def add_chapter_file(href, title):
    file_path = href.replace('/sre/book/', 'html/')

    with open(file_path, 'r') as f:
        contents = f.read()
        chapter_soup = BeautifulSoup(contents, 'html.parser')
        chapter_soup = chapter_soup.select_one('.content')
        links = chapter_soup.select('a')  # BeautifulSoup has no select_all()
        for link in links:
            link['href'] = link['href'].replace('/sre/book/chapters/', '')
        chapter_html = chapter_soup.prettify()
        chapter = pypub.create_chapter_from_string(
            chapter_html, url=None, title=title)
        epub.add_chapter(chapter)
Example #4
    def _scrape(self) -> None:
        """
        Downloads the page data and scrapes text.
        :return: None.
        """
        soup = Scraper().scrape(self.link)

        for i in soup.find_all("img"):
            i.decompose()
        for s in soup.find_all("script"):
            s.decompose()

        # TODO: use some subset of these tags for cleaning up end matter
        # for exclude in ["comment", "button", "tag",
        #                 "related", "share", "footer"]:
        #     for section in soup.find_all(id=re.compile(f".*{exclude}.*")):
        #         section.decompose()
        #     for section in soup.find_all(class_=re.compile(f".*{exclude}.*")):
        #         section.decompose()

        data = str(soup)  # soup already stringifies to its markup; decode() takes no codec args
        self.contents = pypub.create_chapter_from_string(data, title=self.name)
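The Scraper class used above is outside this snippet. A rough standalone sketch of the same clean-then-convert pattern, fetching with requests instead; the URL and chapter title are placeholders:

import requests
import pypub
from bs4 import BeautifulSoup

html = requests.get('https://example.com/post').text  # placeholder URL
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all(['img', 'script']):
    tag.decompose()  # strip images and scripts before building the chapter
chapter = pypub.create_chapter_from_string(str(soup), title='Example post')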
Example #5
def create_epub_ch(epub, file):
    #pypub only accepts html; collect the .txt file text and wrap it in html
    message = """<html>
        <head></head>
        <body>%s</body>
        </html>"""
    added_text = []
    #read from the cleaned text files (those ending in "_fixed.txt")
    with open(os.path.join("output", file + "_fixed.txt"),
              "r",
              encoding='utf-8') as f:
        for line in f.read().split('\n'):
            if line:
                text_adding = "<p>" + line + "</p>"
                added_text.append(text_adding)

    message = message % "".join(added_text)
    #create the chapters
    chapter = pypub.create_chapter_from_string(message,
                                               url=None,
                                               title=str("%g" %
                                                         extract_num(file)))
    epub.add_chapter(chapter)
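A hypothetical way to drive create_epub_ch; the book title and chapter basenames are placeholders, and extract_num() is assumed to be defined elsewhere in the same script:

import os
import pypub

epub = pypub.Epub('Collected Chapters')  # placeholder title
for name in ['chapter_1', 'chapter_2']:  # expects output/<name>_fixed.txt to exist
    create_epub_ch(epub, name)
epub.create_epub(os.getcwd())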
Example #6
def download(url, h, d_name, ext):
    global download_once
    global init_url_once
    global img_css_style
    global my_epub
    global epub_dir
    if not args.pdf:
        import_pypub()

    #e.g. 'https://diannaoxiaobai.blogspot.com/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=https://diannaoxiaobai.blogspot.com/2018/'
    visit_link = url
    orig_url = url
    if args.all:
        y_url = url + "/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=" + h
        print("Scraping year... " + y_url)
        try:
            r = urlopen(y_url).read()
        except HTTPError:
            print(
                '\nNote: -a and -s only work if the url has a /year/[month] format, please check your url\n'
            )
            clean_up()
            os._exit(1)
        if sys.version_info[0] >= 3:
            r = r.decode('utf-8')
        t = r.split("'title'")
        t = t[1:]
    else:
        url = process_rss_link(url)
        print("Scraping rss feed... " + url)
        r = feedparser.parse(
            url
        )  #, request_headers={'User-Agent': UA, 'Referer': url}) #noticed https://blog.mozilla.org/security/feed/1 (the /1 does not exist) still works in feedparser
        #print(r.headers)
        t = r['entries']
        #if (not t) or ("link" not in r['feed'].keys()): #if we already got entries there is no need to retry, and no need to check the link
        if (not init_url_once) and (
                not t
        ):  #rss feeds that return 'User does not have permission to read this blog.' end up here
            init_url_once = True
            #parsed_url = urlparse(url)
            #if not '{uri.netloc}'.format(uri=parsed_url).endswith('wordpress.com'):
            try:
                print("Try to scrape rss feed url automatically ... " +
                      orig_url)
                ##r = urlopen(orig_url).read() #https://medium.com/bugbountywriteup checks the UA; the default urllib2 UA is rejected as not authorized
                if sys.version_info[0] >= 3:
                    req = urllib.request.Request(orig_url,
                                                 data=None,
                                                 headers={'User-Agent': UA})
                    r = urllib.request.urlopen(req).read()
                else:
                    req = urllib2.Request(orig_url, headers={'User-Agent': UA})
                    r = urllib2.urlopen(req).read()
            except Exception as e:
                print(e)
                print(
                    "Requesting the webpage failed; please check your network or whether you are authorized to access that url."
                )
                clean_up()
                os._exit(
                    1
                )  #don't use sys.exit(-1) unless you want the traceback to propagate up to main() and print the exception
            soup = BeautifulSoup(r, "lxml")
            data = soup.findAll('link', attrs={'type': 'application/rss+xml'})
            if not data:  #https://github.com/RSS-Bridge/rss-bridge/issues/566 only has atom
                data = soup.findAll('link',
                                    attrs={'type': 'application/atom+xml'})
            if not data:
                data = soup.findAll('a',
                                    attrs={'href': '/rss/'
                                           })  #https://blog.google/products/
            if data:
                url = data[0].get("href")
                url = process_rss_link(url)
                if url.startswith(
                        '/'
                ):  #http://sectools.org/tag/sploits/ only has href="/feed/"
                    parsed_orig_uri = urlparse(orig_url)
                    url = '{uri.scheme}://{uri.netloc}'.format(
                        uri=parsed_orig_uri) + url
                print("Scraping rss feed one more time ... " + url)
                r = feedparser.parse(url)
                t = r['entries']
                if not t:
                    t = []
            else:
                t = []
        else:  #unlike blogspot, wordpress always returns entries (t), so set the flag to True here
            init_url_once = True
        parsed_url = urlparse(url)
        is_wordpress = '{uri.netloc}'.format(
            uri=parsed_url).endswith('wordpress.com')
        if not is_wordpress:  #only run the 2nd check if the 1st check is False, or else the 2nd check would override the 1st result
            try:
                if 'keys' in dir(r):
                    is_wordpress = r.get('feed', {}).get(
                        'generator', '').startswith('https://wordpress.org/')
            except Exception as e:
                print('parse generator error', e)
        if is_wordpress and t:  #increment 'paged' only if the current page has entries, i.e. t
            #parsed_keys = urlparse.parse.parse_qs(parsed_url.query) #my python 2 doesn't have parse_qs
            if 'paged=' in parsed_url.query:
                wp_paged_v = int(
                    parsed_url.query[parsed_url.query.rindex('paged=') +
                                     len('paged='):])
                #uri.path is already prefixed with '/' when non-empty, so don't add '/' after netloc or else '////...' keeps growing on each page
                url = '{uri.scheme}://{uri.netloc}{uri.path}?'.format(
                    uri=parsed_url) + parsed_url.query.replace(
                        'paged=' + str(wp_paged_v),
                        'paged=' + str(wp_paged_v + 1))
            else:
                url = ''
                print('no next')
        elif ("keys" in dir(r)) and ('link' in r['feed'].keys()):
            l = r['feed']['links']
            if l:
                got_next = False
                for ll in l:
                    if ll['rel'] == 'next':
                        #if ll['href'] != url: #no test case yet where the next link is the same as the current url
                        url = ll['href']
                        got_next = True
                        break
                if not got_next:
                    url = ''
            else:
                url = ''
        elif not t:  #no need to care about the case where the next page's rss index suddenly changes and has no content
            url = ''
            print_rss_err()

    count = 0
    for tt in t:
        count += 1
        title_raw = ''
        title_is_link = False
        if not args.all:
            #e.g. parser.parse('2012-12-22T08:36:46.043-08:00').strftime('%B %d, %Y, %H:%M %p')
            h = ''
            #https://github.com/RSS-Bridge/rss-bridge/commits/master.atom only has 'updated'
            post_date = tt.get('published', tt.get('updated', ''))
            t_date = ''
            try:
                if args.locale:
                    if sys.version_info[0] >= 3:
                        t_date = parse_locale(post_date)
                    else:
                        t_date = parse_locale(post_date).decode('utf-8')
                else:
                    t_date = date_parser.parse(post_date).strftime(
                        '%B %d, %Y, %H:%M %p')
            except ValueError:  #Unknown string format, e.g. https://www.xul.fr/en-xml-rss.html uses odd date formats such as 'Wed, 29 Jul 09 15:56:54  0200'
                t_date = post_date
            for feed_links in tt['links']:
                if feed_links['rel'] == 'alternate':
                    visit_link = feed_links['href']
            title_raw = tt['title'].strip()
            title_pad = title_raw + ' '
            if (not args.pdf) or (
                    not tt['title']
            ):  #epub has problems copying a link from text, so epub always shows the link
                tt['title'] = visit_link
                title_is_link = True
            if args.pdf:  #pdf with the img css causes images to not appear at all
                img_css_style = ''

            author = tt.get('author_detail', {}).get('name')
            if not author:
                author = tt.get('site_name', '')  #https://blog.google/rss/

            h = '<div><small>' + author + ' ' + t_date + '<br/><i>' + title_pad + '<a style="text-decoration:none;color:black" href="' + visit_link + '">' + tt[
                'title'] + '</a></i></small><br/><br/></div>' + img_css_style
            #<hr style="border-top: 1px solid #000000; background: transparent;">

            media_content = ''
            try:
                if 'media_content' in tt:  #wordpress/blog.google provide a list of images with links, e.g. darrentcy.wordpress.com
                    for tm in tt['media_content']:
                        #pitfall: python 3 dicts have no has_key() method
                        if ('medium' in tm) and (tm['medium']
                                                 == 'image') and 'url' in tm:
                            media_content += '<img src="' + tm['url'] + '" >'
                            #media_content += '<img style="display: block; max-height: 100%; max-width: 100%" src="' + tm['url'] + '" >'
                #[UPDATE] shouldn't do it like that, since feed thumbnails are normally duplicates of feeds without media_content
                #... which seem to act only as a single thumbnail for webpage scraping / metadata usage.
                #... and the https://gigaom.com/feed/ thumbnail doesn't seem to show on the webpage anyway.
                #elif 'media_thumbnail' in tt: #https://gigaom.com/feed/ only has thumbnail
                #    for tm in tt['media_thumbnail']:
                #        if 'url' in tm:
                #            media_content += '<img src="' + tm['url'] + '" >'
            except Exception as e:
                print(e)
                print('parse media error')

            #pdfkit needs an explicit charset, epub apparently doesn't
            if args.pdf:  #one post rendered blank even though its div was in the feed; turned out the font color was white
                h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt[
                    'summary'].replace(
                        '<div class="separator"',
                        '<div class="separator" align="center" '
                    ) + media_content + '</div></body>'
                #h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt['summary'].replace('<br /><br /><br />', '<br />') + media_content + '</div></body>'
            else:  #epub can't set body/head
                #h_soup = BeautifulSoup(tt['summary'], "lxml")
                #for pre in h_soup.find_all('pre'):
                #    print("pre: ", pre)
                #h = h + '<div align="center">' + tt['summary'].replace('<div class="separator"', '<div class="separator" align="center" ') + media_content + "</div>" #no need to do the replace anymore since center alignment is controlled by the global <div>
                h = h + '<div align="center">' + tt['summary'].replace(
                    '<br /><br /><br />', '<br />') + media_content + "</div>"
                #h = h + '<div align="center">' + tt['summary'] + media_content + "</div>"
                #h = h + tt['summary'] + media_content
            title = tt['title']
            t_url = visit_link
        else:
            field = tt.split("'")
            title = field[1]
            title_raw = title.strip()
            t_url = field[5]
        print('\ntitle: ' + title_raw)
        print('link: ' + t_url)
        if args.pdf:
            print('Download html as PDF, please be patient...' + str(count) +
                  '/' + str(len(t)))
        else:
            print('Download html as EPUB, please be patient...' + str(count) +
                  '/' + str(len(t)))
        if args.pdf:
            if title_is_link:  #else just leave slash with empty
                title = '/'.join(title.split('/')[-3:])
            if sys.version_info[0] >= 3:
                fname = os.path.join(d_name, slugify(str(title)))
            else:
                print(title)
                try:
                    title = title.decode('utf-8')
                except:
                    pass  #print('calm down, is normal decode error')
                title = replacer(title)
                #fname = os.path.join( d_name, slugify(title.decode('utf-8')))
                fname = os.path.join(d_name, slugify(title))
        else:  #no point setting fname based on the title, since the epub is a single file containing multiple chapters
            fname = d_name
        fpath = os.path.join(os.getcwd(), fname)
        if args.pdf:
            check_path = os.path.join(fpath + ext)
        else:
            check_path = fpath[:-1] + ext
        if (not download_once) and os.path.exists(check_path):
            if args.pdf:
                fpath = fpath + '_' + str(int(time.time())) + ext
            else:
                fname = fname[:-1] + ' ' + str(int(
                    time.time()))  #pypub truncates '_', so we can't use '_'
        else:
            if args.pdf:
                fpath += ext
            else:
                fpath = fpath[:-1] + ext
                fname = fname[:-1]
        if args.pdf:
            print("file path: " + fpath)
            #pdf = weasyprint.HTML(t_url).write_pdf()
            #file( d_name + "/" + slugify(unicode(title)) + ".pdf", 'w' ).write(pdf)
            if args.all:
                try:
                    pdfkit.from_url(t_url, fpath)
                except IOError as ioe:
                    print("pdfkit IOError")
            else:
                try:
                    #https://security.googleblog.com/2013/10/dont-mess-with-my-browser.html can't be opened in kchmviewer because of this
                    #, and if you unzip the .EPUB directly and open that xhtml you will get an error
                    #-f 'https://security.googleblog.com/feeds/posts/default?start-index=179&max-results=1' jumps directly to the desired index for testing
                    #ref: https://www.w3.org/wiki/Common_HTML_entities_used_for_typography
                    #narrow down OEBPS/toc.ncx by removing items from its list, then download by index + repack + <open_in_web_browser_OR_kchmviewer> as above to find which items trigger the xml error #there was a case where toc.ncx itself contained '&', which must be replaced with '&amp;'
                    h = replacer(h)
                    pdfkit.from_string(h, fpath)
                except IOError as ioe:
                    print('Exception IOError: ' + repr(ioe))
        else:
            if not download_once:
                download_once = True
                print("file path: " + fpath)
                if os.path.exists(fname + temp_dir_ext):
                    print(
                        fname + temp_dir_ext +
                        " already exists, please move/backup that directory to another place manually. Aborting."
                    )  #so we don't blindly replace the file
                    os._exit(1)
                tmp_dir = fname + temp_dir_ext
                my_epub = pypub.Epub(fname, epub_dir=tmp_dir)
                epub_dir = os.path.join(os.getcwd(), tmp_dir)
                print("epub_dir: " + epub_dir)
            if title_raw:
                try:
                    title = title.decode('utf-8')
                except:
                    pass
                try:  #fixed -as http://miniechung1998.blogspot.com/2012/12/xd-xd.html
                    title_raw = title_raw.decode('utf-8')
                except:
                    pass
                title_raw = replacer(title_raw).replace('&', '&amp;').replace(
                    '<', '&lt;'
                ).replace(
                    '>', '&gt;'
                )  #unlike the content, the title can have '&' replaced directly like this, since '&' may appear without surrounding spaces
                #, whereas doing that to the content results in missing images, visible &nbsp; text, etc.
            if args.all:
                if title_raw:
                    my_chapter = pypub.create_chapter_from_url(title=title_raw,
                                                               url=t_url)
                else:  #no better option here, and better not to use t_url as the title; use another editor if kchmviewer errors, though that should be unlikely
                    my_chapter = pypub.create_chapter_from_url(t_url)
                #print(my_chapter.content)
                #my_chapter.content = replacer(my_chapter.content)
                my_chapter.title = replacer(my_chapter.title)
                #the sigil viewer will warn and auto-convert for you, e.g. /<img> becomes </>, <!DOCTYPE html> is replaced with <?xml version="1.0" encoding="utf-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">, <title></title> is added, etc.; this is normal and needs no extra work, and kchmviewer is able to render it without error.
                #try:
                #    my_chapter.content = my_chapter.content.decode('utf-8')
                #except:
                #    pass #print("decode content err")
                #
                # The correct way to replace content: you can't simply do `my_chapter.content = 'xxx'` and expect it to take effect!
                #my_chapter._content_tree = BeautifulSoup(my_chapter.content, 'html.parser')

                try:
                    my_chapter.title = my_chapter.title.decode('utf-8')
                except:  #-a http://cuhkt48.blogspot.com/2016/07/blog-post.html
                    pass  #print("decode title err")
            else:
                #h = replacer(h) #'https://www.blogger.com/feeds/1176949257541686127/posts/default?start-index=251&max-results=25' -> https://security.googleblog.com/2009/03/reducing-xss-by-way-of-automatic.html has <pre> and body, so don't blindly unescape everything #might need to filter by pre and allow the rest to be replaced; needs more testing to know whether errors occur without the replace
                if title_raw:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title=title_raw, url=t_url)
                else:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title='/'.join(title.split('/')[-3:]), url=t_url)
                #print(my_chapter.content)
                #my_chapter = pypub.create_chapter_from_string(r['entries'][0]['summary'].replace('<div class="separator"', '<div class="separator" align="center" '))
            my_epub.add_chapter(my_chapter)
            my_epub.create_epub(os.getcwd())
            rm_tmp_files()
    return url  #return value used for rss feed mode only
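The args.all branch above is the only place these examples use pypub.create_chapter_from_url, which downloads and converts the page itself. Stripped of the surrounding state, that call looks roughly like this; the URL and titles are placeholders:

import os
import pypub

my_epub = pypub.Epub('Blog archive')  # placeholder book title
my_chapter = pypub.create_chapter_from_url(
    url='https://example.com/2018/01/some-post.html',  # placeholder URL
    title='Some post')
my_epub.add_chapter(my_chapter)
my_epub.create_epub(os.getcwd())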
Example #7
def generate_epub(name, category="", start=None, end=None):
    """
    Main method.
    """

    # Collect post to sort them after
    posts = {}

    for page in range(1, 4):

        sleep_for_a_while()
        r = requests.get(
            'http://highscalability.com/blog/category/%s?currentPage=%s' %
            (category, page))
        html_doc = r.text

        soup = BeautifulSoup(html_doc, 'html.parser')
        for post in soup.select(".journal-entry"):
            #print(post)

            post_link = post.select(".journal-entry-text h2 a")[0]
            post_date = post.select(".journal-entry-float-date")[0]

            # Collect the HREF
            # Note: the link is useless because the list page contains the full post text.
            href = post_link.attrs['href']
            if not href.startswith("http://highscalability.com"):
                href = "http://highscalability.com%s" % href

            # Collect the title
            title = post_link.get_text()

            if not title:
                print("Failed to find the title: %s" % post)

            # Collect and parse the data
            date_text = post_date.get_text()  # Ex: December 16, 2016
            conv = time.strptime(date_text, "%b%d%Y")
            date_en = time.strftime("%Y-%m-%d", conv)  # Ex: 2016-12-16
            print(date_en)

            # Filter according the dates
            if start and date_en < start:
                continue
            if end and date_en >= end:
                continue

            print("Processing post %s (%s)" % (title, date_en))

            # Collect the content
            # List pages contain only the beginning of the posts.
            # We need to retrieve each post page to get the full text
            sleep_for_a_while()
            r = requests.get(href)
            if r.status_code != 200:
                print("Error: Unable to retrieve blog post content: %s" %
                      r.status_code)
                break

            post_doc = r.text
            post_soup = BeautifulSoup(post_doc, 'html.parser')
            content = post_soup.select(".journal-entry-text")[0]

            content_text = str(content)

            # Posts are traversed in reverse order
            posts[date_en] = {
                "date": date_text,
                "title": title,
                "content": content_text
            }

    # Sort the posts starting from the oldest
    ordered_posts = collections.OrderedDict(sorted(posts.items()))

    # Generate the target file
    epub = pypub.Epub(name)
    print("Creating the epub...")
    for date_en, post in ordered_posts.items():  # iteritems() does not exist in Python 3
        print("Adding post %s" % post["title"])
        c = pypub.create_chapter_from_string(post["content"],
                                             title=post["title"])
        epub.add_chapter(c)
        sleep_for_a_while()
    print("Ending epub generation")
    epub.create_epub(os.getcwd())
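Keying the posts dict by the reformatted date only sorts chronologically because the dates are rewritten as YYYY-MM-DD, so lexicographic order matches date order; a tiny illustration with placeholder data:

import collections

posts = {'2016-12-16': 'newer post', '2016-01-04': 'older post'}  # placeholder data
for date_en, title in collections.OrderedDict(sorted(posts.items())).items():
    print(date_en, title)  # prints the older post first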
Example #8
            for content in json_format["content"]:
                if results.find('div', class_=content) is not None:
                    chapterText = results.find('div', class_=content)
                    break

            finalChapterText = ""
            chapterTitle = chapterTitle.text.strip()
            chapterTitle = chapterTitle.replace('  ', ' ')
            print(chapterTitle)

            # Keep only paragraphs that do not contain links
            for paragraphs in chapterText:
                if paragraphs.find('a') is None:
                    finalChapterText += str(paragraphs)

            my_chapter = pypub.create_chapter_from_string(finalChapterText,
                                                          url=None,
                                                          title=chapterTitle)
            my_epub.add_chapter(my_chapter)

        # Find the link to the next chapter
        foundNextChapter = False
        for a in soup.find_all('a', href=True):
            tag = a.text
            tag = tag.replace(u'\xa0', '')
            tag = tag.replace(' ', '')
            for listitem in json_format["nextchapter"]:
                if tag.startswith(listitem):
                    if chapterurl == a['href'].strip():
                        chapterurl = input(
                            "The listed next page is the same as the current. Please enter the URL for the correct next page: "
                        )
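The paragraph loop near the top of this snippet keeps only children that contain no links; a standalone version of that idea with placeholder HTML:

from bs4 import BeautifulSoup

chapterText = BeautifulSoup(
    '<div><p>Keep me.</p><p>Skip <a href="#">this</a>.</p></div>',
    'html.parser').div
finalChapterText = ""
for paragraph in chapterText.find_all('p'):
    if paragraph.find('a') is None:  # skip paragraphs that contain a link
        finalChapterText += str(paragraph)
print(finalChapterText)  # -> <p>Keep me.</p>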