Beispiel #1
0
def download_news(request, url, newsname):
    """Save the page at ``url`` as an EPUB for the current user and record it.

    The EPUB is written under ``download/<username>`` first. If a ``User`` row
    already exists for this user/news pair, an "already saved" message is
    rendered; otherwise the MD5 of the generated EPUB file is stored in a new
    ``User`` row and a success message is rendered.

    :param request: Incoming request; passed to ``get_ACCESS_TOKEN`` to
        obtain the user name and to ``render``.
    :param url: Address of the news page to convert.
    :param newsname: Title of the news item (text or byte string); used as
        the EPUB title and as the file name when locating the result.
    :return: The response produced by rendering ``test6.html``.
    """
    print('进入新闻页下载的处理函数')
    FromUserName = get_ACCESS_TOKEN(request)

    # Only byte strings need decoding; calling .decode() on text either fails
    # outright (Python 3) or round-trips through ASCII (Python 2).
    title = newsname.decode() if isinstance(newsname, bytes) else newsname
    my_first_epub = pypub.Epub(title)
    my_first_chapter = pypub.create_chapter_from_url(url)
    my_first_epub.add_chapter(my_first_chapter)
    my_first_epub.create_epub('download/' + FromUserName)

    # Renamed from `list` so the builtin is not shadowed.
    existing = User.objects.filter(username=FromUserName, newsname=newsname)
    print(existing)
    if len(existing) != 0:
        data = '该新闻已被保存,请选择其它新闻。'
        return render(request, 'test6.html', {'data': data})

    path = 'download' + '/' + FromUserName + '/' + newsname + '.epub'
    print(path)
    # An EPUB is a zip archive: read it as bytes and hash those bytes
    # directly. Text-mode reading plus a latin-1 decode corrupts or rejects
    # binary content, and hashlib only accepts bytes.
    with open(path, 'rb') as f:
        token = hashlib.md5(f.read()).hexdigest()
    User.objects.create(username=FromUserName,
                        newsname=newsname,
                        status=False,
                        md5=token)
    data = '该新闻页保存成功'
    return render(request, 'test6.html', {'data': data})
Beispiel #2
0
def open_file(singlelined=()):
    """Bundle the ``.txt`` files of the working directory into one EPUB.

    Every ``.txt`` file in the current directory except ``patterns.txt``
    becomes a chapter, ordered by ``extract_num``. The book is titled after
    the first file in that order.

    :param singlelined: File names that must be parsed with
        ``singleline=True`` (text without paragraph breaks, where the parser
        looks for end-of-sentence punctuation instead).
    :return: The first file name with ``.epub`` appended.
    """
    del_file()

    # Collect the chapter sources: any .txt file that is not patterns.txt.
    chapter_files = []
    for entry in os.listdir(os.getcwd()):
        if entry.endswith(".txt") and entry != "patterns.txt":
            chapter_files.append(entry)
    chapter_files.sort(key=extract_num)

    book_title = chapter_files[0]
    epub = pypub.Epub(book_title)
    for chapter_file in chapter_files:
        # Files listed in `singlelined` are walls of text with no paragraph
        # breaks, so they get the punctuation-based parsing mode.
        parse_file(chapter_file, singleline=chapter_file in singlelined)
        create_epub_ch(epub, chapter_file)

    epub.create_epub(os.getcwd())
    return chapter_files[0] + ".epub"
Beispiel #3
0
def main(args):
    """Scrape a multi-part story page and write it out as an EPUB.

    The page at ``args.url`` is parsed for a title (first ``<h1>``), an
    author (first ``<h5>``), chapter titles and chapter bodies; one EPUB
    chapter is created per body that has a matching title.

    :param args: Parsed command-line arguments. ``args.url`` is the page to
        scrape, ``args.debug`` enables an on-disk cache of the page named
        after the last URL path segment, and ``args.output`` is the target
        directory (defaults to the current working directory).
    :raises ValueError: If a chapter has an empty title.
    """
    url = args.url
    log.info("Using URL: {}".format(url))

    # In debug mode the raw page is cached in the working directory so that
    # repeated runs do not hit the network. A URL ending in '/' yields an
    # empty name and is therefore never cached.
    cache_name = url.split('/')[-1]
    use_cache = bool(args.debug and cache_name)
    if use_cache and os.path.exists(cache_name):
        with open(cache_name, 'rb') as f:
            html = f.read()
    else:
        html = requests.get(url).content
        if use_cache:
            with open(cache_name, 'wb') as f:
                f.write(html)

    soup = BeautifulSoup(html, 'html5lib')
    title = soup.find('h1').text
    log.info("Title is: {}".format(title))
    # Normalise non-breaking spaces, then drop the first three characters of
    # the author line.
    author = soup.find_all('h5')[0].text.strip().replace(u'\xa0', u' ')[3:]
    log.info("Author is: {}".format(author))
    # The last header div is excluded from the chapter titles.
    chapter_titles = [
        x.text.strip() for x in soup.findAll(
            'div', attrs={"class": "alert alert-info xy_alertheader"})
    ][:-1]
    for chapter_title in chapter_titles:
        log.info("Found Chapter: {}".format(chapter_title))
    log.info("Number of chapters found: {}".format(len(chapter_titles)))
    # Format the text directly: str() of encoded bytes would embed a
    # "b'...'" literal in the book name on Python 3.
    title_string = "{} - by {}".format(title.strip(), author)
    log.info('Book name is: {}'.format(title_string))
    epub = pypub.Epub(title_string, creator=author)

    parts = soup.findAll('div', attrs={"class": "xy_partbg p-4"})
    # zip() stops at the shorter sequence, so bodies without a matching
    # title are skipped.
    for num, (part, chapter_title) in enumerate(zip(parts, chapter_titles)):
        if not chapter_title:
            raise ValueError("Empty title for part {}".format(num + 1))
        # TODO: URL is no longer correct, need to find method to pull from page
        log.info('Adding: {}#Part_{}'.format(url, num))
        body = part
        if num == 0:
            # This is needed to remove the overlay letter from the page.
            # Fall back to the whole part if the overlay div is missing so
            # the chapter does not end up containing the text "None".
            overlay = part.find('div', attrs={'class': 'xy_overlaytext'})
            if overlay is not None:
                body = overlay
        chapter = pypub.create_chapter_from_string(
            "<h1>Part {}</h1>".format(num + 1) + str(body),
            title=chapter_title)
        epub.add_chapter(chapter)

    if args.output:
        output = os.path.expanduser(args.output)
    else:
        output = os.getcwd()
    epub.create_epub(output, epub_name=title_string.replace("  ", " "))
    def collect(self, path: str, no_cache: bool = False) -> str:
        """
        Scrapes a novel to a given output directory.

        Chapters are grouped by translator; when several translators provide
        a chapter with the same name, the one from the translator with the
        most chapters wins. The selected chapters are fetched concurrently
        and written in their original list order.

        :param path: The output directory.
        :param no_cache: Whether to force rescraping of cached chapters.
        :return: The name of the EPUB file.
        """
        epub = pypub.Epub(
            self.metadata["title"],
            creator=", ".join(self.metadata["authors"] +
                              self.metadata["artists"]),
            language=", ".join(self.metadata["languages"]),
            rights=self.metadata["licensed"],
            publisher=", ".join(self.metadata["original_publishers"] +
                                self.metadata["english_publishers"]))

        # Group chapters by translator, remembering each original position.
        translator_dict = defaultdict(list)
        for i, p in enumerate(self.chapter_list):
            translator_dict[p.translator].append((i, p))

        # Walk the groups from largest to smallest and keep the first
        # chapter seen for every chapter name.
        chapter_data = []
        chapters = set()
        for group in sorted(translator_dict.values(), key=len, reverse=True):
            for i, p in group:
                if p.name not in chapters:
                    chapter_data.append((i, p))
                    chapters.add(p.name)
        # Restore the original chapter order.
        chapter_data.sort(key=lambda x: x[0])

        with ThreadPool(THREAD_COUNT) as pool:
            pages = pool.map(lambda x: x[1].get(no_cache=no_cache),
                             chapter_data)

        for c in pages:
            epub.add_chapter(c)

        # TODO: Submit PR to pypub fork and replace this atrocious workaround
        # Replace open function temporarily to affect library behavior
        old_open = open
        utf8_open = partial(old_open, encoding="utf-8")

        def new_open(*args, **kwargs):
            # Binary modes reject an encoding with ValueError; fall back to
            # the untouched builtin in that case.
            try:
                return utf8_open(*args, **kwargs)
            except ValueError:
                return old_open(*args, **kwargs)

        builtins.open = new_open
        try:
            epub.create_epub(path)
        finally:
            # Always restore the builtin, even if EPUB creation fails;
            # otherwise the patched open() leaks into the whole process.
            builtins.open = old_open

        return f"{epub.title}.epub"
Beispiel #5
0
def _create_epub_single(files, output, title):
    """Pack the given files into one EPUB, one chapter per file.

    Each chapter is titled after its file name without the extension.

    :param files: Paths of the chapter source files, in chapter order.
    :param output: Directory passed to ``create_epub`` as the destination.
    :param title: Book title, also used as the EPUB file name.
    """
    import pypub
    # The value returned by now() is stored in the book's "rights" field.
    metadata = {
        'creator': "Anonymous",
        'language': 'cn',
        'rights': now(),
        'publisher': 'Anonymous',
    }
    print('Creating epub "%s" include %s chapters' % (title, len(files)))
    book = pypub.Epub(title, **metadata)
    for chapter_path in files:
        chapter_title, _ext = os.path.splitext(os.path.basename(chapter_path))
        chapter = pypub.create_chapter_from_file(chapter_path, chapter_title)
        book.add_chapter(chapter)
    book.create_epub(output, epub_name=title)
Beispiel #6
0
from bs4 import BeautifulSoup
import os
import pypub

# Module-level book object; setup_toc() and add_chapter_file() both use it.
epub = pypub.Epub('Site Reliability Engineering')


def setup_toc():
    """Build the EPUB from the links listed in ``./html/index.html``.

    Every anchor inside the ``.content`` element of the index page is handed
    to ``add_chapter_file`` together with its link text; the finished book
    is then written to ``./build``.
    """
    # Parse inside a context manager so the index file handle is closed.
    with open('./html/index.html') as index_file:
        soup = BeautifulSoup(index_file, 'html.parser')

    for link in soup.select('.content a '):
        href = link.get('href')
        if href is None:
            # Anchors without a target cannot be mapped to a chapter file.
            continue
        print(href)
        add_chapter_file(href, link.get_text())

    epub.create_epub(os.path.abspath('./build'))


def add_chapter_file(href, title):
    """Add one locally stored HTML page to the module-level ``epub``.

    :param href: Link target from the index page; the ``/sre/book/`` prefix
        is replaced with ``html/`` to locate the file on disk.
    :param title: Chapter title.
    :raises ValueError: If the page has no ``.content`` element.
    """
    file_path = href.replace('/sre/book/', 'html/')

    with open(file_path, 'r') as f:
        contents = f.read()

    chapter_soup = BeautifulSoup(contents, 'html.parser').select_one('.content')
    if chapter_soup is None:
        raise ValueError("No '.content' element found in %s" % file_path)

    # Strip the site prefix from chapter links. Tag attributes are reached
    # with item access; attribute access (link.href) looks up a child tag.
    for link in chapter_soup.find_all('a', href=True):
        link['href'] = link['href'].replace('/sre/book/chapters/', '')

    chapter = pypub.create_chapter_from_string(
        str(chapter_soup), url=None, title=title)
    epub.add_chapter(chapter)
Beispiel #7
0
def main():
    """Command-line entry point: pick a download mode from the parsed flags.

    The URL comes from ``args.url`` or, failing that, an interactive prompt.
    A base name ``d_name`` is derived from the URL's network location via
    ``slugify``. The mode is then chosen in this order:

    * ``args.print_date``: debug run of ``scrape``.
    * ``args.one``: a single PDF, or a single EPUB that is extended
      interactively, one pasted URL at a time.
    * not ``args.all``: RSS feed mode; ``download`` is called repeatedly
      with the URL it returns until that URL is empty.
    * ``args.single``: one ``download`` call in website mode.
    * otherwise: ``scrape`` of the whole site.

    Reads the module-level ``args`` and ``temp_dir_ext`` and rebinds the
    module-level ``epub_dir``.
    """
    global epub_dir
    if args.url:
        url = args.url
    else:
        url = input('URL: ').strip()
    url = process_url(url)
    #if url.endswith('.html'): #no point do like that for -f and it will need .html for -a, so don't do this
    #    url = "/".join(url.split('/')[:-1])
    parsed_uri = urlparse(url)
    netloc = '{uri.netloc}/'.format(uri=parsed_uri)
    d_name = slugify(unicode(netloc))
    if args.pdf:
        # A directory is only created here when PDFs are requested and --one is off.
        if (not args.one) and (not os.path.isdir(d_name)):
            os.makedirs(d_name)
        ext = '.pdf'
    else:
        ext = '.epub'
    if args.print_date:
        print('Debugging\n')
        scrape(url, d_name, ext)
    elif args.one:
        d_name = d_name.strip()
        # NOTE(review): both branches below assign the same value.
        if args.pdf:
            fname = d_name + ext
        else:  #.epub will auto suffix
            fname = d_name + ext
        fpath = os.path.join(os.getcwd(), fname)
        # Avoid overwriting an existing file by appending a Unix timestamp.
        while os.path.exists(fpath):
            fname = d_name + '_' + str(int(time.time())) + ext
            fpath = os.path.join(os.getcwd(), fname)
        try:
            if args.pdf:
                print('Create single pdf: ' + fpath)
                pdfkit.from_url(url, fpath)
            else:
                import_pypub()
                tmp_dir = d_name + temp_dir_ext
                # fname[:-5] drops the 5-character '.epub' suffix again.
                my_epub = pypub.Epub(fname[:-5], epub_dir=tmp_dir)
                print('Create single epub: ' + fpath)
                # Interactive loop: add one chapter per URL and rewrite the
                # EPUB after each one, until the user declines to continue.
                while True:
                    try:
                        print('\n[' + datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + '] Trying url: ' + url)
                        epub_dir = os.path.join(os.getcwd(), tmp_dir)
                        try:
                            my_chapter = pypub.create_chapter_from_url(url)
                            my_epub.add_chapter(my_chapter)
                            my_epub.create_epub(os.getcwd())
                            rm_tmp_files()
                        except ValueError as ve:  #https://pikachu.com is an invalid url or no network connection
                            print(ve)
                        reply = input(
                            '\nPaste next <url> OR type \'n\' to exit: '
                        ).strip()
                        # An empty reply or one starting with 'n'/'N' ends the loop.
                        if (reply and reply[0].lower() != 'n'):
                            url = process_url(reply)
                        else:
                            break
                    # NOTE(review): after an IOError the loop restarts with
                    # the same url and no prompt - confirm this retry is intended.
                    except IOError as ioe:  #should allow next url if requests.get() in pypub's chapter.py timeout
                        print("\nIOError but still allow goto next chapter",
                              ioe)
                    except KeyboardInterrupt:
                        #If you paste all links in once, then this need some time to trigger, but then next url only able to run one url since all the rest url get flush after KeyboardInterrupt, you can just find by url in link page and then copy/paste the remaining urls.
                        reply = input(
                            '\n[' + datetime.datetime.now().strftime(
                                "%Y-%m-%d %H:%M:%S") +
                            '] [r]etry OR [s]kip to next url OR [q]uit ? [r/s/q] '
                        ).strip()  #or ctrl+c again also can exit
                        if reply:
                            if reply == 's':
                                reply = input(
                                    '\nPaste next <url> OR type \'n\' to exit: '
                                ).strip()
                                if (reply and reply[0].lower() != 'n'):
                                    url = process_url(reply)
                                else:
                                    break
                            elif reply == 'q':
                                break
                            #else #continue/retry
        except IOError as ioe:
            print("IOError --one: ", ioe)
    elif not args.all:
        print('Download in rss feed mode')
        if args.feed:
            url = args.feed
        #else: shouldn't do like that, it should depends on later scrape the rss link in webpage, or else https://blog.mozilla.org/security/ not working
        #    url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri) + 'feeds/posts/default?start-index=1&max-results=25'
        # download() returns the next feed URL, or an empty string when done.
        while url:
            url = download(url, url, d_name, ext)
    elif args.single:
        print('Download single year/month in website mode')
        download(url, url, d_name, ext)
    else:
        print('Download all in website mode')
        scrape(url, d_name, ext)
    print("\nDone")
Beispiel #8
0
def download(url, h, d_name, ext):
    """Download every post reachable from ``url`` as PDF files or EPUB chapters.

    Two phases:

    1. Build the list of posts ``t``. In website mode (``args.all``) the
       blog-archive endpoint is queried and its response is split on
       ``'title'``. In feed mode ``feedparser`` parses the feed; if it has no
       entries on the first call, the page itself is fetched and searched for
       an RSS/Atom link, which is then parsed instead. Feed mode also works
       out the URL of the next feed page.
    2. For each post, either write a PDF (``args.pdf``) or add a chapter to
       the shared EPUB and rewrite the EPUB file.

    Reads the module-level ``args``, ``UA`` and ``temp_dir_ext`` and rebinds
    the globals declared at the top of the body.

    :param url: Year/month page URL (website mode) or feed URL (feed mode).
    :param h: In website mode, the value appended as ``path=`` to the archive
        query. In feed mode it is overwritten per post with generated HTML.
    :param d_name: Base name used for the output directory / EPUB name.
    :param ext: Output extension, including the leading dot.
    :return: In feed mode, the URL of the next feed page or ``''`` when there
        is none; in website mode, ``url`` unchanged.
    """
    global download_once
    global init_url_once
    global img_css_style
    global my_epub
    global epub_dir
    if not args.pdf:
        import_pypub()

    #e.g. 'https://diannaoxiaobai.blogspot.com/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=https://diannaoxiaobai.blogspot.com/2018/'
    visit_link = url
    orig_url = url
    # ---- Phase 1: collect the posts into t and find the next page URL ----
    if args.all:
        y_url = url + "/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=" + h
        print("Scraping year... " + y_url)
        try:
            r = urlopen(y_url).read()
        except HTTPError as he:
            print(
                '\nNote that -a -s only allow if url has /year/[month] format, pls check your url\n'
            )
            clean_up()
            os._exit(1)
        if sys.version_info[0] >= 3:
            r = r.decode('utf-8')
        # Each chunk after the first 'title' marker describes one post.
        t = r.split("'title'")
        t = t[1:]
    else:
        url = process_rss_link(url)
        print("Scraping rss feed... " + url)
        r = feedparser.parse(
            url
        )  #, request_headers={'User-Agent': UA, 'Referer': url}) #I noticed https://blog.mozilla.org/security/feed/1 (/1 non exist) is working in feedparser, lolr
        #print(r.headers)
        t = r['entries']
        #if (not t) or ("link" not in r['feed'].keys()): #if got entries then whe need retry ? no need check link
        # First call with an empty feed: fetch the page itself and look for a
        # feed link to retry with.
        if (not init_url_once) and (
                not t
        ):  #'User does not have permission to read this blog.' of rss feed come here
            init_url_once = True
            #parsed_url = urlparse(url)
            #if not '{uri.netloc}'.format(uri=parsed_url).endswith('wordpress.com'):
            try:
                print("Try to scrape rss feed url automatically ... " +
                      orig_url)
                ##r = urlopen(orig_url).read() #https://medium.com/bugbountywriteup got check UA if urllib2 UA then not authorized
                if sys.version_info[0] >= 3:
                    req = urllib.request.Request(orig_url,
                                                 data=None,
                                                 headers={'User-Agent': UA})
                    r = urllib.request.urlopen(req).read()
                else:
                    req = urllib2.Request(orig_url, headers={'User-Agent': UA})
                    r = urllib2.urlopen(req).read()
            except Exception as e:
                print(e)
                print(
                    "Request webpage failed, please check your network OR authorized to access that url."
                )
                clean_up()
                os._exit(
                    1
                )  #don't use sys.exit(-1) if don't want to traceback to main() to print exception
            soup = BeautifulSoup(r, "lxml")
            # Look for a feed link in order of preference: RSS, Atom, then a
            # plain anchor pointing at '/rss/'.
            data = soup.findAll('link', attrs={'type': 'application/rss+xml'})
            if not data:  #https://github.com/RSS-Bridge/rss-bridge/issues/566 only has atom
                data = soup.findAll('link',
                                    attrs={'type': 'application/atom+xml'})
            if not data:
                data = soup.findAll('a',
                                    attrs={'href': '/rss/'
                                           })  #https://blog.google/products/
            if data:
                url = data[0].get("href")
                url = process_rss_link(url)
                if url.startswith(
                        '/'
                ):  #http://sectools.org/tag/sploits/ only has href="/feed/"
                    parsed_orig_uri = urlparse(orig_url)
                    url = '{uri.scheme}://{uri.netloc}'.format(
                        uri=parsed_orig_uri) + url
                print("Scraping rss feed one more time ... " + url)
                r = feedparser.parse(url)
                t = r['entries']
                if not t:
                    t = []
            else:
                t = []
        else:  #unlike blogspot, wordpress always got t, so need set true here
            init_url_once = True
        # Work out the next feed page: WordPress feeds are paged through the
        # 'paged=' query value, other feeds through a rel="next" link.
        parsed_url = urlparse(url)
        is_wordpress = '{uri.netloc}'.format(
            uri=parsed_url).endswith('wordpress.com')
        if not is_wordpress:  #only check next if 1st check is False, or lese 2nd check override 1st result
            try:
                if 'keys' in dir(r):
                    is_wordpress = r.get('feed', {}).get(
                        'generator', '').startswith('https://wordpress.org/')
            except Exception as e:
                print('parse generator error', e)
        if is_wordpress and t:  #increment paged only if current page got entries, i.e. t
            #parsed_keys = urlparse.parse.parse_qs(parsed_url.query) #my python 2 don't have parse_qs
            if 'paged=' in parsed_url.query:
                wp_paged_v = int(
                    parsed_url.query[parsed_url.query.rindex('paged=') +
                                     len('paged='):])
                #uri.path default prefix with '/' if not empty, so don't set '/' after netloc or else keep increase '////...' in each page
                url = '{uri.scheme}://{uri.netloc}{uri.path}?'.format(
                    uri=parsed_url) + parsed_url.query.replace(
                        'paged=' + str(wp_paged_v),
                        'paged=' + str(wp_paged_v + 1))
            else:
                url = ''
                print('no next')
        elif ("keys" in dir(r)) and ('link' in r['feed'].keys()):
            l = r['feed']['links']
            if l:
                got_next = False
                for ll in l:
                    if ll['rel'] == 'next':
                        #if ll['href'] != url: #don't have next link is same case to test
                        url = ll['href']
                        got_next = True
                        break
                if not got_next:
                    url = ''
            else:
                url = ''
        elif not t:  #no need care if next page rss index suddenly change and no content case
            url = ''
            print_rss_err()

    # ---- Phase 2: turn every collected post into a PDF or an EPUB chapter ----
    count = 0
    for tt in t:
        count += 1
        title_raw = ''
        title_is_link = False
        if not args.all:
            # Feed mode: tt is a feed entry; build the chapter HTML (h) from
            # its metadata, summary and attached images.
            #e.g. parser.parse('2012-12-22T08:36:46.043-08:00').strftime('%B %d, %Y, %H:%M %p')
            h = ''
            #https://github.com/RSS-Bridge/rss-bridge/commits/master.atom only has 'updated'
            post_date = tt.get('published', tt.get('updated', ''))
            t_date = ''
            try:
                if args.locale:
                    if sys.version_info[0] >= 3:
                        t_date = parse_locale(post_date)
                    else:
                        t_date = parse_locale(post_date).decode('utf-8')
                else:
                    t_date = date_parser.parse(post_date).strftime(
                        '%B %d, %Y, %H:%M %p')
            except ValueError:  #Unknown string format, e.g. https://www.xul.fr/en-xml-rss.html got random date format such as 'Wed, 29 Jul 09 15:56:54  0200'
                t_date = post_date
            for feed_links in tt['links']:
                if feed_links['rel'] == 'alternate':
                    visit_link = feed_links['href']
            title_raw = tt['title'].strip()
            title_pad = title_raw + ' '
            if (not args.pdf) or (
                    not tt['title']
            ):  #epub got problem copy link from text, so epub always shows link
                tt['title'] = visit_link
                title_is_link = True
            if args.pdf:  #pdf with img css causes image not appear at all
                img_css_style = ''

            author = tt.get('author_detail', {}).get('name')
            if not author:
                author = tt.get('site_name', '')  #https://blog.google/rss/

            h = '<div><small>' + author + ' ' + t_date + '<br/><i>' + title_pad + '<a style="text-decoration:none;color:black" href="' + visit_link + '">' + tt[
                'title'] + '</a></i></small><br/><br/></div>' + img_css_style
            #<hr style="border-top: 1px solid #000000; background: transparent;">

            media_content = ''
            try:
                if 'media_content' in tt:  #wordpress/blog.google got list of images with link, e.g. darrentcy.wordpress.com
                    for tm in tt['media_content']:
                        #pitfall: python 3 dict no has_key() attr
                        if ('medium' in tm) and (tm['medium']
                                                 == 'image') and 'url' in tm:
                            media_content += '<img src="' + tm['url'] + '" >'
                            #media_content += '<img style="display: block; max-height: 100%; max-width: 100%" src="' + tm['url'] + '" >'
                #[UPDATE] shouldn't do like that, since thumbnails of feeds normally duplicated with feed without media_content
                #... which seems act as single thumbnail on webpage scraping metadata usage only.
                #... and seems like https://gigaom.com/feed/ thumbnail is not showing in webpage.
                #elif 'media_thumbnail' in tt: #https://gigaom.com/feed/ only has thumbnail
                #    for tm in tt['media_thumbnail']:
                #        if 'url' in tm:
                #            media_content += '<img src="' + tm['url'] + '" >'
            except Exception as e:
                print(e)
                print('parse media error')

            #pdfkit need specific charset, epub seems no need
            if args.pdf:  #just now got 1 post shows blank but got div in feed, then noticed it's white color font, lol
                h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt[
                    'summary'].replace(
                        '<div class="separator"',
                        '<div class="separator" align="center" '
                    ) + media_content + '</div></body>'
                #h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt['summary'].replace('<br /><br /><br />', '<br />') + media_content + '</div></body>'
            else:  #epub can't set body/head
                #h_soup = BeautifulSoup(tt['summary'], "lxml")
                #for pre in h_soup.find_all('pre'):
                #    print("pre: ", pre)
                #h = h + '<div align="center">' + tt['summary'].replace('<div class="separator"', '<div class="separator" align="center" ') + media_content + "</div>" #no need do replace anymore since the align center should control by global <div>
                h = h + '<div align="center">' + tt['summary'].replace(
                    '<br /><br /><br />', '<br />') + media_content + "</div>"
                #h = h + '<div align="center">' + tt['summary'] + media_content + "</div>"
                #h = h + tt['summary'] + media_content
            title = tt['title']
            t_url = visit_link
        else:
            # Website mode: tt is a text chunk; splitting on single quotes
            # puts the title at index 1 and the post URL at index 5.
            field = tt.split("'")
            title = field[1]
            title_raw = title.strip()
            t_url = field[5]
        print('\ntitle: ' + title_raw)
        print('link: ' + t_url)
        if args.pdf:
            print('Download html as PDF, please be patient...' + str(count) +
                  '/' + str(len(t)))
        else:
            print('Download html as EPUB, please be patient...' + str(count) +
                  '/' + str(len(t)))
        if args.pdf:
            if title_is_link:  #else just leave slash with empty
                title = '/'.join(title.split('/')[-3:])
            if sys.version_info[0] >= 3:
                fname = os.path.join(d_name, slugify(unicode(title)))
            else:
                print(title)
                try:
                    title = title.decode('utf-8')
                except:
                    pass  #print('calm down, is normal decode error')
                title = replacer(title)
                #fname = os.path.join( d_name, slugify(title.decode('utf-8')))
                fname = os.path.join(d_name, slugify(title))
        else:  #no point do set fname based on title since epub is single file only with multiple chapters
            fname = d_name
        fpath = os.path.join(os.getcwd(), fname)
        # NOTE(review): the [:-1] slices in the EPUB branches drop the last
        # character of d_name - presumably a trailing separator left by the
        # caller; confirm against how d_name is built.
        if args.pdf:
            check_path = os.path.join(fpath + ext)
        else:
            check_path = fpath[:-1] + ext
        # Before the first EPUB write, an existing target gets a timestamped
        # name instead of being overwritten.
        if (not download_once) and os.path.exists(check_path):
            if args.pdf:
                fpath = fpath + '_' + str(int(time.time())) + ext
            else:
                fname = fname[:-1] + ' ' + str(int(
                    time.time()))  #pypub truncated _, so can't use '_'
        else:
            if args.pdf:
                fpath += ext
            else:
                fpath = fpath[:-1] + ext
                fname = fname[:-1]
        if args.pdf:
            print("file path: " + fpath)
            #pdf = weasyprint.HTML(t_url).write_pdf()
            #file( d_name + "/" + slugify(unicode(title)) + ".pdf", 'w' ).write(pdf)
            if args.all:
                try:
                    pdfkit.from_url(t_url, fpath)
                except IOError as ioe:
                    print("pdfkit IOError")
            else:
                try:
                    #https://security.googleblog.com/2013/10/dont-mess-with-my-browser.html site can't open in kchmviewer bcoz of this
                    #, which you direct unzip .EPUB and open that xhtml will got error
                    #-f 'https://security.googleblog.com/feeds/posts/default?start-index=179&max-results=1' direct jump to desired index to test
                    #rf: https://www.w3.org/wiki/Common_HTML_entities_used_for_typography
                    #narrow down OEBPS/toc.nc by removing list of items, then download by index+repack+<open_in_web_browser_OR_kchmviewer> above to know which portion of items trigger the xml error #got case toc.nc itself contains '&' which must replace with `&amp;`
                    h = replacer(h)
                    pdfkit.from_string(h, fpath)
                except IOError as ioe:
                    print('Exception IOError: ' + repr(ioe))
        else:
            # The shared EPUB object is created once, on the first post
            # handled in EPUB mode, and reused for all later chapters.
            if not download_once:
                download_once = True
                print("file path: " + fpath)
                if os.path.exists(fname + temp_dir_ext):
                    print(
                        fname + temp_dir_ext +
                        " already exists, please move/backup that direcory to another place manually. Abort"
                    )  #to not blindly replace file
                    os._exit(1)
                tmp_dir = fname + temp_dir_ext
                my_epub = pypub.Epub(fname, epub_dir=tmp_dir)
                epub_dir = os.path.join(os.getcwd(), tmp_dir)
                print("epub_dir: " + epub_dir)
            if title_raw:
                try:
                    title = title.decode('utf-8')
                except:
                    pass
                try:  #fixed -as http://miniechung1998.blogspot.com/2012/12/xd-xd.html
                    title_raw = title_raw.decode('utf-8')
                except:
                    pass
                title_raw = replacer(title_raw).replace('&', '&amp;').replace(
                    '<', '&lt;'
                ).replace(
                    '>', '&gt;'
                )  #unlike content, title can replace '&'(no space) like that since & may no space
                #, if content do like that will got no image, got visible &nbsp; text ...etc
            if args.all:
                if title_raw:
                    my_chapter = pypub.create_chapter_from_url(title=title_raw,
                                                               url=t_url)
                else:  #no choice like that and better not set with t_url, use other editor if kchmviewer error, should unlikely happen though
                    my_chapter = pypub.create_chapter_from_url(t_url)
                #print(my_chapter.content)
                #my_chapter.content = replacer(my_chapter.content)
                my_chapter.title = replacer(my_chapter.title)
                #sigil viewer will warning and auto convert for you, e.g. /<img> become </>, replace <!DOCTYPE html> to <?xml version="1.0" encoding="utf-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">, Add  <title></title> ...etc, this is normal and shouldn't have extra work to do, while kchmviewer able to render it without error.
                #try:
                #    my_chapter.content = my_chapter.content.decode('utf-8')
                #except:
                #    pass #print("decode content err")
                #
                # The correct way to replace, you can't direct `my_chapter.content = 'xxx'` and expect it take effect !
                #my_chapter._content_tree = BeautifulSoup(my_chapter.content, 'html.parser')

                try:
                    my_chapter.title = my_chapter.title.decode('utf-8')
                except:  #-a http://cuhkt48.blogspot.com/2016/07/blog-post.html
                    pass  #print("decode title err")
            else:
                #h = replacer(h) #'https://www.blogger.com/feeds/1176949257541686127/posts/default?start-index=251&max-results=25' -> https://security.googleblog.com/2009/03/reducing-xss-by-way-of-automatic.html got <prev> and body, so don't blindly unescape all #might need filter by pre and allow other to replace, need to test more to know got error or not without replace
                if title_raw:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title=title_raw, url=t_url)
                else:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title='/'.join(title.split('/')[-3:]), url=t_url)
                #print(my_chapter.content)
                #my_chapter = pypub.create_chapter_from_string(r['entries'][0]['summary'].replace('<div class="separator"', '<div class="separator" align="center" '))
            # The EPUB file is rewritten after every chapter is added.
            my_epub.add_chapter(my_chapter)
            my_epub.create_epub(os.getcwd())
            rm_tmp_files()
    return url  #return value used for rss feed mode only
def generate_epub(name, category="", start=None, end=None):
    """
    Main method: build an epub out of highscalability.com blog posts.

    Walks list pages 1 to 3 of the given category, downloads every post
    that passes the date filter and writes the posts, oldest first, into
    an epub created in the current working directory.

    :param name: title handed to ``pypub.Epub``.
    :param category: category slug inserted into the list-page URL.
    :param start: optional inclusive lower bound, compared as a string
        against the post date formatted as ``YYYY-MM-DD``.
    :param end: optional exclusive upper bound, same format as ``start``.
    """

    # Collect the posts to sort them afterwards.
    # A list (not a dict keyed by date) is used on purpose: several posts
    # can share the same publication day and none of them must be lost.
    posts = []

    for page in range(1, 4):

        sleep_for_a_while()
        r = requests.get(
            'http://highscalability.com/blog/category/%s?currentPage=%s' %
            (category, page))
        html_doc = r.text

        soup = BeautifulSoup(html_doc, 'html.parser')
        for post in soup.select(".journal-entry"):

            post_link = post.select(".journal-entry-text h2 a")[0]
            post_date = post.select(".journal-entry-float-date")[0]

            # Collect the HREF and make it absolute
            href = post_link.attrs['href']
            if not href.startswith("http://highscalability.com"):
                href = "http://highscalability.com%s" % href

            # Collect the title
            title = post_link.get_text()

            if not title:
                print("Fail to find the title: %s" % post)

            # Collect and parse the date
            # NOTE(review): the format below only accepts text such as
            # "Dec162016" (abbreviated month, day and year with no
            # separators) - confirm against the live page markup.
            date_text = post_date.get_text()
            conv = time.strptime(date_text, "%b%d%Y")
            date_en = time.strftime("%Y-%m-%d", conv)  # Ex: 2016-12-16
            print(date_en)

            # Filter according to the dates (ISO dates compare correctly
            # as plain strings)
            if start and date_en < start:
                continue
            if end and date_en >= end:
                continue

            print("Processing post %s (%s)" % (title, date_en))

            # Collect the content
            # List pages contain only the beginning of the posts.
            # We need to retrieve each post page to get the full text
            sleep_for_a_while()
            r = requests.get(href)
            if r.status_code != 200:
                print("Error: Unable to retrieve blog post content: %s" %
                      r.status_code)
                # Gives up on the rest of this list page only; the next
                # page is still processed.
                break

            post_doc = r.text
            post_soup = BeautifulSoup(post_doc, 'html.parser')
            content = post_soup.select(".journal-entry-text")[0]

            content_text = u"%s" % (str(content))

            # Remember the discovery order next to the date so that posts
            # of the same day stay distinguishable when sorting.
            posts.append((date_en, len(posts), {
                "date": date_text,
                "title": title,
                "content": content_text
            }))

    # Sort the posts starting from the oldest.
    # Posts are assumed to be traversed newest first (TODO confirm), so
    # within one day the post seen later is treated as the older one.
    ordered_posts = sorted(posts, key=lambda item: (item[0], -item[1]))

    # Generate the target file
    epub = pypub.Epub(name)
    print("Creating the epub...")
    for date_en, _, post in ordered_posts:
        print("Adding post %s" % post["title"])
        c = pypub.create_chapter_from_string(post["content"],
                                             title=post["title"])
        epub.add_chapter(c)
        sleep_for_a_while()
    print("Ending epub generation")
    epub.create_epub(os.getcwd())
Beispiel #10
0
import os
import json

# Load the two JSON documents this script works from.
with open("serialinfo.json") as json_file:
    json_format = json.load(json_file)

with open("webseriallist.json") as json_file:
    json_info = json.load(json_file)

# One epub is prepared per entry of the "webserials" list.
for serials in json_info["webserials"]:
    title = serials["serialtitle"]
    author = serials["serialauthor"]
    chapterurl = serials["serialurl"]

    book_name = title + ' by ' + author
    my_epub = pypub.Epub(book_name)
    print('Downloading ' + book_name)

    # Walk through the chapters of the current web serial.
    # NOTE(review): no exit from this loop is visible here; the snippet
    # appears to be cut off after the exclusion check.
    while True:

        # A chapter is excluded when its URL equals one of the
        # (whitespace-stripped) lines of exclusionlist.txt.
        exclude = False

        with open("exclusionlist.txt", "r") as exclusionlist:
            for line in exclusionlist:
                if chapterurl == line.strip():
                    exclude = True
def main():
    """
    Entry point: pick the start URL and dispatch to one of the download modes.

    The URL comes from ``args.url`` or, when that is empty, from an
    interactive prompt. The output name ``d_name`` is the slugified network
    location of the URL and the extension is ``.pdf`` when ``args.pdf`` is
    set, ``.epub`` otherwise. The branches are tested in this order:

    * ``args.print_date``: debugging run of ``scrape``.
    * ``args.one``: write a single pdf, or a single epub to which the user
      can keep adding chapters by pasting further URLs.
    * ``not args.all``: rss feed mode, calls ``download`` repeatedly while
      it returns a truthy URL.
    * ``args.single`` (only reached when ``args.all`` is truthy): one
      ``download`` call in website mode.
    * otherwise: ``scrape`` everything in website mode.

    Side effect: assigns the module-level ``epub_dir`` while building a
    single epub.
    """
    global epub_dir
    if args.url:
        url = args.url
    else:
        url = input('URL: ').strip()
    url = process_url(url)
    #if url.endswith('.html'): #no point do like that for -f and it will need .html for -a, so don't do this
    #    url = "/".join(url.split('/')[:-1])
    parsed_uri = urlparse(url)
    netloc = '{uri.netloc}/'.format(uri=parsed_uri)
    # Output directory / file stem derived from the host part of the URL.
    d_name = slugify(unicode(netloc))
    if args.pdf:
        # A directory is only created for multi-file pdf output (not with args.one).
        if (not args.one) and (not os.path.isdir(d_name)):
            os.makedirs(d_name)
        ext = '.pdf'
    else:
        ext = '.epub'
    if args.print_date:
        print('Debugging\n')
        scrape(url, d_name, ext)
    elif args.one:
        d_name = d_name.strip()
        # NOTE(review): both branches below build the same file name.
        if args.pdf:
            fname = d_name + ext
        else:  #.epub will auto suffix
            fname = d_name + ext
        fpath = os.path.join(os.getcwd(), fname)
        # Never overwrite an existing file: switch to a timestamp-suffixed name.
        while os.path.exists(fpath):
            fname = d_name + '_' + str(int(time.time())) + ext
            fpath = os.path.join(os.getcwd(), fname)
        try:
            if args.pdf:
                # [further:0] 'https://thehackernews.com/2019/09/phpmyadmin-csrf-exploit.html'
                # ... nid -1 -p, can't simply -1
                print('Create single pdf: ' + fpath)
                # test case(need default 3 seconds): https://www.quora.com/Why-does-the-loopback-interface-on-my-computer-has-65536-as-the-MTU-while-other-interfaces-has-1500-as-the-MTU
                # presumably args.js_delay is in seconds and the option expects milliseconds - TODO confirm
                pdfkit.from_url(
                    url,
                    fpath,
                    options={'--javascript-delay': args.js_delay * 1000})
            else:
                import_pypub()
                tmp_dir = d_name + temp_dir_ext
                # fname[:-5] drops the 5-character '.epub' extension from the title.
                my_epub = pypub.Epub(fname[:-5], epub_dir=tmp_dir)
                print('Create single epub: ' + fpath)
                # One iteration per URL: add it as a chapter, rewrite the epub,
                # then ask the user for the next URL.
                while True:
                    try:
                        print('\n[' + datetime.datetime.now().strftime(
                            "%Y-%m-%d %H:%M:%S") + '] Trying url: ' + url)
                        epub_dir = os.path.join(os.getcwd(), tmp_dir)
                        try:
                            # The string literal below is disabled tracing code kept for debugging; it has no effect.
                            '''
                                import trace
                                #print("sys path: ", sys.prefix, sys.exec_prefix)
                                tracer = trace.Trace(
                                    trace=1,
                                    #ignoredirs=[sys.prefix, sys.exec_prefix] )
                                    ignoredirs=[ '/usr/lib/python3/',  '/usr/lib/python3.6/', '/usr/lib/python3.8/',
                                    '/home/xiaobai/.local/lib/python3.6/site-packages/lxml/', 
                                     ],
                                    ignoremods=[ 'version', 'pyparsing', 'six', '_tokenizer', 'serialize', 'exceptions', 'request'
                                    , '_inputstream', 'etree', 'html5parser', '_structures', 'specifier', 'specifiers', 'serializer'
                                    , '_utils', '_compat'
                                    , '_htmlparser', 'element', 'dammit', 'universaldetector', 'codingstatemachine', 'utf8prober'
                                    , 'enums', 'mbcsgroupprober', 'charsetgroupprober', 'charsetprober', 'latin1prober'
                                    , 'charsetgroupprober', 'sbcharsetprober', 'hebrewprober', 'euctwprober', 'mbcharsetprober'
                                    , 'chardistribution', 'sbcsgroupprober', 'jpcntx', 'sjisprober', 'big5prober', 'cp949prober'
                                    , 'euckrprober', 'gb2312prober', 'eucjpprober', 'timeout', 'pyopenssl', 'SSL', 'poolmanager'
                                    , 'connectionpool', 'response', '_collections', 'core', 'intranges', 'binding', '_oid', 'x509'
                                    , 'decode_asn1', 'utils', 'extensions', 'general_name', 'cookies', 'models', 'structures'
                                    , '_internal_utils', 'sessions', 'adapters', 'hooks', 'retry'
                                    , 'connection', 'api', 'url', 'ssl_'
                                    , 'wait', 'crypto', '_util', 'backend', 'makefile'
                                    ]
                                    #count=1)
                                )
                                '''
                            #my_chapter = tracer.runfunc(pypub.create_chapter_from_url, url)
                            my_chapter = pypub.create_chapter_from_url(url)

                            # Use the html_title so that a title containing "&" is written as "&amp;";
                            # otherwise the epub cannot be opened in kchmviewer.
                            # Test case: https://blog.semmle.com/semmle-discovers-severe-vulnerability-ghostscript-postscript-pdf/
                            my_chapter.title = my_chapter.html_title

                            my_epub.add_chapter(my_chapter)
                            # The epub is rewritten after every added chapter.
                            my_epub.create_epub(os.getcwd())
                            rm_tmp_files()
                        except ValueError as ve:  #https://pikachu.com is an invalid url or no network connection
                            traceback.print_exc()
                            print(ve)
                        try:
                            reply = input(
                                '\nPaste next <url> OR type \'n\' to exit: '
                            ).strip()
                        except EOFError:  #when use -1 and < list_of_lines_file, last line will raise EOFError
                            break
                        # An empty reply or one starting with 'n'/'N' ends the session.
                        if (reply and reply[0].lower() != 'n'):
                            url = process_url(reply)
                        else:
                            break
                    except IOError as ioe:  #should allow next url if requests.get() in pypub's chapter.py timeout
                        # url is unchanged here, so the next iteration retries the same url.
                        print("\nIOError but still allow goto next chapter",
                              ioe)
                    except KeyboardInterrupt:
                        # If all links were pasted at once, Ctrl+C takes a while to trigger and the
                        # remaining pasted urls are flushed afterwards; find the current url in the
                        # link list and copy/paste the remaining urls again.
                        reply = input(
                            '\n[' + datetime.datetime.now().strftime(
                                "%Y-%m-%d %H:%M:%S") +
                            '] [r]etry OR [s]kip to next url OR [q]uit ? [r/s/q] '
                        ).strip()  #or ctrl+c again also can exit
                        if reply:
                            if reply == 's':
                                reply = input(
                                    '\nPaste next <url> OR type \'n\' to exit: '
                                ).strip()
                                if (reply and reply[0].lower() != 'n'):
                                    url = process_url(reply)
                                else:
                                    break
                            elif reply == 'q':
                                break
                            #else #continue/retry
                        # Any other (or empty) reply falls through and retries the current url.
                        #except Exception, ex:
                        #    print('single global ex: ' + ex)
        except IOError as ioe:
            print("IOError --one: ", ioe)
    elif not args.all:
        print('Download in rss feed mode')
        if args.feed:
            url = args.feed
        #else: shouldn't do like that, it should depends on later scrape the rss link in webpage, or else https://blog.mozilla.org/security/ not working
        #    url = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri) + 'feeds/posts/default?start-index=1&max-results=25'
        # Keep calling download() with whatever url it returned until it returns a falsy value.
        while url:
            url = download(url, url, d_name, ext)
    elif args.single:
        print('Download single year/month in website mode')
        download(url, url, d_name, ext)
    else:
        print('Download all in website mode')
        scrape(url, d_name, ext)
    print("\nDone")
Beispiel #12
0
import pypub

# Wikipedia articles bundled into the book, in chapter order.
# The FBReader article is listed twice, so it is both the first and the
# last chapter.
article_urls = (
    'https://en.wikipedia.org/wiki/FBReader',
    'https://en.wikipedia.org/wiki/PocketBook_eReader',
    'https://en.wikipedia.org/wiki/Smashwords',
    'https://en.wikipedia.org/wiki/Raster_graphics',
    'https://en.wikipedia.org/wiki/FBReader',
)

my_first_epub = pypub.Epub('My Second Epub')

# Each article is fetched and appended before the next one is requested.
for article_url in article_urls:
    my_first_epub.add_chapter(pypub.create_chapter_from_url(article_url))

# The https://en.wikipedia.org/wiki/EPUB article is not added (disabled).
my_first_epub.create_epub('D:/')