Exemple #1
0
def download_news(request, url, newsname):
    """Save the page at ``url`` as an EPUB and record it for the current user.

    The user name is obtained from ``get_ACCESS_TOKEN(request)``.  The page is
    turned into a one-chapter EPUB titled ``newsname`` and written below
    ``download/<user name>``.  If a ``User`` row with the same user name and
    news name already exists, nothing is recorded; otherwise a row is created
    whose ``md5`` field is the MD5 hex digest of the generated file.

    Args:
        request: the incoming request; passed to ``get_ACCESS_TOKEN`` and
            ``render``.
        url: address of the news page to convert.
        newsname: title of the news item (text or bytes); used as the EPUB
            title, as part of the file path and as the ``newsname`` column.

    Returns:
        The result of rendering ``test6.html`` with a ``data`` status message.
    """
    print('进入新闻页下载的处理函数')
    FromUserName = get_ACCESS_TOKEN(request)

    # The EPUB title must be text.  Only decode real byte strings so that a
    # Python 3 ``str`` (which has no ``.decode``) is accepted as well.
    if isinstance(newsname, bytes):
        epub_title = newsname.decode()
    else:
        epub_title = newsname

    my_first_epub = pypub.Epub(epub_title)
    my_first_chapter = pypub.create_chapter_from_url(url)
    my_first_epub.add_chapter(my_first_chapter)
    # NOTE(review): the EPUB is (re)generated even when the news item turns
    # out to be a duplicate below -- kept as in the original; confirm whether
    # the duplicate check should come first.
    my_first_epub.create_epub('download/' + FromUserName)

    saved = User.objects.filter(username=FromUserName, newsname=newsname)
    print(saved)
    if saved:
        data = '该新闻已被保存,请选择其它新闻。'
        return render(request, 'test6.html', {'data': data})

    path = 'download' + '/' + FromUserName + '/' + epub_title + '.epub'
    print(path)
    # An EPUB is a zip archive: read it as bytes and hash those bytes
    # directly.  Text-mode reading plus a latin-1 round trip either fails or
    # corrupts the data before hashing.
    with open(path, 'rb') as f:
        token = hashlib.md5(f.read()).hexdigest()
    User.objects.create(username=FromUserName,
                        newsname=newsname,
                        status=False,
                        md5=token)
    data = '该新闻页保存成功'
    return render(request, 'test6.html', {'data': data})
Exemple #2
0
def main():
    """Entry point: choose a download mode from ``args`` and run it.

    Modes are checked in this order: ``print_date`` (debug scrape), ``one``
    (a single pdf, or one epub fed interactively with urls), rss feed mode
    (``all`` not set), ``single`` (one year/month) and finally a scrape of
    the whole website.  The start url comes from ``args.url`` or, when that
    is empty, from an interactive prompt.
    """
    global epub_dir

    def timestamp():
        # Current local time, as shown in the progress/prompt lines.
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def prompt_next():
        # Ask for the next url of the single-epub session.
        return input('\nPaste next <url> OR type \'n\' to exit: ').strip()

    def is_url_reply(text):
        # An empty reply or one starting with n/N means "stop".
        return bool(text) and text[0].lower() != 'n'

    url = process_url(args.url if args.url else input('URL: ').strip())
    # Stripping a trailing '.html' was tried and dropped: it is pointless for
    # -f and -a needs the '.html'.
    parsed_uri = urlparse(url)
    d_name = slugify(unicode('{uri.netloc}/'.format(uri=parsed_uri)))

    ext = '.pdf' if args.pdf else '.epub'
    if args.pdf and not args.one and not os.path.isdir(d_name):
        os.makedirs(d_name)

    if args.print_date:
        print('Debugging\n')
        scrape(url, d_name, ext)
    elif args.one:
        d_name = d_name.strip()
        fname = d_name + ext
        fpath = os.path.join(os.getcwd(), fname)
        # Never overwrite an existing file: append a timestamp instead.
        while os.path.exists(fpath):
            fname = d_name + '_' + str(int(time.time())) + ext
            fpath = os.path.join(os.getcwd(), fname)
        try:
            if args.pdf:
                print('Create single pdf: ' + fpath)
                pdfkit.from_url(url, fpath)
            else:
                import_pypub()
                tmp_dir = d_name + temp_dir_ext
                # fname[:-5] drops the '.epub' suffix appended above.
                my_epub = pypub.Epub(fname[:-5], epub_dir=tmp_dir)
                print('Create single epub: ' + fpath)
                while True:
                    try:
                        print('\n[' + timestamp() + '] Trying url: ' + url)
                        epub_dir = os.path.join(os.getcwd(), tmp_dir)
                        try:
                            my_epub.add_chapter(
                                pypub.create_chapter_from_url(url))
                            my_epub.create_epub(os.getcwd())
                            rm_tmp_files()
                        except ValueError as ve:
                            # e.g. https://pikachu.com is an invalid url, or
                            # there is no network connection.
                            print(ve)
                        reply = prompt_next()
                        if not is_url_reply(reply):
                            break
                        url = process_url(reply)
                    except IOError as ioe:
                        # e.g. requests.get() timing out inside pypub; the
                        # loop goes round again instead of aborting.
                        print("\nIOError but still allow goto next chapter",
                              ioe)
                    except KeyboardInterrupt:
                        # When many links were pasted at once the interrupt
                        # takes a while to trigger and the pending urls are
                        # flushed; the remaining ones must be pasted again.
                        # Pressing ctrl+c again at this prompt also exits.
                        reply = input(
                            '\n[' + timestamp() +
                            '] [r]etry OR [s]kip to next url OR [q]uit ? [r/s/q] '
                        ).strip()
                        if reply == 'q':
                            break
                        if reply == 's':
                            reply = prompt_next()
                            if not is_url_reply(reply):
                                break
                            url = process_url(reply)
                        # any other reply: retry the same url
        except IOError as ioe:
            print("IOError --one: ", ioe)
    elif not args.all:
        print('Download in rss feed mode')
        if args.feed:
            url = args.feed
        # No default feed url is built here: the feed link is discovered
        # later from the web page (needed for https://blog.mozilla.org/security/).
        while url:
            url = download(url, url, d_name, ext)
    elif args.single:
        print('Download single year/month in website mode')
        download(url, url, d_name, ext)
    else:
        print('Download all in website mode')
        scrape(url, d_name, ext)
    print("\nDone")
Exemple #3
0
def download(url, h, d_name, ext):
    """Fetch one page of posts and store each post as a PDF or EPUB chapter.

    Two listing modes, selected by ``args.all``:

    * ``args.all`` set: the blog archive widget url built from ``url`` and
      ``h`` is requested and the reply is split on ``'title'``; every piece
      yields a post title and a post url.
    * otherwise (rss feed mode): ``url`` is parsed with feedparser.  On the
      first call, when the feed has no entries, the original page is fetched
      and searched for an rss/atom link which is then parsed instead.  The
      url of the following feed page is worked out (wordpress ``paged=``
      counter, or a ``rel == 'next'`` feed link).

    Every post is then written with pdfkit (``args.pdf``) or appended as a
    chapter to the module-level ``my_epub``, which is re-created on disk after
    each chapter.

    Args:
        url: archive url (``args.all``) or feed/page url (rss feed mode).
        h: in ``args.all`` mode, the value appended as the ``path=`` query
            parameter; in rss feed mode it is overwritten per entry with the
            generated HTML.
        d_name: output directory name for PDFs / base name for the EPUB.
        ext: output file extension, including the dot.

    Returns:
        In rss feed mode, the url computed for the next page, ``''`` where
        the branches below find no next page.  In ``args.all`` mode ``url``
        is returned unchanged.

    Side effects: reads and writes the globals declared below, prints
    progress, and calls ``os._exit(1)`` on unrecoverable request errors or
    when the temporary epub directory already exists.
    """
    global download_once
    global init_url_once
    global img_css_style
    global my_epub
    global epub_dir
    if not args.pdf:
        import_pypub()

    #e.g. 'https://diannaoxiaobai.blogspot.com/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=https://diannaoxiaobai.blogspot.com/2018/'
    visit_link = url
    orig_url = url
    # --- Step 1: obtain the list of posts ``t`` (and, in feed mode, the next url) ---
    if args.all:
        y_url = url + "/?action=getTitles&widgetId=BlogArchive1&widgetType=BlogArchive&responseType=js&path=" + h
        print("Scraping year... " + y_url)
        try:
            r = urlopen(y_url).read()
        except HTTPError as he:
            print(
                '\nNote that -a -s only allow if url has /year/[month] format, pls check your url\n'
            )
            clean_up()
            os._exit(1)
        if sys.version_info[0] >= 3:
            r = r.decode('utf-8')
        # Each piece after the first starts right behind a 'title' key.
        t = r.split("'title'")
        t = t[1:]
    else:
        url = process_rss_link(url)
        print("Scraping rss feed... " + url)
        r = feedparser.parse(
            url
        )  #, request_headers={'User-Agent': UA, 'Referer': url}) #I noticed https://blog.mozilla.org/security/feed/1 (/1 non exist) is working in feedparser, lolr
        #print(r.headers)
        t = r['entries']
        #if (not t) or ("link" not in r['feed'].keys()): #if got entries then whe need retry ? no need check link
        # First call only: no entries, so try to discover the feed link from the page itself.
        if (not init_url_once) and (
                not t
        ):  #'User does not have permission to read this blog.' of rss feed come here
            init_url_once = True
            #parsed_url = urlparse(url)
            #if not '{uri.netloc}'.format(uri=parsed_url).endswith('wordpress.com'):
            try:
                print("Try to scrape rss feed url automatically ... " +
                      orig_url)
                ##r = urlopen(orig_url).read() #https://medium.com/bugbountywriteup got check UA if urllib2 UA then not authorized
                if sys.version_info[0] >= 3:
                    req = urllib.request.Request(orig_url,
                                                 data=None,
                                                 headers={'User-Agent': UA})
                    r = urllib.request.urlopen(req).read()
                else:
                    req = urllib2.Request(orig_url, headers={'User-Agent': UA})
                    r = urllib2.urlopen(req).read()
            except Exception as e:
                print(e)
                print(
                    "Request webpage failed, please check your network OR authorized to access that url."
                )
                clean_up()
                os._exit(
                    1
                )  #don't use sys.exit(-1) if don't want to traceback to main() to print exception
            soup = BeautifulSoup(r, "lxml")
            # Look for a feed link: rss first, then atom, then a plain '/rss/' anchor.
            data = soup.findAll('link', attrs={'type': 'application/rss+xml'})
            if not data:  #https://github.com/RSS-Bridge/rss-bridge/issues/566 only has atom
                data = soup.findAll('link',
                                    attrs={'type': 'application/atom+xml'})
            if not data:
                data = soup.findAll('a',
                                    attrs={'href': '/rss/'
                                           })  #https://blog.google/products/
            if data:
                url = data[0].get("href")
                url = process_rss_link(url)
                if url.startswith(
                        '/'
                ):  #http://sectools.org/tag/sploits/ only has href="/feed/"
                    parsed_orig_uri = urlparse(orig_url)
                    url = '{uri.scheme}://{uri.netloc}'.format(
                        uri=parsed_orig_uri) + url
                print("Scraping rss feed one more time ... " + url)
                r = feedparser.parse(url)
                t = r['entries']
                if not t:
                    t = []
            else:
                t = []
        else:  #unlike blogspot, wordpress always got t, so need set true here
            init_url_once = True
        # Work out the url of the next feed page.
        parsed_url = urlparse(url)
        is_wordpress = '{uri.netloc}'.format(
            uri=parsed_url).endswith('wordpress.com')
        if not is_wordpress:  #only check next if 1st check is False, or lese 2nd check override 1st result
            try:
                # ``r`` may be raw page bytes here (after a failed discovery), hence the 'keys' check.
                if 'keys' in dir(r):
                    is_wordpress = r.get('feed', {}).get(
                        'generator', '').startswith('https://wordpress.org/')
            except Exception as e:
                print('parse generator error', e)
        if is_wordpress and t:  #increment paged only if current page got entries, i.e. t
            #parsed_keys = urlparse.parse.parse_qs(parsed_url.query) #my python 2 don't have parse_qs
            if 'paged=' in parsed_url.query:
                wp_paged_v = int(
                    parsed_url.query[parsed_url.query.rindex('paged=') +
                                     len('paged='):])
                #uri.path default prefix with '/' if not empty, so don't set '/' after netloc or else keep increase '////...' in each page
                url = '{uri.scheme}://{uri.netloc}{uri.path}?'.format(
                    uri=parsed_url) + parsed_url.query.replace(
                        'paged=' + str(wp_paged_v),
                        'paged=' + str(wp_paged_v + 1))
            else:
                url = ''
                print('no next')
        elif ("keys" in dir(r)) and ('link' in r['feed'].keys()):
            # Follow the feed's own rel="next" link, if it has one.
            l = r['feed']['links']
            if l:
                got_next = False
                for ll in l:
                    if ll['rel'] == 'next':
                        #if ll['href'] != url: #don't have next link is same case to test
                        url = ll['href']
                        got_next = True
                        break
                if not got_next:
                    url = ''
            else:
                url = ''
        elif not t:  #no need care if next page rss index suddenly change and no content case
            url = ''
            print_rss_err()

    # --- Step 2: convert every post in ``t`` ---
    count = 0
    for tt in t:
        count += 1
        title_raw = ''
        title_is_link = False
        if not args.all:
            # Feed mode: build the chapter HTML ``h`` from the feed entry itself.
            #e.g. parser.parse('2012-12-22T08:36:46.043-08:00').strftime('%B %d, %Y, %H:%M %p')
            h = ''
            #https://github.com/RSS-Bridge/rss-bridge/commits/master.atom only has 'updated'
            post_date = tt.get('published', tt.get('updated', ''))
            t_date = ''
            try:
                if args.locale:
                    if sys.version_info[0] >= 3:
                        t_date = parse_locale(post_date)
                    else:
                        t_date = parse_locale(post_date).decode('utf-8')
                else:
                    t_date = date_parser.parse(post_date).strftime(
                        '%B %d, %Y, %H:%M %p')
            except ValueError:  #Unknown string format, e.g. https://www.xul.fr/en-xml-rss.html got random date format such as 'Wed, 29 Jul 09 15:56:54  0200'
                t_date = post_date
            for feed_links in tt['links']:
                if feed_links['rel'] == 'alternate':
                    visit_link = feed_links['href']
            title_raw = tt['title'].strip()
            title_pad = title_raw + ' '
            if (not args.pdf) or (
                    not tt['title']
            ):  #epub got problem copy link from text, so epub always shows link
                tt['title'] = visit_link
                title_is_link = True
            if args.pdf:  #pdf with img css causes image not appear at all
                img_css_style = ''

            author = tt.get('author_detail', {}).get('name')
            if not author:
                author = tt.get('site_name', '')  #https://blog.google/rss/

            # Header block: author, date, title and a link back to the post.
            h = '<div><small>' + author + ' ' + t_date + '<br/><i>' + title_pad + '<a style="text-decoration:none;color:black" href="' + visit_link + '">' + tt[
                'title'] + '</a></i></small><br/><br/></div>' + img_css_style
            #<hr style="border-top: 1px solid #000000; background: transparent;">

            media_content = ''
            try:
                if 'media_content' in tt:  #wordpress/blog.google got list of images with link, e.g. darrentcy.wordpress.com
                    for tm in tt['media_content']:
                        #pitfall: python 3 dict no has_key() attr
                        if ('medium' in tm) and (tm['medium']
                                                 == 'image') and 'url' in tm:
                            media_content += '<img src="' + tm['url'] + '" >'
                            #media_content += '<img style="display: block; max-height: 100%; max-width: 100%" src="' + tm['url'] + '" >'
                #[UPDATE] shouldn't do like that, since thumbnails of feeds normally duplicated with feed without media_content
                #... which seems act as single thumbnail on webpage scraping metadata usage only.
                #... and seems like https://gigaom.com/feed/ thumbnail is not showing in webpage.
                #elif 'media_thumbnail' in tt: #https://gigaom.com/feed/ only has thumbnail
                #    for tm in tt['media_thumbnail']:
                #        if 'url' in tm:
                #            media_content += '<img src="' + tm['url'] + '" >'
            except Exception as e:
                print(e)
                print('parse media error')

            #pdfkit need specific charset, epub seems no need
            if args.pdf:  #just now got 1 post shows blank but got div in feed, then noticed it's white color font, lol
                h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt[
                    'summary'].replace(
                        '<div class="separator"',
                        '<div class="separator" align="center" '
                    ) + media_content + '</div></body>'
                #h = '<head><meta charset="UTF-8"></head><body><div align="center">' + h + tt['summary'].replace('<br /><br /><br />', '<br />') + media_content + '</div></body>'
            else:  #epub can't set body/head
                #h_soup = BeautifulSoup(tt['summary'], "lxml")
                #for pre in h_soup.find_all('pre'):
                #    print("pre: ", pre)
                #h = h + '<div align="center">' + tt['summary'].replace('<div class="separator"', '<div class="separator" align="center" ') + media_content + "</div>" #no need do replace anymore since the align center should control by global <div>
                h = h + '<div align="center">' + tt['summary'].replace(
                    '<br /><br /><br />', '<br />') + media_content + "</div>"
                #h = h + '<div align="center">' + tt['summary'] + media_content + "</div>"
                #h = h + tt['summary'] + media_content
            title = tt['title']
            t_url = visit_link
        else:
            # Archive mode: the piece is split on single quotes; index 1 is
            # used as the title and index 5 as the post url.
            field = tt.split("'")
            title = field[1]
            title_raw = title.strip()
            t_url = field[5]
        print('\ntitle: ' + title_raw)
        print('link: ' + t_url)
        if args.pdf:
            print('Download html as PDF, please be patient...' + str(count) +
                  '/' + str(len(t)))
        else:
            print('Download html as EPUB, please be patient...' + str(count) +
                  '/' + str(len(t)))
        # Derive the output file name / path for this post.
        if args.pdf:
            if title_is_link:  #else just leave slash with empty
                title = '/'.join(title.split('/')[-3:])
            if sys.version_info[0] >= 3:
                fname = os.path.join(d_name, slugify(unicode(title)))
            else:
                print(title)
                try:
                    title = title.decode('utf-8')
                except:
                    pass  #print('calm down, is normal decode error')
                title = replacer(title)
                #fname = os.path.join( d_name, slugify(title.decode('utf-8')))
                fname = os.path.join(d_name, slugify(title))
        else:  #no point do set fname based on title since epub is single file only with multiple chapters
            fname = d_name
        fpath = os.path.join(os.getcwd(), fname)
        if args.pdf:
            check_path = os.path.join(fpath + ext)
        else:
            # NOTE(review): [:-1] drops the last character of d_name --
            # presumably a trailing separator left by the caller; confirm.
            check_path = fpath[:-1] + ext
        # Avoid clobbering an existing output file by adding a timestamp
        # (for epub only before the book object has been created).
        if (not download_once) and os.path.exists(check_path):
            if args.pdf:
                fpath = fpath + '_' + str(int(time.time())) + ext
            else:
                fname = fname[:-1] + ' ' + str(int(
                    time.time()))  #pypub truncated _, so can't use '_'
        else:
            if args.pdf:
                fpath += ext
            else:
                fpath = fpath[:-1] + ext
                fname = fname[:-1]
        if args.pdf:
            print("file path: " + fpath)
            #pdf = weasyprint.HTML(t_url).write_pdf()
            #file( d_name + "/" + slugify(unicode(title)) + ".pdf", 'w' ).write(pdf)
            if args.all:
                try:
                    pdfkit.from_url(t_url, fpath)
                except IOError as ioe:
                    print("pdfkit IOError")
            else:
                try:
                    #https://security.googleblog.com/2013/10/dont-mess-with-my-browser.html site can't open in kchmviewer bcoz of this
                    #, which you direct unzip .EPUB and open that xhtml will got error
                    #-f 'https://security.googleblog.com/feeds/posts/default?start-index=179&max-results=1' direct jump to desired index to test
                    #rf: https://www.w3.org/wiki/Common_HTML_entities_used_for_typography
                    #narrow down OEBPS/toc.nc by removing list of items, then download by index+repack+<open_in_web_browser_OR_kchmviewer> above to know which portion of items trigger the xml error #got case toc.nc itself contains '&' which must replace with `&amp;`
                    h = replacer(h)
                    pdfkit.from_string(h, fpath)
                except IOError as ioe:
                    print('Exception IOError: ' + repr(ioe))
        else:
            # The epub object is created once and then shared across calls
            # through the ``my_epub`` global.
            if not download_once:
                download_once = True
                print("file path: " + fpath)
                if os.path.exists(fname + temp_dir_ext):
                    print(
                        fname + temp_dir_ext +
                        " already exists, please move/backup that direcory to another place manually. Abort"
                    )  #to not blindly replace file
                    os._exit(1)
                tmp_dir = fname + temp_dir_ext
                my_epub = pypub.Epub(fname, epub_dir=tmp_dir)
                epub_dir = os.path.join(os.getcwd(), tmp_dir)
                print("epub_dir: " + epub_dir)
            if title_raw:
                try:
                    title = title.decode('utf-8')
                except:
                    pass
                try:  #fixed -as http://miniechung1998.blogspot.com/2012/12/xd-xd.html
                    title_raw = title_raw.decode('utf-8')
                except:
                    pass
                title_raw = replacer(title_raw).replace('&', '&amp;').replace(
                    '<', '&lt;'
                ).replace(
                    '>', '&gt;'
                )  #unlike content, title can replace '&'(no space) like that since & may no space
                #, if content do like that will got no image, got visible &nbsp; text ...etc
            if args.all:
                if title_raw:
                    my_chapter = pypub.create_chapter_from_url(title=title_raw,
                                                               url=t_url)
                else:  #no choice like that and better not set with t_url, use other editor if kchmviewer error, should unlikely happen though
                    my_chapter = pypub.create_chapter_from_url(t_url)
                #print(my_chapter.content)
                #my_chapter.content = replacer(my_chapter.content)
                my_chapter.title = replacer(my_chapter.title)
                #sigil viewer will warning and auto convert for you, e.g. /<img> become </>, replace <!DOCTYPE html> to <?xml version="1.0" encoding="utf-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">, Add  <title></title> ...etc, this is normal and shouldn't have extra work to do, while kchmviewer able to render it without error.
                #try:
                #    my_chapter.content = my_chapter.content.decode('utf-8')
                #except:
                #    pass #print("decode content err")
                #
                # The correct way to replace, you can't direct `my_chapter.content = 'xxx'` and expect it take effect !
                #my_chapter._content_tree = BeautifulSoup(my_chapter.content, 'html.parser')

                try:
                    my_chapter.title = my_chapter.title.decode('utf-8')
                except:  #-a http://cuhkt48.blogspot.com/2016/07/blog-post.html
                    pass  #print("decode title err")
            else:
                #h = replacer(h) #'https://www.blogger.com/feeds/1176949257541686127/posts/default?start-index=251&max-results=25' -> https://security.googleblog.com/2009/03/reducing-xss-by-way-of-automatic.html got <prev> and body, so don't blindly unescape all #might need filter by pre and allow other to replace, need to test more to know got error or not without replace
                if title_raw:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title=title_raw, url=t_url)
                else:
                    my_chapter = pypub.create_chapter_from_string(
                        h, title='/'.join(title.split('/')[-3:]), url=t_url)
                #print(my_chapter.content)
                #my_chapter = pypub.create_chapter_from_string(r['entries'][0]['summary'].replace('<div class="separator"', '<div class="separator" align="center" '))
            # The epub file is rewritten after every added chapter.
            my_epub.add_chapter(my_chapter)
            my_epub.create_epub(os.getcwd())
            rm_tmp_files()
    return url  #return value used for rss feed mode only
def main():
    """Entry point: choose a download mode from ``args`` and run it.

    Modes are checked in this order: ``print_date`` (debug scrape), ``one``
    (a single pdf, or one epub fed interactively with urls), rss feed mode
    (``all`` not set), ``single`` (one year/month) and finally a scrape of
    the whole website.  The start url comes from ``args.url`` or, when that
    is empty, from an interactive prompt.

    In the single-epub session every interactive prompt treats end of input
    (``EOFError``, e.g. urls redirected from a file) as a request to stop, so
    the session ends cleanly instead of with a traceback.
    """
    global epub_dir

    def timestamp():
        # Current local time, as shown in the progress/prompt lines.
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def prompt_next():
        # Ask for the next url.  With -1 and "< list_of_lines_file" the read
        # after the last line raises EOFError; report it as an empty reply,
        # which callers already treat as "stop".
        try:
            return input('\nPaste next <url> OR type \'n\' to exit: ').strip()
        except EOFError:
            return ''

    def is_url_reply(text):
        # An empty reply or one starting with n/N means "stop".
        return bool(text) and text[0].lower() != 'n'

    url = process_url(args.url if args.url else input('URL: ').strip())
    # Stripping a trailing '.html' was tried and dropped: it is pointless for
    # -f and -a needs the '.html'.
    parsed_uri = urlparse(url)
    d_name = slugify(unicode('{uri.netloc}/'.format(uri=parsed_uri)))

    ext = '.pdf' if args.pdf else '.epub'
    if args.pdf and not args.one and not os.path.isdir(d_name):
        os.makedirs(d_name)

    if args.print_date:
        print('Debugging\n')
        scrape(url, d_name, ext)
    elif args.one:
        d_name = d_name.strip()
        fname = d_name + ext
        fpath = os.path.join(os.getcwd(), fname)
        # Never overwrite an existing file: append a timestamp instead.
        while os.path.exists(fpath):
            fname = d_name + '_' + str(int(time.time())) + ext
            fpath = os.path.join(os.getcwd(), fname)
        try:
            if args.pdf:
                # [further:0] 'https://thehackernews.com/2019/09/phpmyadmin-csrf-exploit.html'
                # ... needs -1 -p, can't simply use -1
                print('Create single pdf: ' + fpath)
                # Test case (needs the default 3 seconds delay):
                # https://www.quora.com/Why-does-the-loopback-interface-on-my-computer-has-65536-as-the-MTU-while-other-interfaces-has-1500-as-the-MTU
                pdfkit.from_url(
                    url,
                    fpath,
                    options={'--javascript-delay': args.js_delay * 1000})
            else:
                import_pypub()
                tmp_dir = d_name + temp_dir_ext
                # fname[:-5] drops the '.epub' suffix appended above.
                my_epub = pypub.Epub(fname[:-5], epub_dir=tmp_dir)
                print('Create single epub: ' + fpath)
                while True:
                    try:
                        print('\n[' + timestamp() + '] Trying url: ' + url)
                        epub_dir = os.path.join(os.getcwd(), tmp_dir)
                        try:
                            # Debugging hint: the call below can be traced with
                            # trace.Trace(trace=1, ignoredirs=[...],
                            # ignoremods=[...]).runfunc(
                            #     pypub.create_chapter_from_url, url)
                            my_chapter = pypub.create_chapter_from_url(url)

                            # Use the html-escaped title so that a title
                            # containing "&" becomes "&amp;"; otherwise the
                            # book cannot be opened in kchmviewer.
                            # Test case: https://blog.semmle.com/semmle-discovers-severe-vulnerability-ghostscript-postscript-pdf/
                            my_chapter.title = my_chapter.html_title

                            my_epub.add_chapter(my_chapter)
                            my_epub.create_epub(os.getcwd())
                            rm_tmp_files()
                        except ValueError as ve:
                            # e.g. https://pikachu.com is an invalid url, or
                            # there is no network connection.
                            traceback.print_exc()
                            print(ve)
                        reply = prompt_next()
                        if not is_url_reply(reply):
                            break
                        url = process_url(reply)
                    except IOError as ioe:
                        # e.g. requests.get() timing out inside pypub; the
                        # loop goes round again instead of aborting.
                        print("\nIOError but still allow goto next chapter",
                              ioe)
                    except KeyboardInterrupt:
                        # When many links were pasted at once the interrupt
                        # takes a while to trigger and the pending urls are
                        # flushed; the remaining ones must be pasted again.
                        # Pressing ctrl+c again at this prompt also exits.
                        try:
                            reply = input(
                                '\n[' + timestamp() +
                                '] [r]etry OR [s]kip to next url OR [q]uit ? [r/s/q] '
                            ).strip()
                        except EOFError:
                            # No more input to answer the question: quit.
                            break
                        if reply == 'q':
                            break
                        if reply == 's':
                            reply = prompt_next()
                            if not is_url_reply(reply):
                                break
                            url = process_url(reply)
                        # any other reply: retry the same url
        except IOError as ioe:
            print("IOError --one: ", ioe)
    elif not args.all:
        print('Download in rss feed mode')
        if args.feed:
            url = args.feed
        # No default feed url is built here: the feed link is discovered
        # later from the web page (needed for https://blog.mozilla.org/security/).
        while url:
            url = download(url, url, d_name, ext)
    elif args.single:
        print('Download single year/month in website mode')
        download(url, url, d_name, ext)
    else:
        print('Download all in website mode')
        scrape(url, d_name, ext)
    print("\nDone")
Exemple #5
0
import pypub

# Build a single EPUB whose chapters are fetched from the Wikipedia pages
# below, in this order (the FBReader page is deliberately listed twice, as in
# the original script), then write the book to D:/.
my_first_epub = pypub.Epub('My Second Epub')

for chapter_url in (
        'https://en.wikipedia.org/wiki/FBReader',
        'https://en.wikipedia.org/wiki/PocketBook_eReader',
        'https://en.wikipedia.org/wiki/Smashwords',
        'https://en.wikipedia.org/wiki/Raster_graphics',
        'https://en.wikipedia.org/wiki/FBReader',
        # disabled: 'https://en.wikipedia.org/wiki/EPUB'
):
    my_first_epub.add_chapter(pypub.create_chapter_from_url(chapter_url))

my_first_epub.create_epub('D:/')