def fetch_page(
        url=None,
        load_complete=lambda browser, url, recursion_level: True,
        links=lambda browser, url, recursion_level: (),
        keep_only=(),
        remove_after=None,
        remove_before=None,
        remove=(),
        remove_javascript=True,
        delay=0,
        preprocess_browser=lambda browser, url, stage, recursion_level: None,
        postprocess_html=lambda root, url, recursion_level: root,
        resource_cache={},
        output_dir=None,
        browser=None,
        recursion_level=0):
    '''
    Load url in a JavaScript capable browser, clean up the DOM, save the
    rendered HTML and its resources into output_dir, and recursively fetch
    every page returned by the links() callback. Returns a tuple of the
    save_html() results for this page and all linked pages.
    '''
    output_dir = output_dir or os.getcwdu()
    if browser is None:
        browser = jsbrowser()

    if delay:
        time.sleep(delay)

    # Load the DOM
    if url is not None:
        start_time = time.time()
        browser.start_load(url)
        while not load_complete(browser, url, recursion_level):
            browser.run_for_a_time(0.1)
            if time.time() - start_time > browser.default_timeout:
                from calibre.web.jsbrowser.browser import Timeout
                raise Timeout('Timed out while waiting for %s to load' % url)

    children = links(browser, url, recursion_level)

    # Cleanup the DOM
    clean_dom(
        browser, url, recursion_level, preprocess_browser,
        remove_javascript, keep_only, remove_after, remove_before, remove)

    # Download resources
    download_resources(browser, resource_cache, output_dir)

    # Get HTML from the DOM
    pages = [save_html(browser, output_dir, postprocess_html, url, recursion_level)]

    # Fetch the linked pages
    for i, curl in enumerate(children):
        odir = os.path.join(output_dir, 'link%d' % (i + 1))
        if not os.path.exists(odir):
            os.mkdir(odir)
        try:
            pages.extend(fetch_page(
                curl, load_complete=load_complete, links=links,
                keep_only=keep_only, remove_after=remove_after,
                remove_before=remove_before, remove=remove,
                preprocess_browser=preprocess_browser,
                postprocess_html=postprocess_html,
                resource_cache=resource_cache, output_dir=odir,
                browser=browser, delay=delay,
                recursion_level=recursion_level + 1))
        except AbortFetch:
            continue

    return tuple(pages)
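
# Illustrative sketch (not part of the original module): fetch_page() polls the
# load_complete callback every 0.1 seconds until it returns True, so a custom
# callback can wait for a specific piece of content to be rendered before the
# DOM is cleaned and saved. The use of browser.html below assumes the browser
# object exposes the rendered page source as a .html property; treat that as an
# assumption for this sketch rather than something shown above.
def wait_for_article(browser, url, recursion_level):
    return 'article' in browser.html

# Similarly, a postprocess_html callback receives the parsed tree root, the
# page URL and the recursion level, and must return the (possibly modified)
# root. The lxml-style element API used here is assumed for illustration only.
def drop_empty_paragraphs(root, url, recursion_level):
    for p in root.xpath('//p'):
        if not ''.join(p.itertext()).strip():
            p.getparent().remove(p)
    return root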
if __name__ == '__main__':
    browser = jsbrowser()
    fetch_page(
        'http://www.time.com/time/magazine/article/0,9171,2145057,00.html',
        browser=browser,
        links=partial(links_from_selectors, ('.wp-paginate a.page[href]',), 1),
        keep_only=('article.post',),
        remove=('.entry-sharing', '.entry-footer', '.wp-paginate', '.post-rail'))