Example #1
import os
import time

from calibre.web.jsbrowser.browser import Timeout

# jsbrowser(), clean_dom(), download_resources(), save_html() and AbortFetch
# are provided by the same calibre module this function is taken from.

def fetch_page(
    url=None,
    load_complete=lambda browser, url, recursion_level: True,
    links=lambda browser, url, recursion_level: (),
    keep_only=(),
    remove_after=None,
    remove_before=None,
    remove=(),
    remove_javascript=True,
    delay=0,
    preprocess_browser=lambda browser, url, stage, recursion_level: None,
    postprocess_html=lambda root, url, recursion_level: root,
    resource_cache={},
    output_dir=None,
    browser=None,
    recursion_level=0
    ):

    output_dir = output_dir or os.getcwdu()
    if browser is None:
        browser = jsbrowser()

    if delay:
        time.sleep(delay)

    # Load the DOM
    if url is not None:
        start_time = time.time()
        browser.start_load(url)
        while not load_complete(browser, url, recursion_level):
            browser.run_for_a_time(0.1)
            if time.time() - start_time > browser.default_timeout:
                raise Timeout('Timed out while waiting for %s to load' % url)

    children = links(browser, url, recursion_level)

    # Cleanup the DOM
    clean_dom(
        browser, url, recursion_level, preprocess_browser,
        remove_javascript, keep_only, remove_after, remove_before, remove)

    # Download resources
    download_resources(browser, resource_cache, output_dir)

    # Get HTML from the DOM
    pages = [save_html(browser, output_dir, postprocess_html, url, recursion_level)]

    # Fetch the linked pages
    for i, curl in enumerate(children):
        odir = os.path.join(output_dir, 'link%d' % (i + 1))
        if not os.path.exists(odir):
            os.mkdir(odir)
        try:
            pages.extend(fetch_page(
                curl, load_complete=load_complete, links=links, keep_only=keep_only,
                remove_after=remove_after, remove_before=remove_before, remove=remove,
                preprocess_browser=preprocess_browser, postprocess_html=postprocess_html,
                resource_cache=resource_cache, output_dir=odir, browser=browser, delay=delay,
                recursion_level=recursion_level+1))
        except AbortFetch:
            continue
    return tuple(pages)
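
A minimal usage sketch for the function above. The URL, selectors and delay are illustrative, not taken from the source; jsbrowser() is the same helper the function itself falls back to when no browser is passed in:

if __name__ == '__main__':
    browser = jsbrowser()
    pages = fetch_page(
        'http://example.com/article',        # hypothetical starting URL
        browser=browser,
        delay=1,                             # sleep one second before each page load
        keep_only=('div.article-body',),     # markup to keep (selectors, as in Example #3)
        remove=('.comments', '.sidebar'),    # markup to strip before saving
    )
    # A tuple with one entry per saved page is returned: the starting page plus
    # every linked page that was followed (none here, since the default links
    # callback returns an empty tuple).
    print(pages)
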
Example #2
# Identical to Example #1 except that Timeout is imported lazily, only when a
# page fails to finish loading in time.
import os
import time

def fetch_page(
    url=None,
    load_complete=lambda browser, url, recursion_level: True,
    links=lambda browser, url, recursion_level: (),
    keep_only=(),
    remove_after=None,
    remove_before=None,
    remove=(),
    remove_javascript=True,
    delay=0,
    preprocess_browser=lambda browser, url, stage, recursion_level: None,
    postprocess_html=lambda root, url, recursion_level: root,
    resource_cache={},
    output_dir=None,
    browser=None,
    recursion_level=0
    ):

    output_dir = output_dir or os.getcwdu()
    if browser is None:
        browser = jsbrowser()

    if delay:
        time.sleep(delay)

    # Load the DOM
    if url is not None:
        start_time = time.time()
        browser.start_load(url)
        while not load_complete(browser, url, recursion_level):
            browser.run_for_a_time(0.1)
            if time.time() - start_time > browser.default_timeout:
                from calibre.web.jsbrowser.browser import Timeout
                raise Timeout('Timed out while waiting for %s to load' % url)

    children = links(browser, url, recursion_level)

    # Cleanup the DOM
    clean_dom(
        browser, url, recursion_level, preprocess_browser,
        remove_javascript, keep_only, remove_after, remove_before, remove)

    # Download resources
    download_resources(browser, resource_cache, output_dir)

    # Get HTML from the DOM
    pages = [save_html(browser, output_dir, postprocess_html, url, recursion_level)]

    # Fetch the linked pages
    for i, curl in enumerate(children):
        odir = os.path.join(output_dir, 'link%d' % (i + 1))
        if not os.path.exists(odir):
            os.mkdir(odir)
        try:
            pages.extend(fetch_page(
                curl, load_complete=load_complete, links=links, keep_only=keep_only,
                remove_after=remove_after, remove_before=remove_before, remove=remove,
                preprocess_browser=preprocess_browser, postprocess_html=postprocess_html,
                resource_cache=resource_cache, output_dir=odir, browser=browser, delay=delay,
                recursion_level=recursion_level+1))
        except AbortFetch:
            continue
    return tuple(pages)
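
The load_complete callback is polled every 0.1 seconds (see the loop above) until it returns True, so it controls when fetch_page considers the page ready for cleanup and saving. A hedged sketch of such a callback, using nothing beyond the standard library because the browser's DOM-inspection API is not shown in these snippets:

import time

def make_patient_load_complete(min_wait=2.0):
    # Returns a load_complete callback that keeps fetch_page polling for at
    # least min_wait seconds before declaring the page loaded. A real callback
    # would inspect the browser's DOM for the content it expects.
    started = {}
    def load_complete(browser, url, recursion_level):
        started.setdefault(url, time.time())           # record the first poll per URL
        return time.time() - started[url] >= min_wait
    return load_complete

It would be passed as fetch_page(url, load_complete=make_patient_load_complete(3.0), browser=browser); keep min_wait below browser.default_timeout, or the load loop raises Timeout first.
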
Example #3
    # Tail of the fetch_page shown in the previous examples, followed by a
    # small driver that crawls a paginated TIME article.
    download_resources(browser, resource_cache, output_dir)

    # Get HTML from the DOM
    pages = [save_html(browser, output_dir, postprocess_html, url, recursion_level)]

    # Fetch the linked pages
    for i, curl in enumerate(children):
        odir = os.path.join(output_dir, 'link%d' % (i + 1))
        if not os.path.exists(odir):
            os.mkdir(odir)
        try:
            pages.extend(fetch_page(
                curl, load_complete=load_complete, links=links, keep_only=keep_only,
                remove_after=remove_after, remove_before=remove_before, remove=remove,
                preprocess_browser=preprocess_browser, postprocess_html=postprocess_html,
                resource_cache=resource_cache, output_dir=odir, browser=browser, delay=delay,
                recursion_level=recursion_level+1))
        except AbortFetch:
            continue
    return tuple(pages)

# jsbrowser(), fetch_page() and links_from_selectors() come from the same
# calibre module as the function above; partial is from the standard library.
from functools import partial

if __name__ == '__main__':
    browser = jsbrowser()
    fetch_page('http://www.time.com/time/magazine/article/0,9171,2145057,00.html', browser=browser,
               links=partial(links_from_selectors, ('.wp-paginate a.page[href]',), 1),
               keep_only=('article.post',), remove=('.entry-sharing', '.entry-footer', '.wp-paginate', '.post-rail'))
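
Example #3 builds its links callback with partial(links_from_selectors, ...); any callable with the signature links(browser, url, recursion_level) that returns an iterable of URLs works. A hedged, hand-written sketch (the URL is a placeholder, not from the source) that also stops the crawl after one level:

def first_level_links(browser, url, recursion_level):
    # fetch_page recurses into every URL returned here, saving each linked
    # page under link1/, link2/, ... inside the current output directory.
    if recursion_level > 0:
        return ()                                      # do not follow links found on linked pages
    return ('http://example.com/article/page-2',)      # placeholder "next page" URL

It would be passed as fetch_page(url, links=first_level_links, browser=browser).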



