Ejemplo n.º 1
0
    # Seed the set of already-fetched links so pages are not revisited
    # (avoids loops).
    # NOTE(review): starturl is passed bare; if OrderedSet iterates its
    # argument and starturl is a string, this stores the individual characters
    # rather than the URL itself -- confirm against the OrderedSet constructor.
    links = OrderedSet(starturl)

    # Crawl in waves: process every pending link, gather the links reported
    # for those pages, and repeat until no unseen link is left.
    # newlinks is bound before this excerpt starts (not visible here).
    fetchlinks = newlinks
    while len(fetchlinks) > 0:
        newlinks = OrderedSet()
        for url in fetchlinks:
            # Resolve the link against the start URL before using it.
            url = urlparse.urljoin(starturl, url)
            # Output path is derived from the URL via urltoname().
            output = os.path.join(outputdir, urltoname(url))
            print >>sys.stderr, "\nProcessing ", url
            # Only follow links whose netloc matches the start URL's netloc.
            if urlparse.urlparse(url).netloc == urlparse.urlparse(starturl).netloc:
                retval = fetchurl(url, output, breadcrumbs)
                # The "links" entry of fetchurl's result feeds the next wave.
                newlinks.update(retval["links"])
            else:
                print >>sys.stderr, "*** {} is not on the same server. Link skipped.".format(url)
        # Record this wave as visited, then keep only links not seen before.
        links.update(fetchlinks)
        fetchlinks = newlinks - links

    # Fix local urls for the files of the output directory
    fixurls(outputdir, starturl)

    # Clean HTML code
    tidy(outputdir)

    # Convert equations to PNG
    fix_latex(outputdir)
Ejemplo n.º 2
0
    # Seed the set of visited links with the start URL.
    # NOTE(review): starturl is passed bare; if OrderedSet iterates its
    # argument and starturl is a string, this stores the individual characters
    # rather than the URL itself -- confirm against the OrderedSet constructor.
    links = OrderedSet(starturl)

    # Iterate on the links, and recursively download / convert.
    # newlinks is bound before this excerpt starts (not visible here).
    fetchlinks = newlinks
    while len(fetchlinks) > 0:
        newlinks = OrderedSet()
        for url in fetchlinks:
            # Resolve the link against the start URL before using it.
            url = urlparse.urljoin(starturl, url)
            # Output path is derived from the URL via urltoname().
            output = os.path.join(outputdir, urltoname(url))
            print >> sys.stderr, "\nProcessing ", url
            # Link on the same server? If no match, search the list of alternative servers
            # (start_server depends only on starturl, so it is the same on
            # every iteration.)
            start_server = urlparse.urlparse(starturl).netloc
            link_server = urlparse.urlparse(url).netloc
            # NOTE(review): the second test compares the helper's results for
            # both hosts; if _get_alternate_server returns a shared fallback
            # (e.g. None) for unknown hosts, unrelated hosts would compare
            # equal here -- confirm against the helper.
            if (link_server == start_server or _get_alternate_server(link_server) == _get_alternate_server(start_server)):
                retval = fetchurl(url, output, breadcrumbs)
                # The 'links' entry of fetchurl's result feeds the next wave.
                newlinks.update(retval['links'])
            else:
                print >> sys.stderr, "*** {} is not on the same server. Link skipped.".format(url)
        # Record this wave as visited, then keep only links not seen before.
        links.update(fetchlinks)
        fetchlinks = newlinks - links

    # Fix local urls for the files of the output directory
    fixurls(outputdir, starturl)

    # Clean HTML code
    tidy(outputdir)

    # Convert equations to PNG
    fix_latex(outputdir)