Example #1
import os
import urlparse
import urllib2

from flask import request  # assumed: these examples appear to run inside a Flask app

def _check_cache(subpath, purge):
    #parse the url
    parsed = urlparse.urlparse(subpath)
    if parsed.path != "/" and parsed.path != "":
        local_path = cacheUrl(parsed.path, parsed.netloc)[1:]
    else:
        local_path = cacheUrl("/index", parsed.netloc)[1:]
    # add URL params to path
    args = ""
    for a in request.args:
        if len(args) == 0:
            args = "?" + a + "=" + request.args.get(a)
        else:
            args += "&" + a + "=" + request.args.get(a)
    local_path += args
    #check if it's in the cache
    found = True
    try:
        if not os.path.exists(local_path) or purge:
            folders = "/".join(local_path.split("/")[:-1])
            if not os.path.exists(folders):
                os.makedirs(folders)
            #download the file and serve
            dlfile(subpath + args, local_path)
            found = False
    except urllib2.URLError:
        print "Could not download file", subpath
    print "serving: " + local_path
    return local_path, found
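Example #1 depends on a dlfile helper that is not shown in these examples. A minimal sketch, assuming it simply streams the remote resource to the local cache path with urllib2 (the name comes from the call above; the real implementation may differ):

import urllib2

def dlfile(url, local_path):
    # Fetch the remote resource; urllib2.URLError propagates to the
    # caller, where _check_cache catches it. Assumes url already
    # carries a scheme such as http://.
    response = urllib2.urlopen(url)
    with open(local_path, "wb") as f:
        f.write(response.read())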
Example #2
	def test_cacheUrl_uses_existing_domain(self):
		bad_domain = "www.thisisthedomain.com"
		good_domain = "https://www.thisisthecorrectdomain.com"
		self.assertEqual(cacheUrl(good_domain+"/hey", bad_domain), "/cache/"+good_domain+"/hey")
		good_domain = "http://www.thisisthecorrectdomain.com"
		self.assertEqual(cacheUrl(good_domain+"/hey", bad_domain), "/cache/"+good_domain+"/hey")
		good_domain = "//www.thisisthecorrectdomain.com";
		url = cacheUrl(good_domain+"/hey", bad_domain);
		self.assertEqual(url, "/cache/http://www.thisisthecorrectdomain.com/hey")
Example #3
import re

from bs4 import BeautifulSoup

def view_page(subpath):
    htmlfile, found = _check_cache(subpath, True)
    with open(htmlfile, 'r') as myfile:
        html = myfile.read()
    if found:
        return html
    #redirect css url() links to cache
    html = re.sub(r'url\(([^\)]*)\)',
                  lambda m: "url(" + cacheUrl(m.group(1), subpath) + ")", html)
    soup = BeautifulSoup(html, "lxml")
    #redirect image links to cache
    for link in soup.findAll('img'):
        src = link.get('src')
        if src:
            link['src'] = cacheUrl(src, subpath)
        srcset = link.get('srcset')
        if srcset:
            links = link['srcset'].split(",")
            new = []
            for l in links:
                l = l.strip()
                split = l.split(" ")
                split[0] = cacheUrl(split[0], subpath)
                new.append(" ".join(split))
            link['srcset'] = ", ".join(new)
    #redirect script links to cache
    for link in soup.findAll('script'):
        src = link.get('src')
        if src:
            link['src'] = cacheUrl(src, subpath)
    #redirect CSS and other <link> tags to cache
    for link in soup.findAll('link'):
        href = link.get("href")
        if href:
            link['href'] = cacheUrl(href, subpath)
    #inject js up top in <head>
    script = soup.new_tag("script")
    with open("injected.js") as f:
        script.string = f.read().replace("#!<DOMAIN>!#", subpath)
    soup.head.insert(0, script)
    #write file to cache
    with open(htmlfile, 'w') as myfile:
        myfile.write(str(soup))
    return str(soup)
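view_page receives the requested path as subpath, and Example #1 reads Flask's request.args, so the function is presumably registered as a catch-all Flask route. A plausible wiring, sketched under that assumption (the actual route decorator is not shown in these examples):

from flask import Flask

app = Flask(__name__)

@app.route("/<path:subpath>")
def view(subpath):
    # Delegate to the cache-and-rewrite pipeline above.
    return view_page(subpath)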
Example #4
	def test_cacheUrl_ignores_bogus_input(self):
		self.assertEqual(cacheUrl("", ""), None)
		self.assertEqual(cacheUrl("some_url", ""), None)
		self.assertEqual(cacheUrl(None, None), None)
		self.assertEqual(cacheUrl(None, "www.adomain.org"), None)
		self.assertEqual(cacheUrl("some_url", None), None)
Example #5
	def test_cacheUrl_removes_extra_slashes(self):
		domain = "www.thisisthedomain.com//"
		self.assertEqual(cacheUrl("/hey//", domain), "/cache/www.thisisthedomain.com/hey/")
Example #6
	def test_cacheUrl_starts_with_cache(self):
		domain = "www.thisisthedomain.com"
		self.assertEqual(cacheUrl("/hey", domain), "/cache/"+domain+"/hey")
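Taken together, Examples #2 and #4 through #6 pin down cacheUrl's contract: None for empty or missing input, a "/cache/" prefix, repeated slashes collapsed, and a URL that already names its own domain (including protocol-relative //) overriding the supplied one. A minimal sketch that satisfies those tests, offered as an illustration rather than the project's actual implementation:

import re

def cacheUrl(url, domain):
    # Bogus input (empty strings or None) yields None (Example #4).
    if not url or not domain:
        return None
    # Protocol-relative URLs are normalized to http:// (Example #2).
    if url.startswith("//"):
        url = "http:" + url
    # A URL that carries its own domain wins over the supplied one.
    if url.startswith(("http://", "https://")):
        joined = url
    else:
        joined = domain.strip("/") + "/" + url.lstrip("/")
    # Collapse repeated slashes, but not the pair after the scheme
    # (Example #5 keeps a single trailing slash).
    joined = re.sub(r"(?<!:)/{2,}", "/", joined)
    return "/cache/" + joined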