def test_respect_link_order(self):
    """The <link> stylesheets must come out in document order."""
    page = os.path.join(HERE, 'three-links.html')
    proc = Processor()
    proc.process('file://' + page)
    eq_([link.href for link in proc.links], ['two.css', 'three.css'])
def test_respect_link_order(self):
    """Link hrefs are reported in the order they appear in the HTML."""
    document = os.path.join(HERE, "three-links.html")
    processor = Processor()
    processor.process("file://" + document)
    found = [each.href for each in processor.links]
    eq_(found, ["two.css", "three.css"])
def test_pseudo_selectors_hell(self):
    """Pseudo-class/element selectors survive only when actually used."""
    page = os.path.join(HERE, "three.html")
    proc = Processor(preserve_remote_urls=False)
    proc.process("file://" + page)
    # three.html references a single linked stylesheet
    after = proc.links[0].after
    kept = [
        "a.three:hover",
        ".container > a.one",
        'input[type="search"]::-webkit-search-decoration',
        "@media (max-width: 900px)",
        ".container .two",
        "::-webkit-input-placeholder",
        ":-moz-placeholder {",
        "div::-moz-focus-inner",
        "@-webkit-keyframes progress-bar-stripes",
        "from {",
        # some day perhaps this can be untangled and parsed too
        "@import url(other.css)",
    ]
    dropped = [
        "a.hundred:link",
        ".container > a.notused",
        'input[type="button"]',
        'input[type="reset"]::-webkit-search-decoration',
        "a.four",
        "button::-moz-focus-inner",
    ]
    for fragment in kept:
        ok_(fragment in after)
    for fragment in dropped:
        ok_(fragment not in after)
def test_pseudo_selectors_hell(self):
    """Only pseudo-selectors attached to used elements are preserved."""
    document = os.path.join(HERE, 'three.html')
    processor = Processor(preserve_remote_urls=False)
    processor.process('file://' + document)
    # three.html only has one linked stylesheet
    result = processor.links[0].after
    expected_present = (
        'a.three:hover',
        '.container > a.one',
        'input[type="search"]::-webkit-search-decoration',
        '@media (max-width: 900px)',
        '.container .two',
        '::-webkit-input-placeholder',
        ':-moz-placeholder {',
        'div::-moz-focus-inner',
        '@-webkit-keyframes progress-bar-stripes',
        'from {',
        # some day perhaps this can be untangled and parsed too
        '@import url(other.css)',
    )
    expected_absent = (
        'a.hundred:link',
        '.container > a.notused',
        'input[type="button"]',
        'input[type="reset"]::-webkit-search-decoration',
        'a.four',
        'button::-moz-focus-inner',
    )
    for piece in expected_present:
        self.assertTrue(piece in result)
    for piece in expected_absent:
        self.assertTrue(piece not in result)
def test_no_mincss_link(self):
    """A linked stylesheet annotated with no-mincss is left untouched."""
    url = 'file://' + os.path.join(HERE, 'no-mincss-link.html')
    proc = Processor()
    proc.process(url)
    first = proc.links[0]
    eq_(first.before, first.after)
def run():
    """Process URL and dump before/after CSS for inline blocks and links."""
    proc = Processor()
    proc.process(URL)

    def dump(label, before, after):
        # Print one before/after pair under a small header.
        print(label)
        print('- ' * 40)
        print("BEFORE")
        print(before)
        print('- ' * 40)
        print("AFTER:")
        print(after)

    # Inline <style> blocks, before and after minification.
    print("INLINES ".ljust(79, '-'))
    for each in proc.inlines:
        dump("On line %s" % each.line, each.before, each.after)

    # Linked stylesheets, before and after minification.
    print("LINKS ".ljust(79, '-'))
    for each in proc.links:
        dump("On href %s" % each.href, each.before, each.after)
def _execute(self, options, args):
    """Apply mincss to the generated site.

    Walks OUTPUT_FOLDER, feeds every .html file through one mincss
    Processor, then overwrites each referenced .css file with its
    minified version.
    """
    output_folder = self.site.config['OUTPUT_FOLDER']
    if Processor is None:
        print('To use the mincss command,'
              ' you have to install the "mincss" package.')
        return
    p = Processor(preserve_remote_urls=False)
    urls = []
    css_files = {}
    for root, dirs, files in os.walk(output_folder):
        for f in files:
            url = os.path.join(root, f)
            if url.endswith('.css'):
                # CSS files are keyed by bare filename, so duplicates
                # anywhere in the tree would be indistinguishable.
                fname = os.path.basename(url)
                if fname in css_files:
                    print("You have two CSS files with the same name "
                          "and that confuses me.")
                    sys.exit(1)
                css_files[fname] = url
            if not f.endswith('.html'):
                continue
            urls.append(url)
    p.process(*urls)
    for link in p.links:
        fname = os.path.basename(link.href)
        print("===>", link.href, len(link.before), len(link.after))
        # BUG FIX: the file is opened in binary mode but .after is
        # text; encode explicitly so this also works on Python 3.
        with open(css_files[fname], 'wb+') as outf:
            outf.write(link.after.encode('utf-8'))
def test_pseudo_selectors_hell(self):
    """Used pseudo-selectors stay; unused ones are stripped."""
    url = 'file://' + os.path.join(HERE, 'three.html')
    p = Processor(preserve_remote_urls=False)
    p.process(url)
    # three.html has exactly one linked stylesheet
    result = p.links[0].after

    def present(fragment):
        ok_(fragment in result)

    def absent(fragment):
        ok_(fragment not in result)

    present('a.three:hover')
    absent('a.hundred:link')
    present('.container > a.one')
    absent('.container > a.notused')
    absent('input[type="button"]')
    present('input[type="search"]::-webkit-search-decoration')
    absent('input[type="reset"]::-webkit-search-decoration')
    present('@media (max-width: 900px)')
    present('.container .two')
    absent('a.four')
    present('::-webkit-input-placeholder')
    present(':-moz-placeholder {')
    present('div::-moz-focus-inner')
    absent('button::-moz-focus-inner')
    present('@-webkit-keyframes progress-bar-stripes')
    present('from {')
    # some day perhaps this can be untangled and parsed too
    present('@import url(other.css)')
def _execute(self, options, args):
    """Apply mincss to the generated site.

    Walks OUTPUT_FOLDER, runs every .html file through one mincss
    Processor, then overwrites each referenced .css file with its
    minified version.
    """
    output_folder = self.site.config['OUTPUT_FOLDER']
    if Processor is None:
        req_missing(['mincss'], 'use the "mincss" command')
        return
    p = Processor(preserve_remote_urls=False)
    urls = []
    css_files = {}
    for root, dirs, files in os.walk(output_folder):
        for f in files:
            url = os.path.join(root, f)
            if url.endswith('.css'):
                # CSS files are keyed by bare filename; duplicates in
                # different directories would be indistinguishable.
                fname = os.path.basename(url)
                if fname in css_files:
                    self.logger.error(
                        "You have two CSS files with the same name and that confuses me."
                    )
                    sys.exit(1)
                css_files[fname] = url
            if not f.endswith('.html'):
                continue
            urls.append(url)
    p.process(*urls)
    for inline in p.links:
        fname = os.path.basename(inline.href)
        # BUG FIX: encode the minified CSS before writing; the file is
        # opened in binary mode but .after is text (fails on Python 3).
        with open(css_files[fname], 'wb+') as outf:
            outf.write(inline.after.encode('utf-8'))
def _execute(self, options, args):
    """Apply mincss to the generated site.

    Walks OUTPUT_FOLDER (following symlinks), runs every .html file
    through one mincss Processor, then overwrites each referenced .css
    file with its minified version.
    """
    output_folder = self.site.config['OUTPUT_FOLDER']
    if Processor is None:
        req_missing(['mincss'], 'use the "mincss" command')
        return
    p = Processor(preserve_remote_urls=False)
    urls = []
    css_files = {}
    for root, dirs, files in os.walk(output_folder, followlinks=True):
        for f in files:
            url = os.path.join(root, f)
            if url.endswith('.css'):
                # CSS files are keyed by bare filename; duplicates in
                # different directories would be indistinguishable.
                fname = os.path.basename(url)
                if fname in css_files:
                    self.logger.error("You have two CSS files with the same name and that confuses me.")
                    sys.exit(1)
                css_files[fname] = url
            if not f.endswith('.html'):
                continue
            urls.append(url)
    p.process(*urls)
    for inline in p.links:
        fname = os.path.basename(inline.href)
        # BUG FIX: encode the minified CSS before writing; the file is
        # opened in binary mode but .after is text (fails on Python 3).
        with open(css_files[fname], 'wb+') as outf:
            outf.write(inline.after.encode('utf-8'))
def run(): p = Processor() for url in urls: p.process(url) for each in p.links: print each.after
def test_before_after(self):
    """Only the :before/:after rules whose base selectors are used stay."""
    url = 'file://' + os.path.join(HERE, 'before-after.html')
    proc = Processor()
    proc.process(url)
    result = proc.inlines[0].after
    ok_('ol li:before { content: "x"; }' in result)
    ok_('ul li:after { content: "x"; }' not in result)
def test_complex_colons_in_selector_expression(self):
    """Colons inside attribute selectors are not mistaken for pseudo-classes."""
    url = 'file://' + os.path.join(HERE, 'complex-selector.html')
    proc = Processor()
    proc.process(url)
    result = proc.inlines[0].after
    for rule in (
        'a[href^="javascript:"] { color: pink; }',
        'a[href^="javascript:"]:after { content: "x"; }',
    ):
        ok_(rule in result)
def run(url):
    """Process *url* with mincss and report timing plus size totals.

    Prints how long processing took and the combined byte size of all
    inline blocks and linked stylesheets before and after minification.
    """
    p = Processor()
    t0 = time.time()
    p.process(url)
    t1 = time.time()
    print("INLINES ".ljust(79, '-'))
    total_size_before = 0
    total_size_after = 0
    # BUG FIX: the accumulation loops were commented out (and one of
    # them was pasted twice), so the totals below always printed 0.0Kb.
    for each in p.inlines:
        total_size_before += len(each.before)
        total_size_after += len(each.after)
    for each in p.links:
        total_size_before += len(each.before)
        total_size_after += len(each.after)
    print(
        "TOOK:".ljust(20),
        "%.5fs" % (t1 - t0)
    )
    print(
        "TOTAL SIZE BEFORE:".ljust(20),
        "%.1fKb" % (total_size_before / 1024.0)
    )
    print(
        "TOTAL SIZE AFTER:".ljust(20),
        "%.1fKb" % (total_size_after / 1024.0)
    )
def test_non_ascii_html(self):
    """Non-ASCII documents come back as unicode with characters intact."""
    page = os.path.join(HERE, "eight.html")
    proc = Processor()
    proc.process("file://" + page)
    result = proc.inlines[0].after
    ok_(isinstance(result, unicode))
    ok_(u"Varf\xf6r st\xe5r det h\xe4r?" in result)
def test_non_ascii_html(self):
    """Processing a non-ASCII page yields unicode output, not bytes."""
    document = os.path.join(HERE, 'eight.html')
    processor = Processor()
    processor.process('file://' + document)
    output = processor.inlines[0].after
    self.assertTrue(isinstance(output, unicode))
    self.assertTrue(u'Varf\xf6r st\xe5r det h\xe4r?' in output)
def test_non_ascii_html(self):
    """The minified CSS of a non-ASCII page is unicode and unmangled."""
    url = 'file://' + os.path.join(HERE, 'eight.html')
    p = Processor()
    p.process(url)
    minified = p.inlines[0].after
    ok_(isinstance(minified, unicode))
    ok_(u'Varf\xf6r st\xe5r det h\xe4r?' in minified)
def test_make_absolute_url(self):
    """Relative, root-relative and protocol-relative hrefs all resolve."""
    p = Processor()
    cases = [
        ("http://www.com/", "./style.css", "http://www.com/style.css"),
        ("http://www.com", "./style.css", "http://www.com/style.css"),
        ("http://www.com", "//cdn.com/style.css", "http://cdn.com/style.css"),
        ("http://www.com/", "//cdn.com/style.css", "http://cdn.com/style.css"),
        ("http://www.com/", "/style.css", "http://www.com/style.css"),
        ("http://www.com/elsewhere", "/style.css", "http://www.com/style.css"),
        ("http://www.com/elsewhere/", "/style.css", "http://www.com/style.css"),
        ("http://www.com/elsewhere/", "./style.css",
         "http://www.com/elsewhere/style.css"),
        ("http://www.com/elsewhere", "./style.css", "http://www.com/style.css"),
    ]
    for base, href, expected in cases:
        eq_(p.make_absolute_url(base, href), expected)
def test_complicated_keyframes(self):
    """Braces stay balanced and only the used pull-* classes survive."""
    page = os.path.join(HERE, "six.html")
    proc = Processor()
    proc.process("file://" + page)
    result = proc.inlines[0].after
    eq_(result.count("{"), result.count("}"))
    ok_(".pull-left" in result)
    ok_(".pull-right" in result)
    ok_(".pull-middle" not in result)
def test_complicated_keyframes(self):
    """Keyframe-heavy CSS minifies without unbalancing braces."""
    url = 'file://' + os.path.join(HERE, 'six.html')
    processor = Processor()
    processor.process(url)
    output = processor.inlines[0].after
    # every opening brace must still have its closing counterpart
    eq_(output.count('{'), output.count('}'))
    for used in ('.pull-left', '.pull-right'):
        ok_(used in output)
    ok_('.pull-middle' not in output)
def gencss(htmldir, htmlfiles):
    """Write minified CSS for each HTML file in *htmlfiles*.

    For every HTML file under *htmldir*, the stylesheets it links are
    minified and written to ../www/css/<htmlbasename>.css.
    """
    cssdir = '../www/css/'
    for name in htmlfiles:
        fullpath = join(htmldir, name)
        base = name.split('.')[0]
        # BUG FIX: a single shared Processor accumulates links across
        # .process() calls, so later HTML files re-wrote every earlier
        # file's CSS under the wrong basename. Use one per file.
        p = Processor(optimize_lookup=True)
        p.process(fullpath)
        cssfile = join(cssdir, base) + ".css"
        for css in p.links:
            # BUG FIX: encode before writing to a binary-mode file so
            # this also works on Python 3.
            with open(cssfile, 'wb') as fh:
                fh.write(css.after.encode('utf-8'))
def test_duplicate_media_queries(self):
    """Two byte-identical media queries must both survive processing.

    This is kinda hackish but it desperately tries to solve
    https://github.com/peterbe/mincss/issues/46
    """
    url = 'file://' + os.path.join(HERE, 'duplicate-media-queries.html')
    proc = Processor()
    proc.process(url)
    needle = '@media screen and (min-width: 600px) {'
    eq_(proc.inlines[0].after.count(needle), 2)
def test_double_classes(self):
    """Chained class selectors are kept or dropped individually."""
    url = 'file://' + os.path.join(HERE, 'five.html')
    proc = Processor()
    proc.process(url)
    result = proc.links[0].after
    eq_(result.count('{'), result.count('}'))
    for used in ('input.span6', '.uneditable-input.span9'):
        ok_(used in result)
    for unused in ('.uneditable-{', '.uneditable-input.span3'):
        ok_(unused not in result)
def test_double_classes(self):
    """Compound (double) class selectors survive only when matched."""
    document = os.path.join(HERE, "five.html")
    processor = Processor()
    processor.process("file://" + document)
    output = processor.links[0].after
    # braces must remain balanced after minification
    eq_(output.count("{"), output.count("}"))
    ok_("input.span6" in output)
    ok_(".uneditable-input.span9" in output)
    ok_(".uneditable-{" not in output)
    ok_(".uneditable-input.span3" not in output)
def test_media_query_simple(self):
    """Rules inside a media query are kept/dropped like normal rules."""
    url = 'file://' + os.path.join(HERE, 'four.html')
    proc = Processor()
    proc.process(url)
    result = proc.links[0].after
    for fragment in ('/* A comment */',
                     '@media (max-width: 900px) {',
                     '.container .two {'):
        ok_(fragment in result, result)
    for fragment in ('.container .nine {', 'a.four'):
        ok_(fragment not in result, result)
def test_media_query_simple(self):
    """A simple @media block keeps its used rules and its comment."""
    document = os.path.join(HERE, "four.html")
    processor = Processor()
    processor.process("file://" + document)
    output = processor.links[0].after
    ok_("/* A comment */" in output, output)
    ok_("@media (max-width: 900px) {" in output, output)
    ok_(".container .two {" in output, output)
    ok_(".container .nine {" not in output, output)
    ok_("a.four" not in output, output)
def run(url):
    """Process *url* with mincss and report timing plus size totals.

    Prints how long processing took and the combined byte size of all
    inline blocks and linked stylesheets before and after minification.
    """
    p = Processor()
    t0 = time.time()
    p.process(url)
    t1 = time.time()
    print("INLINES ".ljust(79, '-'))
    total_size_before = 0
    total_size_after = 0
    # BUG FIX: the accumulation loops were commented out (and one of
    # them was pasted twice), so the totals below always printed 0.0Kb.
    for each in p.inlines:
        total_size_before += len(each.before)
        total_size_after += len(each.after)
    for each in p.links:
        total_size_before += len(each.before)
        total_size_after += len(each.after)
    print("TOOK:".ljust(20), "%.5fs" % (t1 - t0))
    print("TOTAL SIZE BEFORE:".ljust(20), "%.1fKb" % (total_size_before / 1024.0))
    print("TOTAL SIZE AFTER:".ljust(20), "%.1fKb" % (total_size_after / 1024.0))
def test_preserve_remote_urls(self):
    """url() references in the CSS are rewritten to absolute URLs."""
    page = os.path.join(HERE, 'nine.html')
    proc = Processor(preserve_remote_urls=True)
    proc.process('file://' + page)
    result = proc.links[0].after
    ok_("url('http://www.google.com/north.png')" in result)
    south = 'file://' + HERE + '/deeper/south.png'
    ok_('url("%s")' % south in result)
    # since local file URLs don't have a domain, this is actually expected
    ok_('url("file:///east.png")' in result)
    west = 'file://' + HERE + '/west.png'
    ok_('url("%s")' % west in result)
def test_preserve_remote_urls(self):
    """With preserve_remote_urls=True, url() paths become absolute."""
    document = os.path.join(HERE, "nine.html")
    processor = Processor(preserve_remote_urls=True)
    processor.process("file://" + document)
    output = processor.links[0].after
    ok_("url('http://www.google.com/north.png')" in output)
    for relative in ("/deeper/south.png", "/west.png"):
        absolute = "file://" + HERE + relative
        ok_('url("%s")' % absolute in output)
    # since local file URLs don't have a domain, this is actually expected
    ok_('url("file:///east.png")' in output)
def test_ignore_annotations(self):
    """no-mincss annotations control which comments/rules survive."""
    url = 'file://' + os.path.join(HERE, 'seven.html')
    proc = Processor()
    proc.process(url)
    result = proc.inlines[0].after
    eq_(result.count('{'), result.count('}'))
    kept = [
        '/* Leave this comment as is */',
        '/* Lastly leave this as is */',
        '/* Also stick around */',
        '/* leave untouched */',
        '.north',
        '.south',
        '.west',
    ]
    dropped = ['.east', 'no mincss']
    for fragment in kept:
        ok_(fragment in result)
    for fragment in dropped:
        ok_(fragment not in result)
def test_ignore_annotations(self):
    """Annotated comments stay in place; annotation markers themselves go."""
    document = os.path.join(HERE, "seven.html")
    processor = Processor()
    processor.process("file://" + document)
    output = processor.inlines[0].after
    # braces must remain balanced
    eq_(output.count("{"), output.count("}"))
    ok_("/* Leave this comment as is */" in output)
    ok_("/* Lastly leave this as is */" in output)
    ok_("/* Also stick around */" in output)
    ok_("/* leave untouched */" in output)
    ok_(".north" in output)
    ok_(".south" in output)
    ok_(".east" not in output)
    ok_(".west" in output)
    ok_("no mincss" not in output)
def run(args):
    """Run mincss on ``args.url`` and dump the results.

    Prints every inline block before/after, writes each linked
    stylesheet (minified and original) into ``args.outputdir``, and
    returns 0 as a shell-style exit code.
    """
    # Optional phantomjs backend: an explicit path wins over the bare flag.
    options = {'debug': args.verbose}
    if args.phantomjs_path:
        options['phantomjs'] = args.phantomjs_path
    elif args.phantomjs:
        options['phantomjs'] = True
    p = Processor(**options)
    t0 = time.time()
    p.process(args.url)
    t1 = time.time()
    print("TOTAL TIME ", t1 - t0)
    # Inline <style> blocks are only printed, not written to disk.
    for inline in p.inlines:
        print("ON", inline.url)
        print("AT line", inline.line)
        print("BEFORE ".ljust(79, '-'))
        print(inline.before)
        print("AFTER ".ljust(79, '-'))
        print(inline.after)
        print()
    output_dir = args.outputdir
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for link in p.links:
        print("FOR", link.href)
        #print("BEFORE ".ljust(79, '-'))
        #print(link.before)
        #print("AFTER ".ljust(79, '-'))
        #print(link.after)
        # Minified CSS is written under the stylesheet's original basename...
        orig_name = link.href.split('/')[-1]
        fn = os.path.join(output_dir, orig_name)
        with codecs.open(fn, 'w', 'utf-8') as f:
            f.write(link.after)
        # ...and the untouched original next to it with a 'before_' prefix.
        before_name = 'before_' + link.href.split('/')[-1]
        fn = os.path.join(output_dir, before_name)
        with codecs.open(fn, 'w', 'utf-8') as f:
            f.write(link.before)
        print("Files written to", output_dir)
        print()
        # Per-link savings summary in bytes.
        print(
            '(from %d to %d saves %d)' %
            (len(link.before),
             len(link.after),
             len(link.before) - len(link.after))
        )
    return 0
def test_nth_child(self):
    """Structural and user-action pseudo-classes are never discarded."""
    url = 'file://' + os.path.join(HERE, 'nth-child.html')
    proc = Processor()
    proc.process(url)
    result = proc.inlines[0].after
    kept = [
        # These mouse related ones should stay, even though they're
        # currently NOT being acted upon with some input device.
        'a.actually:hover { font-weight: bold; }',
        'a.actually:visited { font-weight: bold; }',
        'a.actually:link { font-weight: bold; }',
        'a.actually:focus { font-weight: bold; }',
        'a.actually:active { font-weight: bold; }',
        # the other selectors with : in them should also stay
        'div > :first-child { color: pink; }',
        'div > :last-child { color: brown; }',
        'div > :not(p) { color: blue; }',
        'div > :nth-child(2) { color: red; }',
    ]
    for rule in kept:
        ok_(rule in result)
def run(): p = Processor() p.process(URL) print "/* LINKS ".ljust(797, '-') for each in p.links: print ("/* On href %s */" % each.href) print print each.after print print print "/* INLINES ".ljust(77, '-') + "*/" for each in p.inlines: print ("/* On line %s */" % each.line) print print each.after print print
def test_just_one_link(self):
    """A single linked stylesheet minifies to exactly the used rules."""
    url = 'file://' + os.path.join(HERE, 'two.html')
    proc = Processor()
    proc.process(url)
    # two.html only has 1 link CSS ref
    link = proc.links[0]
    eq_(link.href, 'two.css')
    ok_(len(link.after) < len(link.before))
    expect = '''
    body, html { margin: 0; }
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    h2 { color:red }
    '''
    # compare line by line, ignoring surrounding whitespace
    lines_after = link.after.splitlines()
    for i, line in enumerate(expect.strip().splitlines()):
        eq_(line.strip(), lines_after[i].strip())
def test_download_with_phantomjs(self):
    """Pages fetched through phantomjs minify exactly like local ones."""
    page = os.path.join(HERE, "one.html")
    proc = Processor(phantomjs=PHANTOMJS,
                     phantomjs_options={"cookies-file": "bla"})
    proc.process("file://" + page)
    # one.html has a single inline CSS block, starting on line 7
    inline = proc.inlines[0]
    eq_(inline.line, 7)
    ok_(len(inline.after) < len(inline.before))
    expect = """
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    h2 { color:red }
    """
    # compare line by line, ignoring surrounding whitespace
    lines_after = inline.after.strip().splitlines()
    for i, line in enumerate(expect.strip().splitlines()):
        eq_(line.strip(), lines_after[i].strip())
def test_just_inline(self):
    """A single inline <style> block is minified down to used rules."""
    url = 'file://' + os.path.join(HERE, 'one.html')
    processor = Processor()
    processor.process(url)
    # one.html has exactly one inline CSS block, starting on line 7
    block = processor.inlines[0]
    eq_(block.line, 7)
    ok_(len(block.after) < len(block.before))
    expected = '''
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    h2 { color:red }
    '''
    # compare line by line, ignoring surrounding whitespace
    got = block.after.strip().splitlines()
    for i, line in enumerate(expected.strip().splitlines()):
        eq_(line.strip(), got[i].strip())
def test_download_with_phantomjs(self):
    """The phantomjs backend produces the same minified inline CSS."""
    url = 'file://' + os.path.join(HERE, 'one.html')
    processor = Processor(
        phantomjs=PHANTOMJS,
        phantomjs_options={'cookies-file': 'bla'},
    )
    processor.process(url)
    # one.html has exactly one inline CSS block, starting on line 7
    block = processor.inlines[0]
    eq_(block.line, 7)
    ok_(len(block.after) < len(block.before))
    expected = '''
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    h2 { color:red }
    '''
    # compare line by line, ignoring surrounding whitespace
    got = block.after.strip().splitlines()
    for i, line in enumerate(expected.strip().splitlines()):
        eq_(line.strip(), got[i].strip())
def run(args):
    """Run mincss on ``args.url`` and write results into ``args.outputdir``.

    Prints every inline block before/after, writes each linked
    stylesheet (minified and original) into the output directory, and
    returns 0 as a shell-style exit code.
    """
    # Optional phantomjs backend: an explicit path wins over the bare flag.
    options = {'debug': args.verbose}
    if args.phantomjs_path:
        options['phantomjs'] = args.phantomjs_path
    elif args.phantomjs:
        options['phantomjs'] = True
    p = Processor(**options)
    t0 = time.time()
    p.process(args.url)
    t1 = time.time()
    print('TOTAL TIME ', t1 - t0)
    # Inline <style> blocks are only printed, not written to disk.
    for inline in p.inlines:
        print('ON', inline.url)
        print('AT line', inline.line)
        print('BEFORE '.ljust(79, '-'))
        print(inline.before)
        print('AFTER '.ljust(79, '-'))
        print(inline.after)
        print()
    output_dir = args.outputdir
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for link in p.links:
        print('FOR', link.href)
        # Minified CSS is written under the stylesheet's original basename...
        orig_name = link.href.split('/')[-1]
        with io.open(os.path.join(output_dir, orig_name), 'w') as f:
            f.write(link.after)
        # ...and the untouched original next to it with a 'before_' prefix.
        before_name = 'before_' + link.href.split('/')[-1]
        with io.open(os.path.join(output_dir, before_name), 'w') as f:
            f.write(link.before)
        print('Files written to', output_dir)
        print()
        # Per-link savings summary in bytes.
        print(
            '(from %d to %d saves %d)'
            % (len(link.before), len(link.after),
               len(link.before) - len(link.after))
        )
    return 0
def test_one_link_two_different_pages(self):
    """Selectors used on either of two pages sharing a CSS file are kept."""
    url1 = 'file://' + os.path.join(HERE, 'two.html')
    url2 = 'file://' + os.path.join(HERE, 'two_half.html')
    proc = Processor()
    proc.process(url1, url2)
    # two.html only has 1 link CSS ref
    link = proc.links[0]
    eq_(link.href, 'two.css')
    #eq_(link.url, url1.replace('.html', '.css'))
    ok_(len(link.after) < len(link.before))
    expect = '''
    body, html { margin: 0; }
    h1, h2, h3 { text-align: center; }
    h3 { font-family: serif; }
    .foobar { delete:me }
    .foobar, h2 { color:red }
    '''
    # compare line by line, ignoring surrounding whitespace
    lines_after = link.after.splitlines()
    for i, line in enumerate(expect.strip().splitlines()):
        eq_(line.strip(), lines_after[i].strip())
def test_html_with_totally_empty_style_tag(self):
    """An empty <style> tag produces no inline results at all."""
    page = os.path.join(HERE, 'one-3.html')
    proc = Processor()
    proc.process('file://' + page)
    eq_(proc.inlines, [])
def test_make_absolute_url(self):
    # NOTE(review): this variant only constructs a Processor and asserts
    # nothing -- presumably a truncated copy; confirm against upstream.
    processor = Processor()
def test_no_mincss_inline(self):
    """An inline block annotated with no-mincss is left untouched."""
    url = 'file://' + os.path.join(HERE, 'no-mincss-inline.html')
    proc = Processor()
    proc.process(url)
    first = proc.inlines[0]
    eq_(first.before, first.after)
def test_ignore_inline(self):
    """Inline blocks marked to be ignored never show up in .inlines."""
    page = os.path.join(HERE, 'ignore-inline.html')
    proc = Processor()
    proc.process('file://' + page)
    assert not proc.inlines
def process(urls):
    """Run a Processor over every URL in *urls* and return it."""
    proc = Processor()
    proc.process(*urls)
    return proc
def proxy(path):
    """Fetch *path* as a remote URL, run mincss over it and return the
    rewritten HTML.

    The minified linked stylesheets are written into a dated cache
    directory and the page's <link>/<img>/<script>/<a> references are
    rewritten so the proxied page still works when served from here.
    Optionally injects a stats summary when MINCSS_STATS is requested.
    """
    if path == 'favicon.ico':
        abort(404)
    # Treat the captured path as the target URL; default to http://
    # and re-attach the original query string.
    url = path
    if not path.count('://'):
        url = 'http://' + url
    query = urlparse.urlparse(request.url).query
    if query:
        url += '?%s' % query
    logging.info('Downloading %s' % url)
    t0 = time.time()
    # NOTE(review): download() and mkdir() are helpers defined elsewhere
    # in this module -- confirm their error behavior before relying on it.
    html = download(url)
    t1 = time.time()
    print "%.4f seconds to download" % (t1 - t0)
    p = Processor(debug=False, optimize_lookup=True)
    # since we've already download the HTML
    t0 = time.time()
    p.process_html(html, url)
    t1 = time.time()
    p.process()
    t2 = time.time()
    print "%.4f seconds to parse and process" % (t2 - t1)
    collect_stats = request.args.get('MINCSS_STATS', False)
    stats = []
    css_url_regex = re.compile('url\(([^\)]+)\)')

    def css_url_replacer(match, href=None):
        # Rewrite a css url(...) reference to an absolute URL; leave
        # data URIs, already-absolute URLs and the IE '.' hack alone.
        filename = match.groups()[0]
        bail = match.group()
        if ((filename.startswith('"') and filename.endswith('"')) or
                (filename.startswith("'") and filename.endswith("'"))):
            filename = filename[1:-1]
        if 'data:image' in filename or '://' in filename:
            return bail
        if filename == '.':
            # this is a known IE hack in CSS
            return bail
        #if not filename.startswith('/'):
        #    filename = os.path.normpath(
        #        os.path.join(
        #            os.path.dirname(href),
        #            filename
        #        )
        #    )
        new_filename = urlparse.urljoin(url, filename)
        return 'url("%s")' % new_filename

    # Replace each inline block in the raw HTML with its minified,
    # URL-rewritten version.
    for i, each in enumerate(p.inlines):
        # this should be using CSSSelector instead
        new_inline = each.after
        new_inline = css_url_regex.sub(
            functools.partial(css_url_replacer, href=url),
            new_inline)
        stats.append(('inline %s' % (i + 1), each.before, each.after))
        html = html.replace(each.before, new_inline)

    parser = etree.HTMLParser()
    stripped = html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    was_doctype = tree.docinfo.doctype
    #root = tree if stripped.startswith(tree.docinfo.doctype) else page
    links = dict((x.href, x) for x in p.links)
    #all_lines = html.splitlines()
    # Write each minified linked stylesheet into a dated cache dir and
    # point the <link> tag at the cached copy.
    for link in CSSSelector('link')(page):
        if (link.attrib.get('rel', '') == 'stylesheet' or
                link.attrib['href'].lower().split('?')[0].endswith('.css')):
            # Short content-addressed-ish name: md5 of url + href.
            hash_ = hashlib.md5(url + link.attrib['href']).hexdigest()[:7]
            now = datetime.date.today()
            destination_dir = os.path.join(
                CACHE_DIR,
                str(now.year),
                str(now.month),
                str(now.day),
            )
            mkdir(destination_dir)
            new_css = links[link.attrib['href']].after
            stats.append((
                link.attrib['href'],
                links[link.attrib['href']].before,
                links[link.attrib['href']].after))
            new_css = css_url_regex.sub(
                functools.partial(css_url_replacer, href=link.attrib['href']),
                new_css)
            destination = os.path.join(destination_dir, hash_ + '.css')
            with codecs.open(destination, 'w', 'utf-8') as f:
                f.write(new_css)
            link.attrib['href'] = (
                '/cache%s' % destination.replace(CACHE_DIR, ''))
    # Make image/script sources absolute so they still load through
    # the proxy.
    for img in CSSSelector('img, script')(page):
        if 'src' in img.attrib:
            orig_src = urlparse.urljoin(url, img.attrib['src'])
            img.attrib['src'] = orig_src
    # Rewrite internal anchors so navigation stays inside the proxy.
    for a in CSSSelector('a')(page):
        if 'href' not in a.attrib:
            continue
        href = a.attrib['href']
        if ('://' in href or href.startswith('#') or
                href.startswith('javascript:')):
            continue
        if href.startswith('/'):
            a.attrib['href'] = (
                '/' +
                urlparse.urljoin(url, a.attrib['href'])
                .replace('http://', ''))
        #else:
        if collect_stats:
            a.attrib['href'] = add_collect_stats_qs(
                a.attrib['href'], collect_stats)
    html = etree.tostring(page, method='html')
    if collect_stats:
        # Inject the stats summary right after the opening <body> tag.
        html = re.sub(
            '<body[^>]*>',
            lambda m: m.group() + summorize_stats_html(stats),
            html,
            flags=re.I | re.M,
            count=1)
    return (was_doctype and was_doctype or '') + '\n' + html
def test_ignore_link(self):
    """Linked stylesheets marked to be ignored never show up in .links."""
    page = os.path.join(HERE, 'ignore-link.html')
    proc = Processor()
    proc.process('file://' + page)
    assert not proc.links
def test_make_absolute_url(self):
    """make_absolute_url resolves relative, root- and protocol-relative hrefs."""
    processor = Processor()
    checks = (
        ('http://www.com/', './style.css', 'http://www.com/style.css'),
        ('http://www.com', './style.css', 'http://www.com/style.css'),
        ('http://www.com', '//cdn.com/style.css', 'http://cdn.com/style.css'),
        ('http://www.com/', '//cdn.com/style.css', 'http://cdn.com/style.css'),
        ('http://www.com/', '/style.css', 'http://www.com/style.css'),
        ('http://www.com/elsewhere', '/style.css', 'http://www.com/style.css'),
        ('http://www.com/elsewhere/', '/style.css', 'http://www.com/style.css'),
        ('http://www.com/elsewhere/', './style.css',
         'http://www.com/elsewhere/style.css'),
        ('http://www.com/elsewhere', './style.css',
         'http://www.com/style.css'),
    )
    for base, href, wanted in checks:
        eq_(processor.make_absolute_url(base, href), wanted)