def testUnicodeHTML1(self):
    """Slimming an EUC-JP page must keep the text as a unicode object."""
    directory = os.path.dirname(__file__)
    fixture = os.path.join(directory, 'euc-jp.html')
    # codecs.open() with an explicit encoding yields unicode, not bytes.
    original = codecs.open(fixture, 'r', 'euc-jp').read()
    assert isinstance(original, unicode)
    slimmed = slimmer.html_slimmer(original)
    # The slimmer must not coerce unicode back to a byte string.
    assert isinstance(slimmed, unicode)
def testUnicodeHTML1(self):
    """Slimming an EUC-JP page must keep the text as str (Python 3).

    Uses the builtin open() with an explicit encoding instead of
    codecs.open(): same decoded result for this fixture, and the
    ``with`` block closes the file deterministically (the original
    codecs.open(...).read() leaked the handle).
    """
    here = os.path.dirname(__file__)
    with open(os.path.join(here, 'euc-jp.html'), 'r', encoding='euc-jp') as fh:
        before = fh.read()
    assert isinstance(before, str)
    after = slimmer.html_slimmer(before)
    # The slimmer must not coerce str to bytes.
    assert isinstance(after, str)
def testUnicodeHTML2(self):
    """html_slimmer collapses inter-tag whitespace in a UTF-8 page."""
    base = os.path.dirname(__file__)
    # codecs.open() with an explicit encoding yields unicode, not bytes.
    source = codecs.open(os.path.join(base, 'utf-8.html'), 'r', 'utf-8').read()
    assert isinstance(source, unicode)
    result = slimmer.html_slimmer(source)
    assert isinstance(result, unicode)
    # Thai greeting must survive intact; only layout whitespace goes.
    expected = u'<html><p>\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35\u0e04\u0e23\u0e31\u0e1a</p></html>'
    assert result == expected
def testUnicodeHTML2(self):
    """html_slimmer collapses inter-tag whitespace in a UTF-8 page (Python 3).

    Uses the builtin open() with an explicit encoding instead of
    codecs.open(), and a ``with`` block so the file handle is closed
    (the original codecs.open(...).read() leaked it).
    """
    here = os.path.dirname(__file__)
    with open(os.path.join(here, 'utf-8.html'), 'r', encoding='utf-8') as fh:
        before = fh.read()
    assert isinstance(before, str)
    after = slimmer.html_slimmer(before)
    assert isinstance(after, str)
    # Thai greeting must survive intact; only layout whitespace goes.
    expect = '<html><p>\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e35\u0e04\u0e23\u0e31\u0e1a</p></html>'
    assert after == expect
def main(url, language):
    """Transcode *url* with simplr, whitespace-slim the HTML, and print it.

    NOTE(review): ``title`` and ``images`` returned by simplr.convert()
    are unpacked but never used here — confirm whether they should be
    printed as well.
    """
    from newsman.processor import simplr
    title, content, images = simplr.convert(url, language)
    # Earlier regex-based whitespace-collapsing experiment, kept for reference:
    #import re
    #a = re.sub(">\s+<", "><", unicode(content))
    from slimmer import html_slimmer
    # html_slimmer removes redundant whitespace between tags.
    content = html_slimmer(content)
    print "--------------------------------------------------------------------"
    print str(content)
def write_page(path, template, title, **args):
    """Render *template* inside the index layout, slim it, and write to *path*."""
    # Prefix the site name, or fall back to the bare site name when no
    # page-specific title was supplied.
    title = 'Under The Radar: ' + title if title else 'Under The Radar'
    body = templates.get_template(template).render(args)
    page = templates.get_template('index.html').render(content=body, title=title)
    with open_out(path) as out:
        out.write(slimmer.html_slimmer(page))
def render(self, context):
    """Render the wrapped nodes and minify them according to ``self.format``.

    Falls back to guessSyntax() when the format is not recognized, and
    returns the rendered code untouched when it still cannot be
    classified.
    """
    code = self.nodelist.render(context)
    # Resolve the format up front instead of recursing: the original
    # implementation re-entered render() after guessing, which both
    # re-rendered the nodelist and looped forever whenever guessSyntax()
    # returned a format this dispatcher does not handle (e.g. 'xhtml').
    if self.format not in ('css', 'js', 'javascript', 'html'):
        guessed = guessSyntax(code)
        if guessed:
            self.format = guessed
    if self.format == 'css':
        return css_slimmer(code)
    if self.format in ('js', 'javascript'):
        return js_slimmer(code)
    if self.format == 'html':
        return html_slimmer(code)
    # Unknown format even after guessing: deliver unminified.
    return code
def render(self, context):
    """Minify the rendered node contents with slimmer, if it is installed."""
    code = self.nodelist.render(context)
    # slimmer is an optional dependency; pass content through untouched.
    if slimmer is None:
        return code
    if self.format not in ('css', 'js', 'html', 'xhtml'):
        self.format = guessSyntax(code)
    fmt = self.format
    if fmt == 'css':
        return css_slimmer(code)
    if fmt in ('js', 'javascript'):
        return js_slimmer(code)
    if fmt == 'xhtml':
        return xhtml_slimmer(code)
    if fmt == 'html':
        return html_slimmer(code)
    # Unknown format even after guessing: deliver unminified.
    return code
def render(self, context):
    """Render and minify the node contents, raising on unknown formats.

    Raises:
        TemplateSyntaxError: when the declared format is unrecognized and
            slimmer.guessSyntax() cannot classify the rendered content.
    """
    code = self.nodelist.render(context)
    # slimmer is an optional dependency; pass content through untouched.
    if slimmer is None:
        return code
    if self.format not in ('css', 'js', 'html', 'xhtml'):
        self.format = slimmer.guessSyntax(code)
    if self.format == 'css':
        return slimmer.css_slimmer(code)
    elif self.format in ('js', 'javascript'):
        return slimmer.js_slimmer(code)
    elif self.format == 'xhtml':
        return slimmer.xhtml_slimmer(code)
    elif self.format == 'html':
        return slimmer.html_slimmer(code)
    else:
        raise TemplateSyntaxError("Unrecognized format for slimming content")
    # (The original had an unreachable ``return code`` here — every branch
    # above returns or raises — removed as dead code.)
def render(self, context):
    """Render and minify the node contents, raising on unknown formats."""
    code = self.nodelist.render(context)
    if slimmer is None:
        # Library not installed: deliver the content as-is.
        return code
    if self.format not in ("css", "js", "html", "xhtml"):
        self.format = slimmer.guessSyntax(code)
    fmt = self.format
    if fmt == "css":
        return slimmer.css_slimmer(code)
    if fmt in ("js", "javascript"):
        return slimmer.js_slimmer(code)
    if fmt == "xhtml":
        return slimmer.xhtml_slimmer(code)
    if fmt == "html":
        return slimmer.html_slimmer(code)
    raise TemplateSyntaxError("Unrecognized format for slimming content")
def find_images(content=None, referer=None):
    """Collect every <img> in *content*, normalizing each src URL.

    Returns a (normalized_images, content) pair; if any image src was
    rewritten, *content* is re-serialized and slimmed first.  On any
    failure the first element is None and content is returned unchanged.
    """
    if not content:
        logger.error('Content/HTML is found VOID!')
        return None, content
    try:
        # Python 2: accept both byte strings and unicode objects.
        if isinstance(content, str) or isinstance(content, unicode):
            soup = BeautifulSoup(content.decode('utf-8', 'ignore'))
            normalized_images = []
            element_replaced = False
            for image in soup.findAll('img'):
                if image.get('src'):
                    # find_image() resolves src against *referer*;
                    # presumably it also fetches size info — confirm in
                    # its own docs.
                    normalized_image = find_image(image.get('src'), referer)
                    if normalized_image:
                        # replace original image link with clean and (local)
                        # copy
                        if 'original_url' in normalized_image and \
                                normalized_image['original_url']:
                            image['src'] = str(normalized_image['url'])
                            element_replaced = True
                        normalized_images.append(normalized_image)
            # Re-serialize the (possibly mutated) tree as UTF-8 bytes.
            content_new = soup.prettify(encoding='utf-8')
            if element_replaced and content_new:
                # Unescape HTML entities, URL-unquote, then strip
                # redundant whitespace before handing the markup back.
                content = str(
                    html_slimmer(urllib2.unquote(
                        hparser.unescape(content_new))))
            return normalized_images, content
        else:
            logger.info("Wrong format %s" % content)
            return None, content
    except Exception as k:
        logger.error("Problem [%s] Source [%s]" % (str(k), content))
        return None, content
def find_images(content=None, referer=None):
    """Collect every <img> in *content*, normalizing each src URL.

    Returns a (normalized_images, content) pair; if any image src was
    rewritten, *content* is re-serialized and slimmed first.  On any
    failure the first element is None and content is returned unchanged.
    """
    if not content:
        logger.error('Content/HTML is found VOID!')
        return None, content
    try:
        # Python 2: accept both byte strings and unicode objects.
        if isinstance(content, str) or isinstance(content, unicode):
            soup = BeautifulSoup(content.decode('utf-8', 'ignore'))
            normalized_images = []
            element_replaced = False
            for image in soup.findAll('img'):
                if image.get('src'):
                    # find_image() resolves src against *referer*;
                    # presumably it also fetches size info — confirm in
                    # its own docs.
                    normalized_image = find_image(image.get('src'), referer)
                    if normalized_image:
                        # replace original image link with clean and (local)
                        # copy
                        if 'original_url' in normalized_image and \
                                normalized_image['original_url']:
                            image['src'] = str(normalized_image['url'])
                            element_replaced = True
                        normalized_images.append(normalized_image)
            # Re-serialize the (possibly mutated) tree as UTF-8 bytes.
            content_new = soup.prettify(encoding='utf-8')
            if element_replaced and content_new:
                # Unescape HTML entities, URL-unquote, then strip
                # redundant whitespace before handing the markup back.
                content = str(
                    html_slimmer(
                        urllib2.unquote(hparser.unescape(content_new))))
            return normalized_images, content
        else:
            logger.info("Wrong format %s" % content)
            return None, content
    except Exception as k:
        logger.error("Problem [%s] Source [%s]" % (str(k), content))
        return None, content
def convert(language="en", title=None, link=None, updated=None, feed=None,
            transcoder="chengdujin", relative_path=None, stdout=False):
    """Transcode *link*, slim the content, and embed it in the template.

    Returns (web_path, local_path, content, images) when ``stdout`` is
    False, or (title, content) when ``stdout`` is True; all-None tuples
    of the matching shape signal failure.
    """
    if not language or not link:
        logger.error('Method malformed! language: %s link: %s' %
                     (language, link))
        if not stdout:
            return None, None, None, None
        else:
            return None, None
    try:
        link_clean = _preprocess(link)
        if link_clean:
            # this wont suck
            transcoders = _organize_transcoders(transcoder)
            title_new, content, images = _transcode(link_clean, transcoders,
                                                    language)
            # remove null content
            content = content.strip() if content else None
            # in case no title is found from feed information
            if not title:
                title = title_new
            if content and title:
                # slimmer the content
                content = html_slimmer(content)
                if not stdout:
                    # embed content in template
                    news = _compose(language, title, updated, feed,
                                    _sanitize(content), images)
                    if news:
                        # create web/local path
                        web_path, local_path = _save(news, relative_path)
                        if web_path:
                            # the FINAL return
                            return web_path, local_path, content, images
                        else:
                            if not stdout:
                                return None, None, None, None
                            else:
                                return None, None
                    else:
                        logger.error(
                            'Cannot combine content with the template!')
                        if not stdout:
                            return None, None, None, None
                        else:
                            return None, None
                else:
                    return title, content
            else:
                if not content:
                    logger.info('Transcoder %s failed for %s' %
                                (transcoder, link_clean))
                else:
                    logger.info('Cannot find title for %s' % link_clean)
                if not stdout:
                    # original link is returned as transcoded path
                    # BUG FIX: the %s placeholder previously had no
                    # argument, so the literal format string was logged.
                    logger.info('Original link %s is used as transcoded path'
                                % link_clean)
                    return link_clean, None, None, None
                else:
                    return None, None
        else:
            logger.error('Link [clean %s] [original %s] cannot be parsed' %
                         (link_clean, link))
            if not stdout:
                return None, None, None, None
            else:
                return None, None
    except Exception as k:
        logger.error(str(k))
        if not stdout:
            return None, None, None, None
        else:
            return None, None
# Inline every <script> (remote src or embedded body) into one blob,
# stripping the original tags from the page.  NOTE(review): assumes
# all_script / html / total_size were initialized earlier (outside this
# fragment) — confirm all_script already opens with a '<script>' tag,
# since only the closing '</script>' is appended below.
for script in soup.findAll('script'):
    src = script.get('src')
    if (src != None):
        # Remote script: fetch its source over the network (Python 2 urllib).
        all_script += urllib.urlopen(src).read()
    else:
        all_script += script.text
    # Remove the original tag from the page markup.
    html = html.replace(str(script), '')

total_size += len(all_script.encode('utf-8'))
all_script += '</script>'
# Minify the collected JavaScript and re-inject it just before </body>.
all_minified_script = jsmin(all_script)
html = html.replace('</body>', '%s</body>' % all_minified_script)

# Minify the resulting html
minified_html = html_slimmer(html)

# Create cpp header string.  BUG FIX: the guard macro defined below was
# '_webpage_h__' (single leading underscore) while the #ifndef tested
# '__webpage_h__', so the include guard never actually took effect.
cpp_string = '#ifndef __webpage_h__\n'
cpp_string += '#define __webpage_h__\n'
cpp_string += 'PROGMEM extern const String html = R"~(' + minified_html.replace(
    '\n', '') + ')~";\n'
cpp_string += '#endif'

absolute_output_path = os.path.join(current_directory, output_path)
amount_of_bytes = 0
with open(absolute_output_path, 'w+') as output_file:
    # Write string to header file
    output_file.write(cpp_string)
    output_file.flush()
    # Calculate the size of our file
    amount_of_bytes = os.path.getsize(absolute_output_path)
def generate(self, *args, **kw):
    """Delegate to the wrapped template, then whitespace-slim its output."""
    rendered = self.template.generate(*args, **kw)
    return html_slimmer(rendered)
def minify_html_proc(content):
    """Minify *content* with htmlmin, falling back to html_slimmer.

    NOTE(review): the htmlmin path returns UTF-8 bytes while the
    fallback returns whatever html_slimmer yields — callers may see
    either type; confirm that asymmetry is intended.
    """
    try:
        return htmlmin.minify(content, remove_comments=True,
                              remove_empty_space=True).encode('utf-8')
    except Exception:
        # Narrowed from a bare ``except:`` so SystemExit and
        # KeyboardInterrupt are no longer swallowed here.
        # Fallback: flatten newlines/tabs/CRs, then let slimmer collapse
        # the remaining whitespace.
        return html_slimmer(content.strip().replace('\n', ' ')
                            .replace('\t', ' ').replace('\r', ' '))
def convert(language="en", title=None, link=None, updated=None, feed=None,
            transcoder="chengdujin", relative_path=None, stdout=False):
    """Transcode *link*, slim the content, and embed it in the template.

    Returns (web_path, local_path, content, images) when ``stdout`` is
    False, or (title, content) when ``stdout`` is True; all-None tuples
    of the matching shape signal failure.
    """
    if not language or not link:
        logger.error('Method malformed! language: %s link: %s' %
                     (language, link))
        if not stdout:
            return None, None, None, None
        else:
            return None, None
    try:
        link_clean = _preprocess(link)
        if link_clean:
            # this wont suck
            transcoders = _organize_transcoders(transcoder)
            title_new, content, images = _transcode(
                link_clean, transcoders, language)
            # remove null content
            content = content.strip() if content else None
            # in case no title is found from feed information
            if not title:
                title = title_new
            if content and title:
                # slimmer the content
                content = html_slimmer(content)
                if not stdout:
                    # embed content in template
                    news = _compose(
                        language, title, updated, feed, _sanitize(content),
                        images)
                    if news:
                        # create web/local path
                        web_path, local_path = _save(news, relative_path)
                        if web_path:
                            # the FINAL return
                            return web_path, local_path, content, images
                        else:
                            if not stdout:
                                return None, None, None, None
                            else:
                                return None, None
                    else:
                        logger.error(
                            'Cannot combine content with the template!')
                        if not stdout:
                            return None, None, None, None
                        else:
                            return None, None
                else:
                    return title, content
            else:
                if not content:
                    logger.info('Transcoder %s failed for %s' %
                                (transcoder, link_clean))
                else:
                    logger.info('Cannot find title for %s' % link_clean)
                if not stdout:
                    # original link is returned as transcoded path
                    # BUG FIX: the %s placeholder previously had no
                    # argument, so the literal format string was logged.
                    logger.info('Original link %s is used as transcoded path'
                                % link_clean)
                    return link_clean, None, None, None
                else:
                    return None, None
        else:
            logger.error(
                'Link [clean %s] [original %s] cannot be parsed' % (
                    link_clean, link))
            if not stdout:
                return None, None, None, None
            else:
                return None, None
    except Exception as k:
        logger.error(str(k))
        if not stdout:
            return None, None, None, None
        else:
            return None, None
def __slimmer(self, htmlFile):
    """Drop local CSS <link> and <script src> lines, then slim the joined HTML."""
    kept = [line for line in htmlFile
            if line[0:16] != '<link href="/css'
            and line[0:11] != '<script src']
    return html_slimmer("".join(kept), True)
# try to handle additional types unknown to mimetypes.guess_type() if fileext == ".tpl": mimetype = "text/html" else: mimetype = "application/octet-stream" # get raw file data with open(filepath, "rb") as fr: filedata = fr.read() oldfilesize = len(filedata) # can I remove CR, LF, Tabs? if do_slimmer: if fileext in [".tpl", ".html", ".htm"]: filedata = slimmer.html_slimmer(filedata) elif fileext in [".css"]: filedata = slimmer.css_slimmer(filedata) elif fileext in [".js"]: filedata = slimmer.js_slimmer(filedata) print "Adding {} mimetype = ({}) size = {} reduced size = {}".format(filename, mimetype, oldfilesize, len(filedata)) # filename length, mime tpye length, file content length fw.write(struct.pack("<BBH", len(filename) + 1, len(mimetype) + 1, len(filedata))) # filename data fw.write(struct.pack(str(len(filename)) + "sB", filename, 0x00)) # mime type data fw.write(struct.pack(str(len(mimetype)) + "sB", mimetype, 0x00))
import os,sys import json import datetime import time import calendar import arrow import os #Y-m-d H:i:s ms tz #print datetime.timedelta(3600*8) #print result html="" for line in open("test.html"): html+=line #print html import slimmer html=slimmer.js_slimmer(html) html=slimmer.html_slimmer(html) print html
# try to handle additional types unknown to mimetypes.guess_type() if fileext == ".tpl": mimetype = "text/html" else: mimetype = "application/octet-stream" # get raw file data with open(filepath, "rb") as fr: filedata = fr.read() oldfilesize = len(filedata) # can I remove CR, LF, Tabs? if do_slimmer: if fileext in [".tpl", ".html", ".htm"]: filedata = slimmer.html_slimmer(filedata) elif fileext in [".css"]: filedata = slimmer.css_slimmer(filedata) elif fileext in [".js"]: filedata = slimmer.js_slimmer(filedata) print "Adding {} mimetype = ({}) size = {} reduced size = {}".format( filename, mimetype, oldfilesize, len(filedata)) # flags fw.write(struct.pack("B", 0)) # filename length, mime tpye length, file content length fw.write( struct.pack("<BBH", len(filename) + 1,
import slimmer
import sys

if __name__ == '__main__':
    # Minify every file named on the command line, rewriting it in place.
    handlers = (
        ('.html', slimmer.html_slimmer),
        ('.css', slimmer.css_slimmer),
        ('.js', slimmer.js_slimmer),
    )
    for path in sys.argv[1:]:
        with open(path, 'r') as source:
            text = source.read()
        for suffix, minify in handlers:
            if path.endswith(suffix):
                text = minify(text)
                break
        else:
            # Unknown extension: warn, but still rewrite the file unchanged.
            print('unknown format of', path)
        with open(path, 'w') as sink:
            sink.write(text)