def document_collection_recursive(resource, path, root_discovery, discovery):
    """Write the HTML page for ``resource`` and recurse into its sub-collections.

    The page text comes from ``document_collection`` and is written to
    ``FLAGS.dest`` under the file name ``path + "html"`` (recursive calls pass
    a ``path`` that ends in ``"."``).

    Args:
        resource: Object whose public callable attributes are inspected;
            attributes carrying ``__is_resource__`` are called to obtain the
            nested collections.
        path: File-name prefix for the generated page.
        root_discovery: Passed through unchanged to ``document_collection``
            and to every recursive call.
        discovery: Mapping for this collection. When it is not ``{}`` its
            ``"resources"`` entry is consulted for each nested collection.
    """
    html = document_collection(resource, path, root_discovery, discovery)
    dest = os.path.join(FLAGS.dest, path + "html")

    # Use context managers so the handle is released even if write() raises.
    if sys.version_info.major < 3:
        # Python 2's built-in open() has no ``encoding`` parameter, so the
        # text is encoded by hand before writing.
        with open(dest, "w") as f:
            f.write(html.encode("utf-8"))
    else:
        # Pin the encoding; otherwise the locale default is used and
        # non-ASCII documentation can fail to write on some platforms.
        with open(dest, "w", encoding="utf-8") as f:
            f.write(html)

    for name in dir(resource):
        if name.startswith("_"):
            continue
        attr = getattr(resource, name)
        if not (
            callable(attr)
            and hasattr(attr, "__is_resource__")
            and discovery != {}
        ):
            continue
        # The discovery key is the attribute name up to its first underscore.
        dname = name.rsplit("_")[0]
        collection = attr()
        document_collection_recursive(
            collection,
            path + name + ".",
            root_discovery,
            discovery["resources"].get(dname, {}),
        )
def html_result(html: str,
                extraheaders: TYPE_WSGI_RESPONSE_HEADERS = None) -> WSGI_TUPLE_TYPE:
    """
    Build the ``(contenttype, extraheaders, data)`` tuple for an HTML page.

    The content type is fixed to UTF-8 HTML, ``html`` is encoded as UTF-8
    bytes, and a missing or empty ``extraheaders`` becomes a fresh empty list.
    """
    if not extraheaders:
        extraheaders = []
    body = html.encode("utf-8")
    return 'text/html; charset=utf-8', extraheaders, body
def clean_html(html_str):
    """Reduce an HTML document to plain text.

    Newlines, carriage returns and tabs are collapsed to single spaces, then
    comments, ``<script>`` blocks, ``<style>`` blocks and all remaining tags
    are removed (tags are replaced by a space). Character entities are
    unescaped last.

    Unescaping must happen after tag stripping: doing it first turns escaped
    literal text such as ``&lt;b&gt;`` into real markup, which the tag regex
    then deletes along with anything between the brackets.

    NOTE(review): the ``.decode("utf-8")`` call implies ``html_str`` is a
    UTF-8 byte string (Python 2 ``str``) -- confirm against callers. The
    result is UTF-8 encoded again, as before.
    """
    text = re.sub("[\n\r\t]+", " ", html_str)
    text = re.sub("(<!--.*?-->)", "", text)  # remove HTML comments
    text = re.sub('(?i)<script.*?</script>', '', text)  # remove javascript
    text = re.sub('(?i)<style.*?</style>', '', text)  # remove css
    text = re.sub("<.*?>", " ", text)  # remove tags
    # Resolve entities only now, so escaped "<" / ">" survive as text.
    text = htmlp.unescape(text.decode("utf-8"))
    return text.encode("utf-8")
def render_to_pdf(template_src, context_dict=None):
    """Render a template to PDF and wrap the result in an ``HttpResponse``.

    Args:
        template_src: Template name handed to ``get_template``.
        context_dict: Context passed to ``template.render``. Defaults to a
            fresh empty dict on every call (a shared ``{}`` default would
            leak state between calls if anything mutated it).

    Returns:
        An ``HttpResponse`` with content type ``application/pdf`` carrying
        the generated bytes when ``pdf.err`` is falsy, otherwise an
        ``HttpResponse`` with status 400 and the body
        ``"Error Rendering PDF"``.
    """
    if context_dict is None:
        context_dict = {}

    template = get_template(template_src)
    html = template.render(context_dict)

    result = BytesIO()
    # NOTE(review): encoding to windows-1251 raises UnicodeEncodeError for
    # characters outside that code page -- confirm this is intended.
    source = BytesIO(html.encode("windows-1251"))
    pdf = pisa.pisaDocument(source, result, link_callback=link_callback)

    if pdf.err:
        return HttpResponse("Error Rendering PDF", status=400)
    return HttpResponse(result.getvalue(), content_type='application/pdf')
def write_out(filename, html):
    """
    Write a file, including a gzipped version, to the out_dir

    The plain file is written through ``writefile``; the compressed copy goes
    to the same path with ``.gz`` appended.

    Args:
        filename: Name appended directly to ``config.out_dir``.
        html: Page content, either text or already-encoded bytes. Text is
            encoded as UTF-8 for the gzip copy; bytes are written unchanged.
    """
    writefile(config.out_dir+filename, html)

    # Decide by type rather than by catching UnicodeDecodeError: byte strings
    # are stored as they are, text is encoded first.
    if isinstance(html, bytes):
        payload = html
    else:
        payload = html.encode("utf8")

    # The context manager closes the archive even if write() raises.
    with gzip.open(config.out_dir+filename+".gz", 'wb') as f:
        f.write(payload)
def get_thumbnail_url(doc, pagenumber, small):
    """Return a thumbnail URL for one page of a document, or None.

    Args:
        doc: Document record; passed to the DocumentCloud-id and text helpers
            and queried with ``doc.get("format")``.
        pagenumber: Page to produce a thumbnail for.
        small: Boolean selecting the "small" rather than "normal" size of the
            DocumentCloud image.

    Returns:
        A DocumentCloud image URL, a ``data:image/png;base64,...`` URL for a
        rendered Markdown document, or None when no thumbnail is available.
    """
    # DocumentCloud-hosted documents: build the asset URL directly. The
    # DocumentCloud API could supply it, but constructing it here is faster.
    dc_id = get_documentcloud_document_id(doc)
    if dc_id:
        size = "small" if small else "normal"
        return "https://assets.documentcloud.org/documents/%s/pages/%s-p%d-%s.gif" % (
            dc_id[0], dc_id[1], pagenumber, size)

    # Markdown documents can be rendered locally, but only when both external
    # tools are installed.
    can_render = (
        doc.get("format") == "markdown"
        and os.path.exists("/usr/bin/htmldoc")
        and os.path.exists("/usr/bin/pdftoppm")
    )
    if not can_render:
        # No thumbnail image is available for this resource.
        return None

    # Download the Markdown file; give up if nothing came back.
    md = get_document_text(doc, pagenumber)
    if not md:
        return None

    import subprocess
    import base64

    # Markdown -> HTML.
    html = CommonMark.commonmark(md)

    # HTML -> PDF.
    # TODO: Possible security issue if the Markdown source can generate HTML
    # that causes htmldoc to perform network requests or possibly unsafe
    # operations.
    htmldoc_cmd = [
        "/usr/bin/htmldoc", "--quiet", "--continuous",
        "--size", "4.5x5.8in",  # smaller page magnifies the text
        "--top", "0", "--right", "1cm", "--bottom", "1cm", "--left", "1cm",  # margins
        "-t", "pdf14", "-",
    ]
    pdf = subprocess.check_output(htmldoc_cmd, input=html.encode("utf8"))

    # PDF -> PNG.
    pdftoppm_cmd = ["/usr/bin/pdftoppm", "-singlefile", "-r", "60", "-png"]
    png = subprocess.check_output(pdftoppm_cmd, input=pdf)

    # A data: URL means the image never has to be stored or hosted anywhere
    # and can still be displayed directly.
    encoded = base64.b64encode(png).decode("ascii")
    return "data:image/png;base64," + encoded
def html_to_data_uri(html, js_callback=None):
    """Convert an HTML string into a base64 ``data:text/html`` URI.

    There are two calling conventions:

    * From Python (no ``js_callback``): the URI is returned.
    * From Javascript (``js_callback`` given): inter-process messaging is
      asynchronous, so nothing can be returned; the URI is handed to
      ``js_callback.Call`` instead and the function returns None.
    """
    raw = html.encode("utf-8", "replace")
    payload = base64.b64encode(raw).decode("utf-8", "replace")
    uri = "data:text/html;base64," + payload

    if not js_callback:
        return uri

    browser = js_callback.GetFrame().GetBrowser()
    js_print(browser,
             "Python", "html_to_data_uri",
             "Called from Javascript. Will call Javascript callback now.")
    js_callback.Call(uri)
def get_soup(self, url):
    """Return a ``BeautifulSoup`` for ``url``, using an on-disk page cache.

    The cache path is ``self.data_dir`` joined with ``url`` after removing
    ``self.base_url``; a path ending in ``/`` gets ``index.html`` appended.
    A cached page is parsed straight from disk. Otherwise the method sleeps
    for ``self.wait`` seconds, downloads the page with ``requests``, stores
    it UTF-8 encoded, and parses the downloaded text.

    NOTE(review): if the remainder after removing ``base_url`` starts with
    ``/``, ``os.path.join`` discards ``data_dir`` -- confirm ``base_url``
    carries a trailing slash.
    """
    filepath = os.path.join(self.data_dir, url.replace(self.base_url, ""))
    if filepath.endswith("/"):
        filepath += "index.html"

    if os.path.exists(filepath):
        if self.debug:
            print("visited url")
        # Read the bytes that were stored; the context manager closes the
        # handle, which was previously left open.
        with open(filepath, "rb") as f:
            html = f.read()
        return BeautifulSoup(html)

    if self.debug:
        print("new url")
    time.sleep(self.wait)
    html = requests.get(url).text

    cache_dir = os.path.dirname(filepath)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    # The payload is encoded bytes, so the file is opened in binary mode.
    with open(filepath, "wb") as f:
        f.write(html.encode("utf8"))
    return BeautifulSoup(html)
def respond_with_html(request: Request, code: int, html: str) -> None:
    """
    Encode ``html`` from a str to UTF-8 bytes and delegate to
    `respond_with_html_bytes` with the same request and status code.
    """
    html_bytes = html.encode("utf-8")
    respond_with_html_bytes(request, code, html_bytes)
def format_display_name(account, **options):
    """Return the encoded display name for ``account``.

    The display name falls back to ``account.username`` when it is falsy.
    When the ``custom_emojify`` option is truthy, the result is additionally
    passed through ``encode_custom_emojis`` with ``account.emojis``.

    Args:
        account: Object providing ``display_name``, ``username`` and
            ``emojis``.
        **options: Optional flags; only ``custom_emojify`` is read. It may be
            omitted, in which case no emoji processing happens.
    """
    # NOTE(review): `.presence` looks like a leftover from a Rails original;
    # a plain Python str has no such attribute -- confirm what type
    # `display_name` is.
    html = encode(account.display_name.presence or account.username)
    # .get(): the option is optional, so a missing key must not raise.
    if options.get("custom_emojify"):
        html = encode_custom_emojis(html, account.emojis)
    return html
def format_spoiler(status):
    """Return the spoiler text of ``status`` after encoding and emoji handling.

    ``status.spoiler_text`` is passed through ``encode`` and the result
    through ``encode_custom_emojis`` together with ``status.emojis``.
    """
    encoded = encode(status.spoiler_text)
    return encode_custom_emojis(encoded, status.emojis)
# end # result << encode(chars[last_index..-1].join) # result.flatten.join # end def link_to_url(entity, options = {}) url = Addressable::URI.parse(entity[:url]) html_attrs = { target: '_blank', rel: 'nofollow noopener' } html_attrs[:rel] = "me #{html_attrs[:rel]}" if options[:me] Twitter::Autolink.send(:link_to_text, entity, link_html(entity[:url]), url, html_attrs) rescue Addressable::URI::InvalidURIError, IDN::Idna::IdnaError encode(entity[:url]) end def link_to_mention(entity, linkable_accounts) acct = entity[:screen_name] return link_to_account(acct) unless linkable_accounts account = linkable_accounts.find { |item| TagManager.instance.same_acct?(item.acct, acct) } account ? mention_html(account) : "@#{acct}" end def link_to_account(acct) username, domain = acct.split('@') domain = nil if TagManager.instance.local_domain?(domain)