def run(self, cls):
    """Normalise ``cls.content`` to an HTML5 document and flag its lead paragraphs.

    Every existing doctype is replaced by ``<!DOCTYPE html>``, the ``<html>``
    attributes are reset to just ``lang="pt-br"``, a ``<head>`` is inserted
    when there is none, and the elements matched by ``p:nth-of-type(1)`` and
    ``p:nth-of-type(2)`` get the classes ``epigrafe`` and ``ementa``.

    Args:
        cls: object whose ``content`` attribute holds the parsed soup.

    Returns:
        True on success. False when an AttributeError is raised while
        walking the tree (for example a selector matched nothing); in that
        case ``cls.content`` is not reassigned, although the soup object may
        already have been partially modified in place.
    """
    try:
        soup = cls.content

        # to HTML5: iterate over a copy because extract() mutates
        # soup.contents, which would otherwise skip elements.
        for item in list(soup.contents):
            if isinstance(item, Doctype):
                item.extract()
        soup.insert(0, Doctype('html'))

        # HTML attrs: clear everything first, then set the language. The
        # attributes used to be cleared a second time further down, which
        # silently dropped the lang attribute again.
        soup.html.attrs = {}
        soup.html['lang'] = 'pt-br'

        if not soup.head:
            soup.html.insert(0, soup.new_tag('head'))

        # Flagging epigrafe
        soup.select_one('p:nth-of-type(1)').attrs = {'class': 'epigrafe'}

        # Flagging ementa
        soup.select_one('p:nth-of-type(2)').attrs = {'class': 'ementa'}

        cls.content = soup
    except AttributeError:
        return False
    else:
        return True
def strip_html(path, i, label_xid=True):
    """Strip the HTML: get rid of scripts and interactions.

    Args:
        path: file to read.
        i: value shown in the progress message only.
        label_xid: when true, every tag under ``<body>`` gets a sequential
            ``data-xid`` attribute starting at 1.

    Returns:
        The cleaned document as produced by ``soup.prettify()``.
    """
    print('[{}] Reading {} ...'.format(i, path))
    # NOTE(review): the three-argument open() call presumably resolves to
    # codecs.open (mode, then encoding) -- confirm against the file imports.
    with open(path, 'r', 'utf8') as fin:
        # TODO: Handle encodings
        soup = BeautifulSoup(fin.read(), 'html5lib')

    # Add doctype if missing
    if not has_doctype(soup):
        soup.insert(0, Doctype('html'))

    # Remove dangerous tags
    for tag_name in ('script', 'noscript'):
        for node in soup(tag_name):
            node.extract()
    for link in soup('link'):
        if link.get('as') == 'script':
            link.extract()
    for frame in soup('iframe'):
        frame['src'] = ''

    # Fix styles
    for style in soup('style'):
        raw_css = u"".join(unicode(child) for child in style.contents)
        style.string = H.unescape(raw_css)

    # Label all tags
    for xid, node in enumerate(soup.body(True), 1):
        # Copy the attribute names: entries are deleted inside the loop.
        for attr_name in list(node.attrs):
            if attr_name.startswith('on') or attr_name == 'srcset':
                del node[attr_name]
        if label_xid:
            node['data-xid'] = xid

    # Return
    return soup.prettify()
def response(self, flow: http.HTTPFlow):
    """Inject the applicable userscripts into an HTML response.

    Responses without a content-type header, or whose content type contains
    none of ``RELEVANT_CONTENT_TYPES``, are left untouched. Otherwise the
    body is parsed, every script in ``self.userscripts`` that applies to the
    request URL is injected, an optional information comment is added (when
    ``ctx.options.verbose`` is set), a malformed doctype is repaired, and
    the re-serialised document replaces ``flow.response.content``.

    Args:
        flow: the HTTP flow whose response is rewritten in place.
    """
    response = flow.response
    if CONTENT_TYPE not in response.headers:
        return
    isWebPage = any(
        contentType in response.headers[CONTENT_TYPE]
        for contentType in RELEVANT_CONTENT_TYPES
    )
    if not isWebPage:
        return

    # Response is a web page; proceed.
    insertedScripts: List[str] = []
    soup = BeautifulSoup(
        response.content,
        HTML_PARSER,
        from_encoding=inferEncoding(response),
    )
    # should work in transparent mode too, unless the Host header is spoofed
    requestURL = flow.request.pretty_url
    isApplicable: Callable[[Userscript], bool] = userscript.applicableChecker(
        requestURL)

    for script in self.userscripts:
        if not isApplicable(script):
            continue
        useInline = ctx.options.inline or script.downloadURL is None
        if useInline and len(script.unsafeSequences) > 0:
            # The script cannot be inlined safely; skip it.
            logError(unsafeSequencesMessage(script))
            continue
        logInfo(
            f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ..."""
        )
        result = inject(
            script,
            soup,
            Options(
                inline=ctx.options.inline,
                verbose=ctx.options.verbose,
            ))
        if type(result) is not BeautifulSoup:
            # Anything other than a soup is reported as the failure reason.
            logError("Injection failed due to the following error:")
            logError(str(result))
            continue
        soup = result
        insertedScripts.append(script.name + (
            "" if script.version is None
            else " " + stringifyVersion(script.version)))

    index_DTD: Optional[int] = indexOfDTD(soup)

    # Insert information comment (right after the DTD when there is one):
    if ctx.options.verbose:
        summary = ("No matching userscripts for this URL."
                   if insertedScripts == []
                   else "These scripts were inserted:\n"
                   + bulletList(insertedScripts))
        soup.insert(
            0 if index_DTD is None else 1 + index_DTD,
            Comment(INFO_COMMENT_PREFIX + summary + "\n"))

    # Prevent BS/html.parser from emitting `<!DOCTYPE doctype html>` or
    # similar if "DOCTYPE" is not all uppercase in source HTML:
    if index_DTD is not None and REGEX_DOCTYPE.match(
            soup.contents[index_DTD]):
        # There is a DTD and it is invalid, so replace it. replace_with()
        # is used instead of assigning into soup.contents: a direct list
        # assignment leaves the parent/sibling/next_element links stale,
        # and serialisation may follow those links rather than the list.
        invalidDTD = soup.contents[index_DTD]
        invalidDTD.replace_with(
            Doctype(re.sub(REGEX_DOCTYPE, "", invalidDTD)))

    # Serialize and encode:
    response.content = str(soup).encode(
        fromOptional(soup.original_encoding, CHARSET_DEFAULT), "replace")
def clean_text(self):
    """Return the submitted ``text`` as markup that starts with a doctype.

    The value is parsed with ``html.parser``; when its first node is not a
    doctype (or the document is empty) ``<!DOCTYPE html>`` is prepended.

    Returns:
        The serialised document.
    """
    text = self.cleaned_data["text"]
    soup = BeautifulSoup(text, "html.parser")
    # An empty value parses to a soup without children; indexing
    # contents[0] unguarded would raise IndexError.
    if not soup.contents or not isinstance(soup.contents[0], Doctype):
        soup.insert(0, Doctype("html"))
    return str(soup)
def bare_bones(site_name="", **kwargs):
    """Build a minimal HTML document skeleton.

    The document consists of ``<!DOCTYPE html>`` followed by
    ``<html lang="en">`` holding an empty ``<head>`` and
    ``<body id="content">``. ``apply_config`` is then called with the soup
    and ``site_name``. Extra keyword arguments are accepted and ignored.

    Returns:
        The soup object.
    """
    soup = BeautifulSoup("", "html.parser")
    soup.append(Doctype("html"))

    root = soup.new_tag("html", attrs={"lang": "en"})
    soup.append(root)
    root.append(soup.new_tag("head"))
    root.append(soup.new_tag("body", id="content"))

    apply_config(soup, site_name)
    return soup
def get_html_listing_soup(
    in_folder: Union[Path, str],
    page_title: Optional[str] = None,
    out_file: Optional[Union[Path, str]] = None,
) -> BeautifulSoup:
    """Write an index page linking every HTML file below ``in_folder``.

    Args:
        in_folder: folder searched recursively for ``*.html`` files; files
            named ``index.html`` are left out of the listing.
        page_title: page ``<title>``; defaults to the folder's stem.
        out_file: destination file; defaults to ``in_folder/index.html``.

    Returns:
        The soup that was serialised to ``out_file``.
    """
    in_folder = Path(in_folder)
    if page_title is None:
        page_title = in_folder.stem

    soup = BeautifulSoup("", "html5lib")
    cast(Tag, soup.find("html"))["lang"] = "en"
    soup.insert(0, Doctype("html"))

    title_tag = soup.new_tag("title")
    title_tag.string = page_title
    cast(Tag, soup.find("head")).append(title_tag)

    listing: Tag = soup.new_tag("ul")
    cast(Tag, soup.find("body")).append(listing)

    # The same timestamp is added as a query string to every link.
    timestamp = int(time.time())
    inlined_suffix = re.compile(r"_inlined$")

    for html_path in sorted(in_folder.glob("**/*.html")):
        if html_path.is_dir() or html_path.name == "index.html":
            continue
        entry: Tag = soup.new_tag("li")
        listing.append(entry)

        quoted_path = urllib.parse.quote(
            str(html_path.relative_to(in_folder)), safe="/")
        link = soup.new_tag("a", href=f"./{quoted_path}?t={timestamp}")
        # Link text is the file stem without a trailing "_inlined".
        link.string = inlined_suffix.sub("", html_path.stem)
        entry.append(link)

    if out_file is None:
        out_file = in_folder / "index.html"
    _ = Path(out_file).write_text(str(soup))
    return soup
def __init__(self):
    """Create the soup with the ``lxml`` parser and add a page skeleton.

    The skeleton is ``<!DOCTYPE html>`` plus an ``<html>`` element holding a
    ``<head>`` (with the page title) and an empty ``<body>``; the body tag
    is also stored on ``self.body``.
    """
    super(OutputSoup, self).__init__(features='lxml')
    self.append(Doctype('html'))

    root = self.new_tag('html')
    self.append(root)

    header = self.new_tag('head')
    root.append(header)

    page_title = self.new_tag('title')
    page_title.string = 'TravellingKleinanzeigenProblem'
    header.append(page_title)

    self.body = self.new_tag('body')
    root.append(self.body)
def clean_text(self):
    """Return the submitted ``text`` with a doctype and numbered images.

    The value is parsed with ``html.parser``; ``<!DOCTYPE html>`` is
    prepended when the first node is not a doctype (or the document is
    empty), and every ``<img>`` gets ``id="img<N>"`` with N counting from 0
    in document order.

    Returns:
        The serialised document.
    """
    text = self.cleaned_data["text"]
    soup = BeautifulSoup(text, "html.parser")
    # An empty value parses to a soup without children; indexing
    # contents[0] unguarded would raise IndexError.
    if not soup.contents or not isinstance(soup.contents[0], Doctype):
        soup.insert(0, Doctype("html"))
    for imgid, img in enumerate(soup.find_all("img")):
        img["id"] = "img%s" % imgid
    return str(soup)
def generateCData(htmlIn):
    """Return the soup to be reduced to a CData string, and ``noteProps``.

    The file at ``htmlIn`` is parsed as XML. Every ``<img>`` is replaced by
    an ``<en-media>`` tag and described in ``noteProps`` under its hash; the
    doctype is replaced by the ENML one, ``<html>`` is unwrapped, ``<head>``
    is removed and ``<body>`` is renamed to ``<en-note>``.

    Args:
        htmlIn: path of the HTML file; image paths are resolved against its
            directory.

    Returns:
        ``(soup, noteProps)`` where ``noteProps`` maps ``'note-title'`` to
        the document title and each image hash to a dict with the keys
        ``filename``, ``path``, ``type``, ``width``, ``height``, ``data``.

    Meta-ToDo: Maybe this should be a method in a class with methods to
    handle different resource types?
    ToDo: This'll work for img tags but we're going to have to make it more
    general to handle resources other than images.
    ToDo: It'd be good to replace any <a href="evernote:///view..."s with
    URLs that Joplin can make something of.
    ToDo: remember to strip out disallowed tags and/or attributes.
    ToDo: return title as a string instead of an item in noteProps?
    """
    basepath = os.path.split(htmlIn.rstrip(os.path.sep))[0]

    # Close the file as soon as it is parsed (it used to be left open).
    with open(htmlIn, 'r') as html_file:
        soup = BeautifulSoup(html_file, 'xml')

    noteProps = {'note-title': soup.find('title').text}

    for img in soup.find_all("img"):
        # assemble image properties
        img_path = basepath + img['src'][1:]
        # Only the dimensions are needed, so release the image right away.
        with Image.open(img_path) as pic:
            width, height = pic.size
        imghash, base64block = file_to_base64(img_path)
        # NOTE(review): if guess_type is mimetypes.guess_type this is a
        # (type, encoding) tuple rather than a string -- confirm.
        mimetype = guess_type(img_path)

        # make <en-media/> tag and replace img tag with it
        enmedia = Tag(name="en-media", attrs={
            'hash': imghash,
            'type': mimetype
        })
        img.replaceWith(enmedia)

        # generate entry in noteProps
        noteProps[imghash] = {
            'filename': img['src'][2:],
            'path': img_path,
            'type': mimetype,
            'width': str(width),
            'height': str(height),
            'data': base64block
        }

    # ToDo: more soup tinkering to create the CData string
    # Iterate over a copy: replaceWith() edits the child list being walked.
    for t in list(soup.contents):
        if isinstance(t, Doctype):
            t.replaceWith(
                Doctype(
                    'en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"'))

    soup.html.unwrap()
    soup.head.decompose()
    soup.body.name = "en-note"
    return soup, noteProps
def get_new_doc(args):
    """Create an empty HTML document skeleton.

    The document has ``<!DOCTYPE html>``, an ``<html lang="en-US">`` root, a
    ``<head>`` with a UTF-8 charset ``<meta>`` and a ``<title>`` taken from
    ``get_title(args)``, and an empty ``<body>``.

    Returns:
        ``(doc, html, head, body)``: the soup and its three main tags.
    """
    doc = BeautifulSoup()
    doc.append(Doctype('html'))

    root = doc.new_tag('html', lang='en-US')
    doc.append(root)

    head = doc.new_tag('head')
    root.append(head)
    head.append(doc.new_tag('meta', charset='utf-8'))

    title = doc.new_tag('title')
    title.string = get_title(args)
    head.append(title)

    body = doc.new_tag('body')
    root.append(body)

    return doc, root, head, body
def save(self, commit=True):
    """Save the form's object with a doctype-prefixed ``text``.

    The object is obtained from the parent ``save(commit=False)``; its
    ``text`` is re-serialised with ``<!DOCTYPE html>`` prepended when the
    first node is not a doctype (or the text is empty), and its
    ``template_type`` is set to ``"2"``.

    Args:
        commit: when true the object is saved before being returned.

    Returns:
        The (possibly unsaved) object.
    """
    m = super(MailWithAttachmentForm, self).save(commit=False)
    soup = BeautifulSoup(m.text, "html.parser")
    # An empty text parses to a soup without children; indexing
    # contents[0] unguarded would raise IndexError.
    if not soup.contents or not isinstance(soup.contents[0], Doctype):
        soup.insert(0, Doctype("html"))
    m.text = str(soup)
    m.template_type = "2"
    if commit:
        m.save()
    return m
def create_note(note_data, soup):
    """Create an ENEX note element.

    Args:
        note_data: object with ``title``, ``content``, ``created``,
            ``updated`` and ``tags`` attributes.
        soup: soup used as the factory for the new tags.

    Returns:
        A ``<note>`` tag containing, in order, ``<title>``, ``<content>``
        (a CDATA section), ``<created>``, ``<updated>``, one ``<tag>`` per
        non-None entry of ``note_data.tags`` and ``<note-attributes>``.
    """
    note = soup.new_tag('note')

    title = soup.new_tag('title')
    title.string = note_data.title
    note.append(title)

    # The note body is a small XML document of its own, embedded as CDATA.
    content_inside = BeautifulSoup(features="xml")
    content_inside.append(Doctype('en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"'))
    content_inside_note = soup.new_tag('en-note')
    content_inside_note.string = note_data.content
    content_inside.append(content_inside_note)

    # Assigning the markup via .string makes BeautifulSoup escape it on
    # output, so the angle-bracket entities are converted back by hand.
    # The previous replace() calls mapped '<' to '<' and '>' to '>', which
    # did nothing and left the note body entity-escaped.
    content_inside_str = (
        str(content_inside).replace('&lt;', '<').replace('&gt;', '>')
    )

    content = soup.new_tag('content')
    content.string = CData(content_inside_str)
    note.append(content)

    created = soup.new_tag('created')
    created.string = str(note_data.created)
    note.append(created)

    updated = soup.new_tag('updated')
    updated.string = str(note_data.updated)
    note.append(updated)

    for single_tag in note_data.tags:
        if single_tag is not None:
            tag = soup.new_tag('tag')
            tag.string = single_tag
            note.append(tag)

    attributes = soup.new_tag('note-attributes')
    author = soup.new_tag('author')
    author.string = "Andrew Heiss"
    attributes.append(author)
    note.append(attributes)

    return note
def clean_text(self):
    """Validate a form template and return it normalised.

    The value is parsed with ``html.parser`` and ``<!DOCTYPE html>`` is
    prepended when the first node is not a doctype (or the document is
    empty). Every ``<img>`` gets ``id="img<N>"``, N counting from 0.

    Returns:
        The serialised document.

    Raises:
        ValidationError: when the template does not contain exactly one
            ``<form>``, or that form has no ``<input type="submit">``.
    """
    text = self.cleaned_data["text"]
    soup = BeautifulSoup(text, "html.parser")
    # An empty value parses to a soup without children; indexing
    # contents[0] unguarded would raise IndexError.
    if not soup.contents or not isinstance(soup.contents[0], Doctype):
        soup.insert(0, Doctype("html"))

    forms = soup.find_all("form")
    if len(forms) != 1:
        raise ValidationError(_("The template must contain one form"))
    if not forms[0].find_all("input", attrs={"type": "submit"}):
        raise ValidationError(_("The form must have a submit button"))

    for imgid, img in enumerate(soup.find_all("img")):
        img["id"] = "img%s" % imgid
    return str(soup)
def generate_enex():
    """Convert the Markdown notes matched by ``note_files`` to one ENEX file.

    Each file is rendered to HTML with the GitHub-flavoured Markdown
    extension and turned into a ``<note>`` via ``create_note``; all notes
    are wrapped in ``<en-export>`` and the document is written to
    ``out_file``. Both names are read from the enclosing module scope.
    """
    # Note data structure
    Note = namedtuple('Note', ['title', 'content', 'created', 'updated', 'tags'])
    timestamp_format = '%Y%m%dT%H%M%SZ'

    # Generate empty XML document
    soup = BeautifulSoup(features="xml")
    soup.append(Doctype('en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"'))

    # Everything is wrapped in <en-export>
    export_root = soup.new_tag("en-export")

    # Parse each note
    for note_path in glob.glob(note_files):
        # The note title is the file name without its extension.
        note_title = os.path.basename(os.path.splitext(note_path)[0])

        with open(note_path, 'r') as f:
            markdown_source = f.read()
        html_content = markdown.markdown(
            markdown_source,
            extensions=[GithubFlavoredMarkdownExtension()])

        # Timestamps come from the file's birth and modification times.
        file_stat = os.stat(note_path)
        created_at = time.strftime(
            timestamp_format, time.gmtime(file_stat.st_birthtime))
        modified_at = time.strftime(
            timestamp_format, time.gmtime(file_stat.st_mtime))

        note_tags = extract_tags(note_path)

        # Append to <en-export> element
        export_root.append(create_note(
            Note(note_title, html_content, created_at, modified_at, note_tags),
            soup))

    # Append <en-export> to the empty XML document
    soup.append(export_root)

    with open(out_file, 'w') as f:
        f.write(str(soup))
def add_tags(content, has_mathjax):
    """Wrap ``content`` in a full HTML document with styling and MathJax.

    ``str(content)`` is parsed with ``html5lib``, a doctype is prepended, a
    ``<style>`` tag is appended to ``<head>`` and a font rule is appended to
    the first ``<style>`` found in ``<head>``. When ``has_mathjax`` is true
    four ``<script>`` tags (polyfill, local MathJax actions, MathJax
    configuration, MathJax itself) are appended to ``<head>`` in that order.

    Returns:
        ``(html.prettify(), has_mathjax)``.
    """
    # Add html, head tag
    html = bs(str(content), "html5lib")
    html.insert(0, Doctype('html'))
    head = html.head

    # Add style tag
    head.append(html.new_tag('style', type='text/css'))

    # Applying font
    head.style.append(
        '*{font-family: Arial, Helvetica, sans-serif !important;}')

    # Nothing else to add unless maths equations must be rendered.
    if not has_mathjax:
        return html.prettify(), has_mathjax

    # Polyfill
    head.append(html.new_tag(
        'script',
        src="https://polyfill.io/v3/polyfill.min.js?features=es6"))

    # MathJax actions script shipped next to this module
    actions_path = Path(__file__).parent.joinpath("mathjax-actions.js")
    head.append(
        html.new_tag('script', src=f'file:///{actions_path.as_posix()}'))

    # MathJax configuration
    config_tag = html.new_tag('script', type="text/x-mathjax-config")
    config_tag.append("MathJax.Hub.Config({CommonHTML: {scale: 200}});")
    head.append(config_tag)

    # MathJax itself
    mathjax_tag = html.new_tag(
        'script',
        id="MathJax-script",
        attrs={'async': ''},
        src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML")
    head.append(mathjax_tag)

    return html.prettify(), has_mathjax
def to_plist(self):
    """Serialise ``self.phrases`` as a property-list XML string.

    The document is a ``<plist version="1.0">`` holding one ``<array>``;
    every item of ``self.phrases`` becomes a ``<dict>`` with the key/string
    pairs ``phrase`` and ``shortcut`` (values read from the item under the
    same names).

    Returns:
        The document text with line breaks inserted before selected tags.
    """
    # (search text, replacement) pairs, applied in this order.
    line_breaks = (
        ("<array>", "\n <array>"),
        ("<dict>", "\n <dict>"),
        ("<key>", "\n <key>"),
        ("<string>", "\n <string>"),
        ("</dict>", "\n </dict>"),
        ("</array>", "\n </array>"),
        ("</plist>", "\n</plist>"),
    )

    def prettify(soup):
        # Put each structural tag on a line of its own.
        text = str(soup)
        for needle, replacement in line_breaks:
            text = text.replace(needle, replacement)
        return text

    soup = BeautifulSoup('', 'xml')
    soup.append(Doctype('plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"'))
    soup.append(soup.new_tag('plist', version="1.0"))
    soup.plist.append(soup.new_tag('array'))

    for itm in self.phrases:
        entry = soup.new_tag('dict')
        for field in ("phrase", "shortcut"):
            key_tag = soup.new_tag('key')
            key_tag.string = field
            value_tag = soup.new_tag('string')
            value_tag.string = itm[field]
            entry.append(key_tag)
            entry.append(value_tag)
        soup.array.append(entry)

    return prettify(soup)
# soup = BeautifulSoup(markup, "lxml")
# tag = soup.a
# tag.string = "New link text."
# print tag
# # <a href="http://example.com/">New link text.</a>

# # /////////
# # append()
# # /////////
# soup = BeautifulSoup("<a>Foo</a>", "lxml")
# soup.a.append("Bar")
# print soup.prettify()
# # <html><body><a>FooBar</a></body></html>
# print soup.a.contents
# # [u'Foo', u'Bar']

# Parse a small page skeleton and print several views of it.
soup = BeautifulSoup(
    '<!doctype html><html lang="en"><head><meta charset="utf-8" /> <title>Your Book Title</title><link rel="stylesheet" href="style.css" type="text/css" /></head><body></body>',
    'lxml')

# Single-argument print(...) produces the same output as the print statement.
print(soup.prettify())
print(soup.html)
print(soup.body)
# The markup contains no <div>, so this looks up a tag that is not there.
print(soup.div)
print(Doctype("html"))
"https://banter-latte.com/2007/07/17/interviewing-leather-part-four/", "https://banter-latte.com/2007/07/24/interviewing-leather-part-five/", "https://banter-latte.com/2007/07/31/interviewing-leather-part-six/", "https://banter-latte.com/2007/08/07/interviewing-leather-part-seven/", "https://banter-latte.com/2007/08/14/interviewing-leather-part-eight/", "https://banter-latte.com/2007/08/21/interviewing-leather-part-nine/", "https://banter-latte.com/2007/08/28/interviewing-leather-part-ten/", "https://banter-latte.com/2007/09/04/interviewing-leather-part-eleven/", "https://banter-latte.com/2007/09/20/interviewing-leather-part-twelve/", "https://banter-latte.com/2007/09/25/interviewing-leather-part-thirteen/", "https://banter-latte.com/2007/10/02/interviewing-leather-part-fourteen/" ] # Construct HTML skeleton doc = BeautifulSoup() doc.append(Doctype('html')) html = doc.new_tag('html', lang='en-US') doc.append(html) head = doc.new_tag('head') html.append(head) meta = doc.new_tag('meta', charset='utf-8') head.append(meta) title = doc.new_tag('title') title.string = 'Interviewing Leather' head.append(title) body = doc.new_tag('body') html.append(body) # Gather each chapter's content for i, chapter in enumerate(chapters): # Construct h1 for the chapter
def main(input_base_path: str, output_base_path: str) -> None:
    """Clean every ``*.html`` file of one directory into another.

    For each file the soup returned by ``load_soup_file`` is modified:
    charset/viewport/robots ``<meta>`` tags are normalised, the OpenOffice
    generator meta and empty titles are removed, redundant nested
    ``<font>``/``<big>`` wrappers are unwrapped, rootsweb links are made
    local when a file of the same name exists in the input directory, the
    doctype is replaced by ``<!DOCTYPE html>``, and whitespace in headings,
    ``<head>`` and table cells is tidied. The result is written under the
    same file name in the output directory.

    Args:
        input_base_path: directory holding the source ``.html`` files.
        output_base_path: directory receiving the cleaned files; created
            when missing (its parent must exist).
    """
    # Process each .html file
    input_base_path = input_base_path + '/'
    original_files = glob.glob(input_base_path + '*.html')
    output_directory = Path(output_base_path)
    # An existing directory is fine; any other failure is reported here
    # instead of being swallowed and surfacing later on the first write.
    output_directory.mkdir(exist_ok=True)

    for _original_file_as_str in original_files:
        original_file = Path(_original_file_as_str)
        original_full_filename = input_base_path + original_file.name
        soup = load_soup_file(original_full_filename)

        # META charset
        # Delete any that exist. Put in a correct one.
        for _cs in get_charset_metas(soup):
            _cs.decompose()
        new_meta = soup.new_tag('meta')
        new_meta.attrs['http-equiv'] = "content-type"
        new_meta.attrs['content'] = "text/html; charset=UTF-8"
        soup.head.insert(0, new_meta)
        soup.head.insert(0, NavigableString('\n'))

        # META Viewports
        # Leave alone if already one there. Otherwise, put in a correct one.
        if not get_viewport_metas(soup):
            new_meta = soup.new_tag('meta')
            new_meta.attrs['name'] = 'viewport'
            new_meta.attrs['content'] = "width=device-width, initial-scale=1.0"
            soup.head.insert(0, new_meta)
            soup.head.insert(0, NavigableString('\n'))

        # META robots
        # If one exists, leave it alone. Otherwise add one
        if not soup.find_all('meta', {'name': "robots"}):
            new_meta = soup.new_tag('meta')
            new_meta.attrs['name'] = "robots"
            new_meta.attrs['content'] = "index,follow"
            soup.head.insert(0, new_meta)
            soup.head.insert(0, NavigableString('\n'))

        # Delete: <meta content="OpenOffice.org 3.3 (Win32)" name="GENERATOR"/>
        open_office = soup.find_all('meta', {
            'content': "OpenOffice.org 3.3 (Win32)",
            'name': "GENERATOR"
        })
        for tag in open_office:
            tag.decompose()

        # Delete any empty titles
        for tag in soup.find_all('title'):
            if not tag.contents:
                tag.decompose()

        # Remove any <font></font> / <big></big> that is identical to its
        # parent.
        # NOTE(review): "identical" only compares attribute names, not
        # their values -- confirm that is intended.
        for wrapper_name in ('font', 'big'):
            for tag in soup.find_all(wrapper_name):
                if (tag.parent.name == tag.name
                        and tag.parent.attrs.keys() == tag.attrs.keys()):
                    tag.parent.unwrap()

        # Modify freepages links to local if possible.
        # Is it from rootsweb? Does it have the same filename locally?
        for tag in soup.find_all('a'):
            try:
                parsed = urlparse(tag['href'])
            except (KeyError, ValueError):
                # No href attribute, or a value urlparse rejects.
                print(f"Strange tag: {tag}\n")
                continue
            if parsed.netloc != 'freepages.genealogy.rootsweb.ancestry.com':
                continue
            filename = Path(parsed.path).name
            full_filename = Path(input_base_path + filename)
            if full_filename.is_file():
                # Everything is OK to change link.
                tag['href'] = filename

        # Clean spaces inside of <a> </a> strings
        for tag in soup.find_all('a'):
            if len(tag.contents) == 1:
                tag.string = html_text_spaces_clean(tag.string)

        # Add lang="en" to <html>
        for tag in soup.find_all('html'):
            tag['lang'] = 'en'

        # Remove: <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
        # Iterate over a copy: extract() mutates soup.contents.
        for item in list(soup.contents):
            if isinstance(item, Doctype):
                item.extract()

        # Add <!DOCTYPE html> to top of file
        soup.insert(0, Doctype('html'))

        # Remove any added space just below doctype html
        if len(soup.contents) > 1 and soup.contents[1].string == '\n':
            soup.contents[1].extract()

        # remove spaces around text several tags:
        for tag in soup.find_all(['title', 'h1', 'h2', 'h3', 'h4']):
            if len(tag.contents) == 1:
                tag.string = html_text_spaces_clean(tag.string)

        # <head> reduce blank lines (the slice is a copy, so extracting
        # while looping is safe)
        for tag in soup.head.contents[1:]:
            if tag.string == '\n' and tag.previous_sibling.string == '\n':
                tag.extract()

        # <head> indent each tag inside 4 spaces
        # Note the two passes. Its required because inserting in .contents
        # while looping will cause an infinite loop.
        head_tags = [tag for tag in soup.head.contents if tag.string != '\n']
        for tag in head_tags:
            # Four spaces, as stated above; the literal used to hold one.
            tag.insert_before(NavigableString(' ' * 4))

        # Add a newline after </head> for .html readability
        soup.head.insert_after(NavigableString('\n'))

        # fix html checker:
        # "The type attribute for the style element is not needed and should be omitted."
        for tag in soup.find_all('style'):
            del tag['type']

        ## Table Cleaning
        # <td> </td> try to keep to one line
        for tag in soup.find_all('td'):
            if len(tag.contents) > 1 and tag.contents[-1].string == '\n':
                tag.contents[-1].extract()
            if len(tag.contents) > 1 and tag.contents[0].string == '\n':
                tag.contents[0].extract()

        print("At end")

        # ! Remember to use soup.prettify() ONLY for visuals. Always write as str(soup)
        # ! Otherwise whitespace and other formatting is gone.
        output_file = output_directory / original_file.name
        # The document declares charset=UTF-8 above, so write it as UTF-8
        # rather than in the platform's default encoding.
        with open(output_file, 'w', encoding='utf-8') as fp:
            fp.write(str(soup))
def main(input_base_path:str, output_base_path:str) -> None:
    """Normalise every ``*.html`` file of one directory and run it through tidy.

    For each file the soup returned by ``load_soup_file`` gets a fresh
    charset ``<meta>``, a viewport ``<meta>`` when it has none, no
    OpenOffice generator meta, ``lang="en"`` on ``<html>`` and an HTML5
    doctype. The result is passed to ``tidy_document`` and the tidied
    markup is written under the same file name in the output directory.

    Args:
        input_base_path: directory holding the source ``.html`` files.
        output_base_path: directory receiving the tidied files; created
            when missing (its parent must exist).
    """
    # Process each .html file
    input_base_path = input_base_path + '/'
    original_files = glob.glob(input_base_path + '*.html')
    output_directory = Path(output_base_path)
    # An existing directory is fine; any other failure is reported here
    # instead of being swallowed and surfacing later on the first write.
    output_directory.mkdir(exist_ok=True)

    for _original_file_as_str in original_files:
        original_file = Path(_original_file_as_str)
        original_full_filename = input_base_path + original_file.name
        soup = load_soup_file(original_full_filename)

        # META charset
        # Delete any that exist. Put in a correct one.
        for _cs in get_charset_metas(soup):
            _cs.decompose()
        new_meta = soup.new_tag('meta')
        new_meta.attrs['http-equiv'] = "content-type"
        new_meta.attrs['content'] = "text/html; charset=UTF-8"
        soup.head.insert(0, new_meta)
        soup.head.insert(0, NavigableString('\n'))

        # META Viewports
        # Leave alone if already one there. Otherwise, put in a correct one.
        if not get_viewport_metas(soup):
            new_meta = soup.new_tag('meta')
            new_meta.attrs['name'] = 'viewport'
            new_meta.attrs['content'] = "width=device-width, initial-scale=1.0"
            soup.head.insert(0, new_meta)
            soup.head.insert(0, NavigableString('\n'))

        # Delete: <meta content="OpenOffice.org 3.3 (Win32)" name="GENERATOR"/>
        open_office = soup.find_all('meta', {
            'content':"OpenOffice.org 3.3 (Win32)",
            'name': "GENERATOR"
        })
        for tag in open_office:
            tag.decompose()

        # Add lang="en" to <html>
        for tag in soup.find_all('html'):
            tag['lang'] = 'en'

        # Remove: <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
        # Iterate over a copy: extract() mutates soup.contents.
        for item in list(soup.contents):
            if isinstance(item, Doctype):
                item.extract()

        # Add <!DOCTYPE html> to top of file
        soup.insert(0, Doctype('html'))

        # Remove any added space just below doctype html
        if len(soup.contents) > 1 and soup.contents[1].string == '\n':
            soup.contents[1].extract()

        # Run through tidy; the error report is not used.
        html, _errors = tidy_document(
            str(soup),
            options={
                "indent": 1,  # Pretty; not too much of a performance hit
                "tidy-mark": 0,  # No tidy meta tag in output
                "doctype": 'html5',
                "drop-empty-elements": 0,
                "drop-empty-paras": 0,
                "add-meta-charset": 1,
                "logical-emphasis": 1,
                "preserve-entities": 1,
                "literal-attributes": 1,
                "priority-attributes": "name,content,rel,href",
                "wrap": 80
            })

        output_file = output_directory / original_file.name
        # The document declares charset=UTF-8 above, so write it as UTF-8
        # rather than in the platform's default encoding.
        with open(output_file, 'w', encoding='utf-8') as fp:
            fp.write(html)
from sys import argv as args, exit

from bs4 import BeautifulSoup, Doctype

# Usage: <script> INPUT OUTPUT
# Wraps the first <svg> element of INPUT in a minimal HTML5 page, gives every
# <g class="node"> inside it an onclick handler, and writes the page to OUTPUT.
if __name__ == "__main__":
    if len(args) != 3:
        exit(1)

    with open(args[1], 'r') as inputFile:
        soup = BeautifulSoup(inputFile, "lxml")

    # Parsing an empty string with html5lib yields an html/head/body skeleton.
    html = BeautifulSoup("", "html5lib")

    # Build the page with insert()/append() rather than by editing the
    # .contents lists directly: list edits leave the parent/sibling/
    # next_element links stale, and serialisation may follow those links
    # rather than the lists, which would drop the added nodes.
    html.insert(0, Doctype("html"))
    root = html.html
    root["lang"] = "ja"

    meta = html.new_tag("meta")
    meta["charset"] = "UTF-8"
    root.head.append(meta)

    script = html.new_tag("script")
    script["type"] = "text/javascript"
    script["src"] = "js/functions.js"
    root.head.append(script)

    svg = soup.find("svg")
    if svg is None:
        # Previously this crashed with AttributeError further down.
        exit("no <svg> element found in " + args[1])
    root.body.append(svg)

    for node in svg.find_all("g", class_="node"):
        node["onclick"] = "onClick(this);"

    # The page declares charset UTF-8, so write it as UTF-8 regardless of
    # the platform's default encoding.
    with open(args[2], 'w', encoding="utf-8") as outputFile:
        outputFile.write(str(html) + "\n")
def process_chapter(chapter: str) -> BeautifulSoup:
    """Clean one chapter's HTML and return it as a soup.

    Navigation panels (and, except on the page containing
    ``IRBOOK_MARKER``, child-link sections) are cut out of the raw text
    before parsing. Then ``<meta>``, ``<address>``, the original stylesheet
    link and a ``<br>`` inside ``<h1>`` are removed, ASCII quotes are
    replaced by typographic ones, images whose ``alt`` text holds TeX are
    replaced by MathML, trailing empty elements are dropped, presentational
    attributes are turned into CSS classes, ``<tt>`` becomes ``<code>``, a
    charset ``<meta>`` and the book stylesheet are added, and the doctype is
    replaced by ``<!DOCTYPE html>``.

    Args:
        chapter: raw HTML text of the chapter.

    Returns:
        The processed soup.

    Raises:
        TeXRenderError: re-raised from ``tex_to_mathml`` with ``context``
            set to the parent of the offending image.
    """
    # Trim navpanels
    while NAVPANEL_START in chapter:
        chapter = delete_delimited_chunk(chapter, NAVPANEL_START,
                                         NAVPANEL_END)

    # We want to remove the sub-chapter links unless it's the intro page
    if IRBOOK_MARKER not in chapter:
        while CHILD_LINKS_START in chapter:
            chapter = delete_delimited_chunk(chapter, CHILD_LINKS_START,
                                             CHILD_LINKS_END)

    chapter_soup = soups(chapter)

    # Delete elements that epub doesn't like
    for el in chapter_soup.find_all('meta'):
        el.decompose()

    # Delete the "autogenerated page" footer
    address = chapter_soup.find('address')
    if address:
        address.decompose()

    # The link 404's anyways
    css_link = chapter_soup.find(
        'link',
        href='https://nlp.stanford.edu/IR-book/html/htmledition/irbook.css')
    if css_link:
        css_link.decompose()

    # A <br> inside the header is not wanted
    h1 = chapter_soup.find('h1')
    if h1:
        br = h1.find('br')
        if br:
            br.decompose()

    # Typographic quotes; comments, doctypes, processing instructions and
    # declarations are left alone.
    non_text_types = (Comment, bs4.Doctype, bs4.ProcessingInstruction,
                      bs4.Declaration)
    for s in chapter_soup.find_all(text=re.compile("[`']")):
        if isinstance(s, non_text_types):
            continue
        old_s = str(s)
        new_s = (old_s.replace('``', '“').replace("''", '”').replace(
            "'", '’').replace("`", '‘'))
        s.replace_with(new_s)

    for img in chapter_soup.find_all('img'):
        alt = img['alt']
        if '...' in alt:
            # The alt text was truncated in the source; try to recover the
            # full text, and skip the image when that is not possible.
            alt = expand_ellipsized(img)
            if alt is None:
                continue
        if alt.endswith('.html'):
            # alt text that is just a file name cannot be rendered
            continue
        if r'\includegraphics' in alt:
            continue
        # cross-reference symbol
        if alt == '[*]':
            img.parent.string = '‡'
            continue
        mathml = trivial_tex_to_mathml(alt)
        if mathml is None:
            # Otherwise, give SnuggleTeX a try:
            try:
                mathml = tex_to_mathml(alt)
            except TeXRenderError as e:
                e.context = img.parent
                raise
        img.replace_with(soups(mathml, 'html.parser'))

    # delete empty children at the end of the document
    annoying_tag_names = ['hr', 'p', 'br']
    empty_tags = ['hr', 'br']
    children = list(chapter_soup.body.find_all(True))
    for last_child in reversed(children):
        if last_child.name not in annoying_tag_names:
            break
        if last_child.name in empty_tags:
            last_child.decompose()
            continue
        if last_child.string and last_child.string.strip():
            break
        last_child.decompose()

    # this attribute prevents a file from being valid xhtml; only the first
    # occurrence is removed
    for bad_a in chapter_soup.find_all('a'):
        if bad_a.has_attr('wikipedia:general'):
            del bad_a['wikipedia:general']
            break

    # Turn presentational attributes into CSS classes. Any existing class
    # value is discarded, as before. The tokens are collected in a list and
    # joined with spaces: plain string concatenation used to fuse them
    # (e.g. "align-centervalign-top").
    for el in chapter_soup.find_all(True):
        class_tokens = []
        for attr in [
                'align', 'valign', 'cellpadding', 'border', 'nowrap',
                'compact'
        ]:
            if el.has_attr(attr):
                class_tokens.append(attr + '-' + el[attr].lower())
                del el[attr]
        if el.has_attr('width'):
            if el['width'] == '100%':
                class_tokens.append('full-width')
            else:
                class_tokens.append('width-' + el['width'])
            del el['width']
        if class_tokens:
            el['class'] = ' '.join(class_tokens)
        else:
            el.attrs.pop('class', None)

    for el in chapter_soup.find_all('br'):
        if el.has_attr('clear'):
            del el['clear']

    for el in chapter_soup.find_all('tt'):
        el.name = 'code'

    chapter_soup.html.head.append(
        chapter_soup.new_tag('meta', charset='utf-8'))
    chapter_soup.html.head.append(
        chapter_soup.new_tag(
            'link',
            rel='stylesheet',
            type='text/css',
            href='Styles/book.css',
        ))

    first_child = next(chapter_soup.children)
    if isinstance(first_child, Doctype):
        # Swap whatever doctype the source had for the HTML5 one.
        first_child.replace_with(Doctype('html'))

    # Drop the first top-level conversion comment, if any.
    for el in chapter_soup.children:
        if isinstance(el,
                      Comment) and el.startswith(CONVERSION_COMMENT_MARKER):
            el.replace_with('')
            break

    return chapter_soup
def clean_text(self):
    """Validate a fullscreen template and return it with scripting added.

    The value is parsed with ``html.parser`` and given a doctype when it
    has none. A body ``style`` attribute is copied to ``<html>``, a jQuery
    ``<script>`` is appended to ``<head>`` unless one with the same ``src``
    is present, every ``<div>`` in the body is hidden, the fullscreen
    script (with the timeout from ``self.data["timeout"]``, in seconds,
    converted to milliseconds) is appended to ``<head>``, and every
    ``<img>`` gets an ``onclick`` handler and ``id="img<N>"``.

    Returns:
        The serialised document.

    Raises:
        ValidationError: when the template has no ``<html>``, ``<head>`` or
            ``<body>`` element, or its body contains no image.
    """
    text = self.cleaned_data["text"]
    timeout = self.data["timeout"]
    javascript = (
        ' function fullscreen_display() { '
        "var images = document.getElementsByTagName('img'); "
        'for (i = 0; i < images.length;i++ ) { '
        'images[i].style.display = "none"; } '
        "var divs = document.getElementsByTagName('div'); "
        'for (i = 0; i < divs.length;i++ ) { '
        "var images = divs[i].getElementsByTagName('img'); "
        'for (j = 0; j < images.length;j++ ) { '
        'images[j].style.display = "block"; '
        'images[j].style.marginLeft = "auto"; '
        'images[j].style.marginRight = "auto"; } '
        'divs[i].style.display = "block"; } } '
        'function launchIntoFullscreen(element) { '
        'if(element.requestFullscreen) { element.requestFullscreen(); } '
        'else if(element.mozRequestFullScreen) { element.mozRequestFullScreen(); } '
        'else if(element.webkitRequestFullscreen) { element.webkitRequestFullscreen(); } '
        'else if(element.msRequestFullscreen) { element.msRequestFullscreen(); } '
        'fullscreen_display(); '
        'window.setTimeout(function() { location.href = "FIXMEURL"; }, %s); } '
    ) % (int(timeout) * 1000)

    soup = BeautifulSoup(text, "html.parser")
    # An empty value parses to a soup without children; indexing
    # contents[0] unguarded would raise IndexError.
    if not soup.contents or not isinstance(soup.contents[0], Doctype):
        soup.insert(0, Doctype("html"))

    # html.parser does not add missing structure, and everything below
    # needs these three elements; report a form error instead of crashing
    # with AttributeError.
    if soup.html is None or soup.head is None or soup.body is None:
        raise ValidationError(
            _("The template must contain html, head and body elements"))

    if "style" in soup.body.attrs:
        soup.html.attrs["style"] = soup.body.attrs["style"]

    script = soup.new_tag("script")
    script.attrs["type"] = "text/javascript"
    script.append(javascript)

    jquery = soup.new_tag("script")
    jquery.attrs["type"] = "text/javascript"
    jquery.attrs["src"] = "https://code.jquery.com/jquery.min.js"
    jqlist = soup.head.find_all(
        "script", {"src": "https://code.jquery.com/jquery.min.js"})
    if not jqlist:
        soup.head.append(jquery)

    # Hide every div; the injected script shows them again in fullscreen.
    for div in soup.body.find_all("div"):
        if "style" in div.attrs:
            div.attrs["style"] = "%s; %s;" % (div.attrs["style"],
                                              "display:none")
        else:
            div.attrs["style"] = "display:none"

    # NOTE(review): string= compares the whole script text with
    # "launchIntoFullscreen", so an already-present copy of the script is
    # presumably never detected and a new copy is appended each time.
    # Left as is because the newest copy carries the current timeout.
    scriptlist = soup.head.find_all("script", string="launchIntoFullscreen")
    if not scriptlist:
        soup.head.append(script)

    if not soup.body.find_all("img"):
        raise ValidationError(_("The form must have at least one image"))

    for imgid, img in enumerate(soup.find_all("img")):
        img["onclick"] = "javascript:launchIntoFullscreen(document.documentElement);"
        img["id"] = "img%s" % imgid
    return str(soup)
def html_makepage(self, plot_title=None, plot_notes=None):
    """
    Generate HTML document from scratch with plot image and store it in self.html_page
    - plot_title: (string) alternative title for the plot
    - plot_notes: (list of string) optional text to add below plot_title
    """
    # Path to image file of the plot
    # Use SVG file for better scaling quality
    try:
        img_source_path = self.output_path['svg']
    except AttributeError:
        errmsg = "Path to plot render for HTML page not found. Method self.set_output_paths() not called yet."
        error_exit(self.log, errmsg)

    # Main titles; the plot title defaults to the date range when known
    page_title = self.title
    if plot_title is None and self.datelim:
        plot_title = "Graph from {} to {}".format(*self.datelim)

    # Head and title of the HTML page
    head_markup = "<head><meta /><title>{}</title></head>".format(page_title)
    page = BeautifulSoup(head_markup, 'lxml')
    page.insert(0, Doctype('html'))
    page.html['lang'] = 'en'
    page.head.meta['charset'] = 'utf-8'

    # CSS style: take from file defined in configuration
    html_css_file = MainConf.get('reports',
                                 'html_main_cssfile',
                                 fallback='html_main_style.html',
                                 mandatory=False)
    css_style = DataFile(html_css_file, mandatory=True).contents
    page.head.append(css_style.find('style'))
    self.log.debug(f"HTML page: added CSS style from file: {html_css_file}")

    # Body and main title: <body> goes under <html>, <h1> under <body>
    body_tag = page.new_tag('body')
    page.html.append(body_tag)
    body_tag.append(page.new_tag('h1'))
    page.h1.string = page_title

    # Render plot in SVG format
    img_block = page.new_tag('div')
    img_block['class'] = 'blockcard'

    if plot_title is not None:
        subtitle = page.new_tag('h2')
        img_block.append(subtitle)
        subtitle.string = plot_title
        self.log.debug("HTML page: plot sub-title added")

    if plot_notes is not None:
        # A single note may be passed without wrapping it in a list.
        if not isinstance(plot_notes, list):
            plot_notes = [plot_notes]
        for note in plot_notes:
            paragraph = page.new_tag('p')
            img_block.append(paragraph)
            paragraph.string = note
        self.log.debug("HTML page: %s notes added", len(plot_notes))

    plot_img = page.new_tag('img')
    img_block.append(plot_img)
    plot_img['class'] = 'plotrender'
    plot_img['src'] = img_source_path
    plot_img['alt'] = self.title
    page.body.append(img_block)
    self.log.info("HTML page: plot render '%s' added to report page",
                  plot_img['src'])

    # Render container for tables
    tables_block = page.new_tag('div')
    tables_block['id'] = 'tablescontainer'
    tables_block['class'] = 'blockcard'
    page.body.append(tables_block)

    self.html_page = page