def sanitize_html(value):
    from BeautifulSoup import BeautifulSoup, Comment, Tag
    from tidylib import tidy_fragment  # assumed source of tidy_fragment (pytidylib)

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor produces newer
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),      # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),       # when loading them as I's the editor leaves them
        ("b", "b"),       # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]

    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vodafone Live equivalents
    for element, replacement_element in tags:
        if element != replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    # collapse the entities tidy leaves behind into plain ASCII
    return (
        fragment.replace("&nbsp;", " ")
        .replace("&rsquo;", "'")
        .replace("&lsquo;", "'")
        .replace("&quot;", '"')
        .replace("&ldquo;", '"')
        .replace("&rdquo;", '"')
        .replace("&bull;", "- ")
        .replace("&eacute;", "e")
        .replace("&Eacute;", "E")
        .replace("&ndash;", "-")
    )
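# Usage sketch (not from the original source): the markup below is invented
# for illustration, and BeautifulSoup 3.x plus pytidylib are assumed to be
# installed.
dirty = '<p class="x"><strong>Hello</strong> <em>world</em><!-- note --></p>'
print sanitize_html(dirty)
# expected, roughly: <p><b>Hello</b> <i>world</i></p>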
def do_iperimage(value):
    """detects iPernity static urls and creates clickable thumbnail for it"""
    soup = BeautifulSoup(value)
    iprl = re.compile(
        r"^(http://\w+\.ipernity\.com/\d+/\d+/\d+/\d+\.\w+\.)(75|100|240|500|560)(\.jpg)$"
    )
    iprl_thumb = "500"
    iprl_zoom = "560"

    for img in soup.findAll("img", src=iprl):
        match = iprl.match(img["src"])
        try:
            thumb = Tag(soup, "img")
            thumb["alt"] = img["title"]
            thumb["src"] = match.group(1) + iprl_thumb + match.group(3)
            link = Tag(soup, "a")
            link["href"] = match.group(1) + iprl_zoom + match.group(3)
            link["rel"] = "lightbox"
            link["title"] = img["title"]
            link.insert(0, thumb)
            img.replaceWith(link)
        except:
            pass
    return unicode(soup)
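# Illustrative call (not part of the original module); the URL is a made-up
# example that matches the iprl pattern above.
example = '<img src="http://u1.ipernity.com/1/23/45/6789.abc123.240.jpg" title="demo"/>'
print do_iperimage(example)
# -> the <img> is wrapped in an <a rel="lightbox"> linking the 560px zoom,
#    with the 500px thumbnail inside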
def makeHTMLQuestion(fn, htmldata):
    soup = BeautifulSoup(htmldata)
    # add JS
    soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
    scripttag = Tag(soup, "script")
    scripttag['type'] = "text/javascript"
    scripttag.string = "__SUBMIT_JS__"
    soup.find('head').insert(0, scripttag)
    # replace forms
    forms = soup.findAll('form')
    if forms:
        for form in forms:
            if not form.has_key('method'):
                form['method'] = 'POST'
            if not form.has_key('action'):
                if testmode:
                    form['action'] = 'https://workersandbox.mturk.com/mturk/externalSubmit'
                else:
                    form['action'] = 'https://www.mturk.com/mturk/externalSubmit'
            if not form.has_key('onSubmit'):
                form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
            inputtag = Tag(soup, 'input')
            inputtag['type'] = 'hidden'
            inputtag['name'] = 'assignmentId'
            inputtag['id'] = 'myAssignmentId'
            inputtag['value'] = ''
            form.insert(0, inputtag)
    html = str(soup).replace("__SUBMIT_JS__", SUBMIT_JS)
    mainurl = uploadfile(fn, html)
    for sub in soup.findAll('img'):
        # TODO
        fn = dirname(fn) + '/' + sub['src']
        uploadfile(fn)
    return ExternalQuestion(escape(mainurl), frame_height)
def convertStoreFormat(self):
    """
    convert legacy to canonical store format

    N.B.: While the new canonical store format was introduced in TiddlyWiki
    v2.2 final, various v2.2 beta releases are still using the legacy store
    format.

    @return: None
    """
    try:
        version = self.getVersion()
    except (ValueError, AttributeError):
        version = (0, 0, 0)  # assume pre-v2.2 format
    # N.B.: addition works because all pre-v2.3 releases are known
    # XXX: actual threshold is v2.2 final
    if version and (version[0] + (version[1] / 10.0) < 2.3):
        for tiddler in self.store.findChildren("div", tiddler=True):
            # convert tiddler attribute to title attribute
            tiddler["title"] = tiddler["tiddler"]
            del(tiddler["tiddler"])
            # decode tiddler contents
            tiddler.contents[0].replaceWith(decodeTiddlerText(tiddler.contents[0]))  # XXX: use of contents[0] hacky?
            # add PRE wrapper
            pre = Tag(self.dom, "pre")
            pre.contents = tiddler.contents
            tiddler.contents = [pre]
def __init__(self):
    self.tagStack = []
    self.currentData = ''
    self.currentTag = None
    self.pushTag(self)
    ContentHandler.__init__(self)
    Tag.__init__(self, '[document]')
def savePDF(self, pdf_filename, parent_soup, target_node, yes_phrase, url, key, school_name):
    if target_node:
        grandparent_node = target_node.parent.parent
        tag = self.highlightedNode(target_node, yes_phrase, parent_soup)
        self.replaceNode(target_node, tag)
        body = Tag(parent_soup, "body")
        body.append(grandparent_node)
    else:
        body = parent_soup
    try:
        weasyprint = HTML(string=body.prettify())
        tmp_filename = 'pdfs/tmp.pdf'
        weasyprint.write_pdf(tmp_filename,
                             stylesheets=[CSS(string='body { font-size: 10px; font-family: serif !important }')])
    except:
        print "weasyprint failed on url: " + url
        if target_node:
            self.replaceNode(tag, target_node)  # return to old state
        return
    if target_node:
        self.replaceNode(tag, target_node)  # return to old state
    sep_filename = "pdfs/sep.pdf"
    self.makeSepPage(sep_filename, url, key, school_name)
    merger = PdfFileMerger()
    if os.path.exists(pdf_filename):
        merger.append(PdfFileReader(file(pdf_filename, 'rb')))
    merger.append(PdfFileReader(file(sep_filename, 'rb')))
    merger.append(PdfFileReader(file(tmp_filename, 'rb')))
    merger.write(pdf_filename)
def clean_html_style(data, element, remove_comments=True, remove_empty=True):
    """removes the style information associated with html element

    >>> t = '<!-- /* Style Definitions */ table.MsoNormalTable {mso-style-name:"Table Normal"; mso-tstyle-rowband-size:0; mso-tstyle-colband-size:0; mso-style-noshow:yes; mso-style-priority:99; mso-style-qformat:yes; mso-style-parent:""; mso-padding-alt:0in 5.4pt 0in 5.4pt; mso-para-margin-top:0in; mso-para-margin-right:0in; mso-para-margin-bottom:10.0pt; mso-para-margin-left:0in; line-height:115%; mso-pagination:widow-orphan; font-size:11.0pt; font-family:"Calibri","sans-serif"; mso-ascii-font-family:Calibri; mso-ascii-theme-font:minor-latin; mso-hansi-font-family:Calibri; mso-hansi-theme-font:minor-latin;} --><p> </p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">?</p><p class="MsoNormal" style="margin-bottom: 0.0001pt; line-height: normal;">According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p')
    '<p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_empty=False)
    '<p> </p><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    >>> clean_html_style(t, 'p', remove_comments=False)
    '<!-- /* Style Definitions */ table.MsoNormalTable\t{mso-style-name:"Table Normal";\tmso-tstyle-rowband-size:0;\tmso-tstyle-colband-size:0;\tmso-style-noshow:yes;\tmso-style-priority:99;\tmso-style-qformat:yes;\tmso-style-parent:"";\tmso-padding-alt:0in 5.4pt 0in 5.4pt;\tmso-para-margin-top:0in;\tmso-para-margin-right:0in;\tmso-para-margin-bottom:10.0pt;\tmso-para-margin-left:0in;\tline-height:115%;\tmso-pagination:widow-orphan;\tfont-size:11.0pt;\tfont-family:"Calibri","sans-serif";\tmso-ascii-font-family:Calibri;\tmso-ascii-theme-font:minor-latin;\tmso-hansi-font-family:Calibri;\tmso-hansi-theme-font:minor-latin;} --><p>New Delhi, Aug. 21 -- <strong>Jonathan E. Rathbone, Matthew R., J. Jackson, Thomas C. Stoneberg and ujjaini mitra-shah</strong> of <strong>Wm. Wrigley Jr. Company, </strong>Chicago, U.S.A. have developed a food product container.</p><p>?</p><p>According to the Controller General of Patents, Designs & Trade Marks ?A food product container includes a base and a cover?</p>'
    """
    soup = BeautifulSoup(data)
    # remove all comments in this html block
    if remove_comments:
        comments = soup.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
    # remove all occurrences of tags like sup, script
    [i.extract() for i in soup.findAll(re.compile('sup|script'))]
    # find all occurrences of the "element" tag
    for i in soup.findAll(element):
        text = i.renderContents().strip()
        if text:
            new_tag = Tag(soup, element)
            new_tag.insert(0, text)
            i.replaceWith(new_tag)
        elif remove_empty:
            i.extract()
    return smart_unicode(soup.renderContents())
def _set_element(self, root, tagname, text=None, attr=None):
    """Creates if not available an element at the soup root element

    :return: tag object or None
    :rtype: Tag
    """
    # Add Topic if not available
    if attr is None:
        if root.find(re.compile(tagname + "$", re.I)) is None:
            new_tag = Tag(self._soup, tagname)
            root.insert(0, new_tag)
    else:
        if root.find(re.compile(tagname + "$", re.I), attr) is None:
            new_tag = Tag(self._soup, tagname, attr.items())
            root.insert(0, new_tag)
    settings = self._soup.find(self.root)
    tag = settings.find(re.compile(tagname + "$", re.I))
    # Something to insert
    if tag is not None and text is not None:
        if tag.text.strip() == "":
            tag.insert(0, NavigableString(text))
        else:
            tag.contents[0].replaceWith(text)
    return tag
def unTag(self, tag):
    """
    recursively removes unwanted tags according to defined lists

    @param tag: tag hierarchy to work on
    """
    for child in tag.findChildren(True, recursive=False):
        self.unTag(child)
    if (self.remove_classes_regexp != "") and \
            (tag.has_key("class") and (re.match(self.remove_classes_regexp, tag["class"]) is not None)):
        tag.extract()
    elif tag.name in self.keep_tags:
        new_tag = Tag(self.input, tag.name)
        new_tag.contents = tag.contents
        tag.replaceWith(new_tag)
    elif tag.name in self.remove_tags_keep_content:
        children = tag.findChildren(True, recursive=False)
        if len(children) == 1:
            tag.replaceWith(children[0])
        elif len(children) > 1:
            new_tag = Tag(self.input, "p")
            for child in tag.findChildren(True, recursive=False):
                new_tag.append(child)
            tag.replaceWith(new_tag)
        else:
            tag.replaceWith(tag.renderContents())
    else:
        tag.extract()
def rewriteLinksSection(dom, soup, links_table):
    links = []
    for fnt in links_table.findAll('font', {'size': '2', 'face': 'verdana'}):
        if str(fnt).startswith('<font size="2" face="verdana"><a href="'):
            link = fnt.find('a')
            caption = link.getText('').strip()
            if caption.endswith(' Translation') and OMIT_TRANSLATIONS:
                continue
            links.append((link['href'], caption))
    links_table.decompose()
    if not INCLUDE_LINKS or len(links) == 0:
        return
    b = Tag(soup, 'b')
    b.string = 'Links'
    dom.append(b)
    ul = Tag(soup, 'ul')
    for url, caption in links:
        li = Tag(soup, 'li')
        a = Tag(soup, 'a', {'href': url})
        a.string = caption
        li.append(a)
        ul.append(li)
    dom.append(ul)
def _linkify_headings(self, soup):
    md_el = soup.find("div", "md")
    for heading in md_el.findAll(["h1", "h2", "h3"], recursive=False):
        heading_a = Tag(soup, "a", [("href", "#%s" % heading["id"])])
        heading_a.contents = heading.contents
        heading.contents = []
        heading.append(heading_a)
def get_last_3(soup, table):
    loop = 0
    first = 1
    enclose = Tag(soup, "div")
    ul = Tag(soup, "ul")
    for tr in table.findAll("tr"):
        td = tr.findAll("td")
        li = Tag(soup, "li")
        for el in td[3:]:
            if loop != 3:
                try:
                    text = ''.join(el.findAll(text=True))
                    text = text.strip()
                    if text != '' and text != '&nbsp;':
                        el.name = "span"
                        if loop != 2:
                            el.append(' - ')
                        li.append(el)
                except:
                    pass
            else:
                break
            loop += 1
        loop = 0
        if ''.join(li.findAll(text=True)) != '':
            enclose.append(li)
    return enclose
def outputData(self, outfile):
    outSoup = BeautifulStoneSoup("", selfClosingTags=["path"])
    outRoot = Tag(outSoup, "svg")
    outRoot["xmlns"] = "http://www.w3.org/2000/svg"
    outRoot["width"] = self.width
    outRoot["height"] = self.height
    outRoot["version"] = 1.1
    outSoup.insert(0, outRoot)
    for char in reversed(self._soup.findAll("char")):
        path = Tag(outSoup, "path")
        path["d"] = char["d"]
        path["style"] = self.style
        outRoot.insert(0, path)
    svg_header = "<?xml version=\"1.0\" standalone=\"no\"?>\n"
    svg_header += "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\""
    svg_header += " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n"
    self.scaleAndGridAlphabet(outSoup)
    outfile = open(outfile, "w")
    outfile.write(svg_header + outSoup.prettify())
    outfile.close()
def make_links_readable(html):
    """
    Goes through links making them readable
    If they are too long, they are turned into goo.gl links

    timing stats:
    before multiprocess = 0m18.063s
    """
    soup = BeautifulSoup(html)
    for link in soup.findAll('a'):
        oldlink = link
        if link and len(link.get('href', '')) > 90 and options.use_short_links:
            # make into goo.gl link
            short_link = shorten_link(soup, link)
            if short_link is not None:
                link = short_link
        if validate_link(link) and link.get('href', None):
            if not link.text:
                oldlink.replaceWith(link.get('href', "No href link to replace with"))
            else:
                div = Tag(soup, 'div')
                div.setString(link.text)
                br = Tag(soup, 'br')
                new_link = Tag(soup, 'a')
                new_link.setString("(%s)" % link.get('href'))
                div.append(br)
                div.append(new_link)
                oldlink.replaceWith(div)
    return soup
def format_title_link(title, title_link):
    """Format the title header"""
    soup = BeautifulSoup('')
    tag = Tag(soup, 'a')
    tag['href'] = 'http://en.wikipedia.org/wiki/%s' % title_link
    tag.string = title
    return str(tag)
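# Quick demo (title and link values invented for illustration):
print format_title_link('Monty Python', 'Monty_Python')
# -> <a href="http://en.wikipedia.org/wiki/Monty_Python">Monty Python</a>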
def soup_filter_zz_fold_etymology(self, content):
    heads = content.findAll('h2', {'class': 'head'}) + \
            content.findAll('h3', {'class': 'head'}) + \
            content.findAll('h4', {'class': 'head'})
    etymologys = []
    for h in heads:
        # print "Head, ", h
        if h.next and h.next.lower().startswith('etymology'):
            # print "found", h.content[0]
            etymologys.append(h)
            # print 'Etymology found: ', h
    etymology_index = 1
    for e in etymologys:
        div = Tag(content, 'div')
        div['id'] = u'etymology_' + str(etymology_index)
        div['style'] = u'display:none'
        linkSoup = BeautifulSoup(u''' <a href="javascript:f('%s',this)">[show]</a>''' % (div['id']))
        e.append(linkSoup)
        paragraphs = []
        n = e.nextSibling
        first = 1
        while n and (n.__class__.__name__ == 'NavigableString' or
                     (n.__dict__.has_key('name') and n.name == 'p')):
            paragraphs.append(n)
            n = n.nextSibling
        [div.append(p) for p in paragraphs]
        eIndex = e.parent.contents.index(e)
        e.parent.insert(eIndex + 1, div)
        etymology_index = etymology_index + 1
def save():
    json_data = request.json
    status = False
    data = {}
    with open(ret_url(json_data["doc"], "/papers"), "r+") as inf:
        txt = inf.read()
        soup = BeautifulSoup(txt)
        # Only make these changes for a review; a decision is inserted
        # directly into the head
        if json_data["type"] == "review":
            # Check whether the script already exists; if it does, remove it
            for script in soup.findAll("script", {"type": "application/ld+json"}):
                data = json.loads(script.text.strip())
                if data[0]["@type"] == "review":
                    if data[0]["article"]["eval"]["author"] == "mailto:" + json_data["author"]:
                        script.extract()
                        break
            # Remove the body content and rewrite it
            for section in soup.findAll("section"):
                section.extract()
            for section in json_data["sections"]:
                beauty = BeautifulSoup(section)
                soup.body.insert(len(soup.body.contents), beauty)
        # Create the script element and insert it
        new = Tag(soup, "script")
        new.attrs.append(("type", "application/ld+json"))
        new.string = json.dumps(json_data["script"])
        soup.head.insert(len(soup.head.contents), new)
        # Save the file
        html = soup.prettify("utf_8")
        inf.seek(0)
        inf.write(html)
        inf.truncate()
        status = True
    return jsonify({"result": status})
def fix_heading(heading, tags):
    '''
    Remove paragraphs with no strings.
    Remove non-special headings that don't start with a paragraph.
    Remove lists from non-special headings.
    '''
    SPECIAL = ['Books', 'Works', 'Bibliography', 'External links', 'Further reading']
    tags = [tag for tag in tags
            if tag is not None and tag.name != 'p' or tag.renderContents(None).strip()]
    special = False
    heading_text = tagtext(heading)
    for word in SPECIAL:
        if word.lower() in heading_text.lower():
            special = True
    if heading_text == 'External links and references':
        set_heading_text(heading, 'External links')

    # Shorten lists (even special ones).
    # The motivation is that some pages like to list reams of crap,
    # usually in bibliographies, but in other things too.
    found_lis = 0
    MAX_ITEMS = 10  # per headed section
    for tag in list(tags):
        if tag.name in ('ul', 'ol'):
            for li in tag.findAll('li', recursive=False):
                found_lis += 1
                if found_lis > MAX_ITEMS:
                    li.extract()

    # Remove any now-empty uls and ols.
    # Harder than it sounds, due to nested lists.
    temp = Tag(soup, 'p')
    for tag in tags:
        temp.append(tag)
    for tag in temp.findAll(('ul', 'ol')):
        if not tag.findAll(('ul', 'ol', 'li')):
            tag.extract()
    tags = temp.contents

    if found_lis > MAX_ITEMS:
        # Add " (some omitted)" to heading
        if heading_text:
            heading_text = heading_text.replace(' (incomplete)', '')
            if context['srcurl'].startswith('http:'):
                heading_text += ' (some <a href="%s">omitted</a>)' % context['srcurl']
            else:
                heading_text += ' (some omitted)'  # no "relative" links
            set_heading_text(heading, heading_text)

    if not special:
        if heading is not None:
            # Remove non-special headings which don't start with a paragraph.
            if not tags or tags[0].name != 'p':
                return drop_heading(heading)
            # Remove non-special headings containing lists.
            for tag in tags:
                if tag.name in ('ul', 'ol'):
                    return drop_heading(heading)
        else:
            # Remove lists from None (before first heading, if any).
            tags = [tag for tag in tags if tag.name not in ('ul', 'ol')]
    return (heading, tags)
def _number_sections(self, soup):
    count = 1
    for para in soup.find("div", "md").findAll(["p"], recursive=False):
        a = Tag(soup, "a", [("class", "p-anchor"),
                            ("id", "p_%d" % count),
                            ("href", "#p_%d" % count)])
        a.append(str(count))
        para.insert(0, a)
        para.insert(1, " ")
        count += 1
def _tag_generator(soup, name, attrs=[], contents=None):
    if attrs != []:
        new_tag = Tag(soup, name, attrs)
    else:
        new_tag = Tag(soup, name)
    if contents is not None:
        new_tag.insert(0, contents)
    return new_tag
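# Sketch of a call (soup and values are illustrative, BeautifulSoup 3.x assumed):
soup = BeautifulSoup('<html><body></body></html>')
note = _tag_generator(soup, 'div', attrs=[('class', 'note')], contents='hello')
# note renders as <div class="note">hello</div>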
def code_colorizer(entry):
    """
    Uses BeautifulSoup to find and parse the code in the entry that will be
    colorized and changes it according to the syntax specs using pygments.

    The HTML code should include the colorized code wrapped into a div which
    has language (e.g. python) as id and "code" as class attributes.

    Best part of using a filter is that we don't have to change the real post
    containing the code. The worst part is that we have to search for the
    code layer in each post.
    """
    if settings.COLORIZE_CODE:
        try:
            from BeautifulSoup import BeautifulSoup, Tag
            from pygments import highlight
            from pygments.lexers import get_lexer_by_name
            from pygments.formatters import HtmlFormatter
        except ImportError:
            return entry

        try:
            parser = BeautifulSoup(entry, convertEntities=BeautifulSoup.ALL_ENTITIES)
        except HTMLParser.HTMLParseError:
            return entry

        # searching for code blocks in the blog entry
        code_blocks = parser.findAll("div", attrs={"class": "code"})
        if len(code_blocks) > 0:
            for block in code_blocks:
                # if the code block's wrapper div doesn't have an id
                # attribute don't colorize the code
                if "id" in block.attrMap:
                    language = block.attrMap["id"]
                else:
                    continue
                # finding the exact place of the code
                layer = block.div if block.div else block
                # removing any html tags inside the code block
                [tag.extract() for tag in layer.findAll()]
                # getting the original code in the block
                code = "".join(layer.contents)
                # colorizing the code
                lexer = get_lexer_by_name(language)
                formatter = HtmlFormatter(linenos="table", style="tango", cssclass="code")
                colorized_code = Tag(parser, "div") if block.div \
                    else Tag(parser, "div", attrs=(("id", language), ("class", "code")))
                colorized_code.insert(0, highlight(code, lexer, formatter))
                layer.replaceWith(colorized_code)
        return parser.renderContents()
    return entry
def neighborhood_kml(request, neighborhood):
    neighborhood = Neighborhood.objects.get(name=neighborhood)
    soup = BeautifulSoup(neighborhood.geom.kml)
    tag = Tag(soup, "extrude")
    soup.polygon.insert(0, tag)
    text = "1"
    tag.insert(0, text)
    xml = str(soup)
    return render_to_response("restaurants/kml_template.html",
                              {'neighborhood': neighborhood, "xml": xml},
                              context_instance=RequestContext(request))
def linearize_cols_3(soup, table):
    if table.get('id') == "linearize-cols-3":
        div = Tag(soup, "ul")
        div["class"] = "div-container"
        ul_last = get_last_3(soup, table)
        ul_first = get_first_3(soup, table)
        div.append(ul_first)
        div.append(ul_last)
        table.replaceWith(div)
def linearize_cols_2(soup, table):
    if table.get('id') == "linearize-cols-2":
        ul = Tag(soup, "ul")
        ul["class"] = "ul-container"
        ul_last = get_last_two(soup, table)
        ul_first = get_first_two(soup, table)
        ul.append(ul_first)
        ul.append(ul_last)
        table.replaceWith(ul)
def shorten_link(soup, link):
    api = googl.Googl(API_KEY)
    googl_link = api.shorten(link.get('href'))
    new_link = Tag(soup, 'a')
    new_link['href'] = googl_link.get('id', None)
    if new_link.get('href', None):
        new_link.setString(link.text)
        return new_link
    else:
        return None
def CreateSidebar(self, tag):
    '''Create the sidebar'''
    h3 = Tag(self.soup, 'h3')
    h3.string = self.textShortCap
    tag.append(h3)
    if self._Probability(20):
        tag.append(self.CreateParagraph())
    if self._Probability(90):
        tag.append(self.CreateList(0))
    else:
        tag.append(self.CreateSelect())
def replaceJavascript(base_url, soup):
    for js in soup.findAll('script', {'src': re.compile('.+')}):
        try:
            real_js = get_content(resolve_path(base_url, js['src']))
            # escape '</' so the inlined source cannot close the <script> tag
            real_js = real_js.replace('</', '\\u003c/')
            js_tag = Tag(soup, 'script')
            js_tag.insert(0, NavigableString(real_js))
            js.replaceWith(js_tag)
        except Exception, e:
            print 'failed to load javascript from %s' % js['src']
            print e
def createParentUlTag(targetSoup):
    parentUlTag = Tag(targetSoup, 'ul', attrs={'class': 'xbreadcrumbs', 'id': 'breadcrumbs'})
    topListTag = Tag(targetSoup, 'li')
    topAnchorTag = Tag(targetSoup, 'a', attrs={'href': SITE_DOMAIN})
    topAnchorTag.append(NavigableString('TOP'))
    topListTag.append(topAnchorTag)
    parentUlTag.append(topListTag)
    return parentUlTag
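# Illustrative use (not in the original module); SITE_DOMAIN normally comes
# from the surrounding module, so it is stubbed here.
SITE_DOMAIN = 'http://example.com/'
crumbs = createParentUlTag(BeautifulSoup('<html><body></body></html>'))
# -> <ul class="xbreadcrumbs" id="breadcrumbs"><li><a href="http://example.com/">TOP</a></li></ul>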
def FixTableHeadings(self):
    '''Fixes the doxygen table headings.

    This includes:
      - Using bare <h2> title row instead of row embedded in <tr><td> in table
      - Putting the "name" attribute into the "id" attribute of the <tr> tag.
      - Splitting up tables into multiple separate tables if a table heading
        appears in the middle of a table.

    For example, this html:
      <table>
        <tr><td colspan="2"><h2><a name="pub-attribs"></a>
          Data Fields List</h2></td></tr>
        ...
      </table>

    would be converted to this:
      <h2>Data Fields List</h2>
      <table>
        ...
      </table>
    '''
    table_headers = []
    for tag in self.soup.findAll('tr'):
        if tag.td and tag.td.h2 and tag.td.h2.a and tag.td.h2.a['name']:
            # tag['id'] = tag.td.h2.a['name']
            tag.string = tag.td.h2.a.next
            tag.name = 'h2'
            table_headers.append(tag)

    # reverse the list so that earlier tags don't delete later tags
    table_headers.reverse()

    # Split up tables that have multiple table header (th) rows
    for tag in table_headers:
        print "Header tag: %s is %s" % (tag.name, tag.string.strip())
        # Is this a heading in the middle of a table?
        if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
            print "Splitting Table named %s" % tag.string.strip()
            table = tag.parent
            table_parent = table.parent
            table_index = table_parent.contents.index(table)
            new_table = Tag(self.soup, name='table', attrs=table.attrs)
            table_parent.insert(table_index + 1, new_table)
            tag_index = table.contents.index(tag)
            for index, row in enumerate(table.contents[tag_index:]):
                new_table.insert(index, row)
        # Now move the <h2> tag to be in front of the <table> tag
        assert tag.parent.name == 'table'
        table = tag.parent
        table_parent = table.parent
        table_index = table_parent.contents.index(table)
        table_parent.insert(table_index, tag)
def replace_courier(soup):
    """Lacking a better option, I use courier font to mark <code> within
    tinyMCE. And I want to turn that into real code tags.

    Most users won't be needing this(?), so this code is not called anywhere
    but kept for reference
    """
    for t in soup.findAll(lambda s: s.has_key('style') and 'courier' in s['style']):
        tag = Tag(soup, 'code')
        while t.contents:
            tag.append(t.contents[0])
        t.replaceWith(tag)
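# Minimal check of replace_courier (markup invented for illustration):
soup = BeautifulSoup('<p><span style="font-family: courier new;">x = 1</span></p>')
replace_courier(soup)
print soup
# -> <p><code>x = 1</code></p>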
def parse_content(self, content, attachments, tags):
    soup = BeautifulSoup(content)
    pattern = re.compile(r'<.*?src="\?hash=(\w+?)".*?>')

    # images
    for match in soup.findAll('img'):
        filehashmatch = pattern.search(str(match))
        if filehashmatch:
            filehash = filehashmatch.group(1)
            filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)
            if filename is not None:
                importedname = self.import_file(filename)
                match.replaceWith(Tag(soup, 'img', [('src', importedname)]))

    # pdfs
    for match in soup.findAll('embed', {"type": "evernote/x-pdf"}):
        filehashmatch = pattern.search(str(match))
        if filehashmatch:
            filehash = filehashmatch.group(1)
            filename = next((l['filename'] for l in attachments if l['hash'] == filehash), None)
            if filename is not None:
                # convert pdf -> image
                images = pdf2image(filename)
                # import each jpg
                imageTags = Tag(soup, "span")
                for image in images:
                    importedname = self.import_file(image)
                    # add new image tag
                    imageTags.insert(images.index(image), Tag(soup, 'img', [('src', importedname)]))
                # replace embed with <img src...> for each image
                match.replaceWith(imageTags)

    # TODO: audio
    # TODO: video

    # plugins
    # TODO: qa-format as in Supermemo
    # for match in soup.find(string=re.compile("A:")):
    #     match['class'] = match.get('class', []) + ['Evernote2Anki-Highlight']

    return str(soup).decode('utf-8')
def generate_heatmap(intensities):
    # Load the SVG map
    svg = open('counties.svg', 'r').read()

    # Load into Beautiful Soup
    soup = BeautifulSoup(svg, selfClosingTags=['defs', 'sodipodi:namedview'])

    # Find counties
    paths = soup.findAll('path')

    colors = ["#DEEBF7", "#C6DBEF", "#9ECAE1", "#6BAED6",
              "#4292C6", "#2171B5", "#08519C", "#08306B"]

    min_value = min(intensities.values())
    max_value = max(intensities.values())
    scalefactor = (len(colors) - 1) / (log(max_value + 1) - log(min_value + 1))

    # County style
    path_style = ('font-size:12px;fill-rule:nonzero;stroke:#FFFFFF;stroke-opacity:1;'
                  'stroke-width:0.1;stroke-miterlimit:4;stroke-dasharray:none;'
                  'stroke-linecap:butt;marker-start:none;stroke-linejoin:bevel;fill:')

    # we will append this hover tooltip after each county path
    hover_text = '''<text id="popup-%s" x="%s" y="%s" font-size="10" fill="black" visibility="hidden">%s (%s)<set attributeName="visibility" from="hidden" to="visible" begin="%s.mouseover" end="%s.mouseout"/></text>'''

    for p in paths:
        if p['id'] not in ["State_Lines", "separator"]:
            try:
                count = intensities[p['id']]
            except:
                count = 0
            x, y = (p['d'].split()[1]).split(',')

            # insert a new text tag for the county hover tooltip...
            p.parent.insert(0, Tag(soup, 'text', [("id", 'popup-' + p['id'])]))
            hover = soup.find("text", {"id": 'popup-' + p['id']})
            hover.insert(1, "%s (%s)" % (p['inkscape:label'], str(count)))

            # add attributes to that text tag...
            hover['x'] = 250
            hover['y'] = 20
            hover['font-size'] = "20"
            hover['fill'] = "black"
            hover['visibility'] = "hidden"
            hover.insert(0, Tag(soup, 'set', [("begin", p['id'] + '.mouseover')]))
            set_tag = soup.find("set", {"begin": p['id'] + '.mouseover'})
            set_tag['attributeName'] = "visibility"
            set_tag['from'] = "hidden"
            set_tag['to'] = "visible"
            set_tag['end'] = p['id'] + '.mouseout'

            color_class = min(int(scalefactor * log(count + 1)), len(colors) - 1)
            # color_class = int((float(len(colors)-1) * float(count - min_value)) / float(max_value - min_value))
            # if count > 0:
            #     print color_class
            color = colors[color_class]
            p['style'] = path_style + color

    print soup.prettify()
def linearize_rows_1(soup, table):
    if table.get('id') == "linearize-rows-1":
        div = Tag(soup, "div")
        div["class"] = "center"
        for tr in table.findAll("tr"):
            lista = tr.findAll("td")
            for td in lista:
                for p in td.findAll("p"):
                    p.name = "span"
                td.name = "span"
                if td == lista[-1]:
                    td = BeautifulSoup(td.prettify())
                else:
                    td = BeautifulSoup(td.prettify() + '<span> | </span>')
                div.append(td)
        table.replaceWith(div)
def makeHTMLQuestion(fn, htmldata):
    soup = BeautifulSoup(htmldata)
    # add JS
    soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
    soup.find('head').insert(0, SUBMIT_JS)
    # replace forms
    forms = soup.findAll('form')
    if forms:
        for form in forms:
            if not form.has_key('method'):
                form['method'] = 'POST'
            if not form.has_key('action'):
                if testmode:
                    form['action'] = 'http://workersandbox.mturk.com/mturk/externalSubmit'
                else:
                    form['action'] = 'http://www.mturk.com/mturk/externalSubmit'
            if not form.has_key('onSubmit'):
                form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
            inputtag = Tag(soup, 'input')
            inputtag['type'] = 'hidden'
            inputtag['name'] = 'assignmentId'
            inputtag['id'] = 'myAssignmentId'
            inputtag['value'] = ''
            form.insert(0, inputtag)
    mainurl = uploadfile(fn, str(soup))
    for sub in soup.findAll('img'):
        # TODO
        fn = dirname(fn) + '/' + sub['src']
        uploadfile(fn)
    return ExternalQuestion(escape(mainurl), frame_height)
def content_absolute_links(content, image=None):
    from django.contrib.sites.models import Site
    current_site = Site.objects.get(pk=settings.SITE_ID)

    def abs_url(url):
        parsed = urlparse.urlparse(url)
        if parsed.netloc == parsed.scheme == '':
            url = urlparse.urljoin('http://{0}'.format(current_site.domain), url)
        return url

    soup = BeautifulSoup(content)
    if image:
        img = Tag(soup, 'img', [('src', image)])
        soup.insert(0, img)
    for link in soup.findAll('a'):
        link['href'] = abs_url(link['href'])
    for link in soup.findAll('img'):
        link['src'] = abs_url(link['src'])
    return unicode(soup)
def get_mobile_content(self, obj):
    if obj.mobile_content:
        content = obj.mobile_content
    else:
        content = obj.content
    if not self.host_det:
        # apps only
        content = content.replace("\n<br />\n", "\n")
    elif self.host_det == "android":
        content = content.replace("\n<br />\n", "\n")
    soup = BeautifulSoup(content)
    for p in soup.findAll("iframe"):
        if "youtube" in p['src']:
            newTag = Tag(soup, "a")
            newTag.attrs.append(("src", p.get('src')))
            p.append(newTag)
    content = unicode(soup)
    if obj.source is not None and obj.source != '':
        content = content + "<p>Sources: " + obj.source.replace("<p>", "").replace("</p>", "") + "</p>"
    content = obj.get_modified_content(content, content_type='mobile')
    return content
def linearize_cols_1(soup, table):
    if table.get('id') == "linearize-cols-1":
        ul = Tag(soup, "ul")
        ul["class"] = "linearized"
        for td in table.findAll("td"):
            for p in td.findAll("p"):
                p.name = "span"
            try:
                text = ''.join(td.findAll(text=True))
                text = text.strip()
                if text != '' and text != '&nbsp;':
                    td.name = "li"
                    ul.append(td)
            except:
                pass
        table.replaceWith(ul)
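# Illustrative input (invented); assumes the non-breaking-space guard above.
soup = BeautifulSoup('<table id="linearize-cols-1"><tr><td><p>A</p></td><td>&nbsp;</td></tr></table>')
linearize_cols_1(soup, soup.table)
# soup now renders as <ul class="linearized"><li><span>A</span></li></ul>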
def select_calendar(month=None, year=None):
    now = datetime.now()
    day = now.day
    cal = calendar.HTMLCalendar()
    cal.setfirstweekday(6)
    month_table = cal.formatmonth(year, month)
    soup = BeautifulSoup(month_table)

    outfile = open("myHTML.html", 'w')
    for data in soup.findAll('td'):
        if data['class'] != "noday":
            days = data.findAll(text=True)
            for oneday in days:
                day = NavigableString(oneday)
                oneday.extract()
                addatag = Tag(soup, 'input')
                addatag['type'] = "submit"
                addatag['name'] = "meetingday"
                addatag['value'] = day
                data.insert(0, addatag)
    outfile.write(soup.prettify())
    outfile.close()

    infile = open("myHTML.html", 'r')
    calfile = ""
    for line in infile:
        calfile = calfile + line
    infile.close()
    return calfile
def _set(self, topic, key, value, topic_attr=None):
    """Set key and value at topic

    :return: success status
    :rtype: bool
    """
    # In case it is an empty document
    if not unicode(self._soup).strip().startswith("<?xml"):
        self._soup.insert(0, NavigableString(self.HEADER))
    # In case settings root is not defined
    settings = self._soup.find(self.root)
    if settings is None:
        self._soup.insert(1, Tag(self._soup, self.root))
        settings = self._soup.find(self.root)
    # Add Topic
    topic_tag = self._set_element(settings, topic.lower(), attr=topic_attr)
    if topic_tag is None:
        return False
    # Add key and value
    key_tag = self._set_element(topic_tag, key.lower(), escape(value))
    # Add "" since XML may introduce whitespaces.
    # key_tag = self._set_element(topic_tag, key, '"{0}"'.format(value))
    return key_tag is not None
def sanitize_story(self, story_content):
    soup = BeautifulSoup(story_content.strip())
    fqdn = Site.objects.get_current().domain
    for iframe in soup("iframe"):
        url = dict(iframe.attrs).get('src', "")
        youtube_id = self.extract_youtube_id(url)
        if youtube_id:
            a = Tag(soup, 'a', [('href', url)])
            img = Tag(soup, 'img', [
                ('style', "display: block; 'background-image': \"url(https://%s/img/reader/youtube_play.png), url(http://img.youtube.com/vi/%s/0.jpg)\"" % (fqdn, youtube_id)),
                ('src', 'http://img.youtube.com/vi/%s/0.jpg' % youtube_id),
            ])
            a.insert(0, img)
            iframe.replaceWith(a)
        else:
            iframe.extract()
    return unicode(soup)
def initServerInfoBase(fileName):
    """
    @description: Initializes soup for the Beautiful Soup parser. Reads the
                  existing data from the fileName parameter.
    @todo: None
    @param xml: String, Name of file to be loaded in soup.
    @return: Boolean, True if successful, else False
    """
    if os.path.exists(fileName):
        try:
            f = open(fileName, "r")
        except:
            return None, False
        xml = f.read()
        f.close()
        soup = BeautifulStoneSoup(xml)
        serverinfolist = soup.findAll("serverinfo")
    else:
        serverinfolist = []
        soup = BeautifulSoup()
        xml = "null"
    if len(serverinfolist) == 0:
        serverinfo = Tag(soup, "serverinfo")
        soup.insert(0, serverinfo)
    return soup, True
def add_noindex_to_a(text):
    doc = BeautifulSoup(text)
    host_orig = urlparse(settings.SITE_URL)[1]
    for a in doc.findAll('a'):
        try:
            host = urlparse(a['href'])[1]
        except:
            continue  # no usable href; skip instead of reusing a stale host
        if a.findParent('noindex') is None:
            if host != host_orig:
                noindex = Tag(doc, "noindex")
                a.replaceWith(noindex)
                a['rel'] = 'nofollow'
                noindex.insert(0, a)
    return unicode(doc)
def CreateMetaKeywords(self):
    '''Create the keywords meta tag'''
    meta = Tag(self.soup, 'meta')
    meta['name'] = 'keywords'
    meta['content'] = '%s%s' % (
        random.choice(self._GetFileLines('keywords1.txt')).strip(),
        random.choice(self._GetFileLines('keywords2.txt')).strip())
    return meta
def CreateImage(self):
    '''Create an img tag'''
    img = Tag(self.soup, 'img')
    img['src'] = self.urlImage
    img['alt'] = self.textShort.replace('|]', '||]')
    if self._Probability(30):
        img['title'] = self.textShort
    return img
def CreateListItem(self, liClass=''):
    '''Create an li tag'''
    li = Tag(self.soup, 'li')
    if liClass != '':
        li['class'] = liClass
    else:
        self.AppendIds(li, 0, 50)
    return li
def startElement(self, name, attrs):
    # print("startElement", name, attrs, dir(attrs))
    self.endData()
    tag = Tag(name, attrs.items(), self.currentTag, self.previous)
    if self.previous:
        self.previous.next = tag
    self.previous = tag
    self.pushTag(tag)
def linearize_states(soup, table):
    if table.get('id') == "linearize-states":
        ul = Tag(soup, "ul")
        ul["class"] = "text-level3"
        tag = None
        for tr in table.findAll("tr"):
            tr.name = "span"
            tr["class"] = "spaced"
            for td in tr.findAll("td"):
                if td["width"] == "40%":
                    td.name = "li"
                    tag = td
                else:
                    tag.append(td)
                    td.name = "ul"
            ul.append(tr)
        table.replaceWith(ul)
def get_list_for_key(name, children):
    """
    Takes a key and a dictionary containing its children and recursively
    generates HTML list items. Each item will contain the name and, if it
    has children, an unordered list containing those child items.
    """
    li = Tag(SOUP, "li")
    li.append(NavigableString(name))
    if children:
        ul = Tag(SOUP, "ul")
        for k, v in children.items():
            ul.append(get_list_for_key(k, v))
        li.append(ul)
    return li
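# Usage sketch (illustrative): SOUP is the module-level BeautifulSoup instance
# the function expects, and the nested dict is invented.
SOUP = BeautifulSoup()
tree = {'cats': {}, 'dogs': {'beagle': {}}}
print get_list_for_key('animals', tree)
# -> <li>animals<ul><li>cats</li><li>dogs<ul><li>beagle</li></ul></li></ul></li>
#    (item order follows dict iteration order)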