from cgi import escape

import markdown
from bs4 import BeautifulSoup
from pygments import formatters, highlight, lexers


def pygments_markdown(content):
    # Flatten the per-lexer alias tuples into one tuple of known lexer names.
    _lexer_names = reduce(lambda a, b: a + b[2], lexers.LEXERS.itervalues(), ())
    _formatter = formatters.HtmlFormatter(cssclass='highlight')

    html = markdown.markdown(content)
    # Using html.parser to prevent bs4 adding an <html> tag
    soup = BeautifulSoup(html, 'html.parser')

    # Refuse to process content that already contains structural HTML tags.
    for tag in ("script", "html", "head", "title", "div", "hr",
                "article", "header", "footer"):
        if soup.findAll(tag):
            return escape(content)

    for pre in soup.findAll('pre'):
        if not pre.code:
            continue
        txt = unicode(pre.code.text)

        # A code block may start with a ':::lexer' hint, e.g. ':::python'.
        lexer_name = "text"
        if txt.startswith(':::'):
            lexer_name, txt = txt.split('\n', 1)
            lexer_name = lexer_name.split(':::')[1]
            if lexer_name not in _lexer_names:
                lexer_name = "text"
        lexer = lexers.get_lexer_by_name(lexer_name, stripnl=True, encoding='UTF-8')

        # Markdown escaped the code; undo the HTML entities before highlighting.
        if txt.find("&lt;") != -1 or txt.find("&gt;") != -1:
            txt = txt.replace("&lt;", "<").replace("&gt;", ">")
        if txt.find("&amp;") != -1:
            txt = txt.replace("&amp;", "&")

        highlighted = highlight(txt, lexer, _formatter)
        div_code = BeautifulSoup(highlighted, 'html.parser').div
        if not div_code:
            return content
        pre.replaceWith(div_code)

    return unicode(soup)
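A quick usage sketch (the input string here is a made-up example): indented Markdown code blocks that begin with a ':::lexer' hint get syntax-highlighted, everything else passes through as ordinary Markdown.

content = "Some code:\n\n    :::python\n    print 'hello'\n"
print pygments_markdown(content)  # emits HTML containing a <div class="highlight"> block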
    # Requires: os, re, urllib, urlparse, BeautifulSoup, and the App Engine
    # urlfetch and template modules imported at module level.
    def get(self):
        shareURL = urllib.unquote(self.request.get('u'))
        shareSelection = urllib.unquote(self.request.get('s'))

        # Secure pages can't be handled by the bookmarklet, so bail out early.
        if shareURL.startswith('https'):
            path = os.path.join(os.path.dirname(__file__), '../views/bookmarklet-exit.html')
            self.response.out.write(template.render(path, {'message': 'Oops! This is a secure page :('}))
            return

        shareURLParts = urlparse.urlparse(shareURL)
        if shareURLParts[2] != '':
            shareURLDir = re.search('(/.*)', shareURLParts[2]).group(0)
        else:
            shareURLDir = ''

        page = urlfetch.fetch(shareURL)
        pageSoup = BeautifulSoup(page.content)

        # Prefer the page's own <title>; fall back to the title the bookmarklet passed in.
        try:
            shareTitle = pageSoup.html.head.title.string
        except AttributeError:
            shareTitle = urllib.unquote(self.request.get('t'))

        # Rewrite relative image URLs so they still resolve when rendered on our page.
        pageImgs = pageSoup.findAll('img')
        for image in pageImgs:
            image['src'] = urlparse.urljoin(shareURL, image['src'])

        template_values = {
            'url': shareURL,
            'title': shareTitle,
            'selection': shareSelection,
            'images': pageImgs
        }

        # We get the template path then show it
        path = os.path.join(os.path.dirname(__file__), '../views/bookmarklet.html')
        self.response.out.write(template.render(path, template_values))
import re
import sys
import urllib2
from urlparse import urljoin

from bs4 import BeautifulSoup

if len(sys.argv) != 2:
    print "You should pass the input filename"
    sys.exit()

for url in file(sys.argv[1]):
    url = url.strip()
    # Skip blank lines and comments.
    if not url or url[0] == '#':
        continue
    #print "Processing: " + url
    html = urllib2.urlopen(url)
    soup = BeautifulSoup(html)
    # Look for an autodiscovery <link> advertising an Atom or RSS feed.
    res = soup.findAll('link', rel='alternate',
                       attrs={'type': re.compile(r"^application/(atom|rss)\+xml")})
    if len(res) == 0:
        #print "Couldn't find the feed!"
        continue
    href = res[0]['href']
    # Relative link? Resolve it against the page URL.
    if not href.startswith("http"):
        link = urljoin(url, href)
    else:
        link = href
    print link