def intro(self):
    """Return a short teaser (roughly 250 characters) built from ``self.body``.

    * A body that already fits within the limit is returned unchanged.
    * Without ``self.external_link`` the body is cut at a word boundary,
      suffixed with ``'...'`` and passed through ``Cleaner.clean_html`` so
      that tags left dangling by the cut are repaired.
    * With ``self.external_link`` all markup is stripped first; the plain
      text is returned as is when it fits, otherwise it is cut at a word
      boundary and suffixed with ``'...'``.
    """
    max_length = 250

    def truncate(text):
        # Cut at the limit, then drop the last (possibly partial) word.
        # thanks http://stackoverflow.com/questions/250357/smart-truncate-in-python
        return text[:max_length].rsplit(' ', 1)[0] + '...'

    # `<=`: a body of exactly max_length characters needs no truncation.
    if len(self.body) <= max_length:
        return self.body

    if not self.external_link:
        # Proper cleaning is done on save; here we only want to fix any
        # dangling tags caused by truncation, so the listed Cleaner options
        # are switched off (page_structure is left at its default).
        cleaner = Cleaner(scripts=False,
                          javascript=False,
                          comments=False,
                          links=False,
                          meta=False,
                          embedded=False,
                          frames=False,
                          forms=False,
                          annoying_tags=False,
                          remove_unknown_tags=False,
                          safe_attrs_only=False)
        return cleaner.clean_html(truncate(self.body))

    # woot http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
    try:
        from html.parser import HTMLParser  # Python 3
    except ImportError:
        from HTMLParser import HTMLParser  # Python 2

    # Tags whose text content must not leak into the teaser.
    hidden_tags = ('script', 'style', 'title')
    # blatent hack to beautify mailchimp weekly roundup imports
    boilerplate = ("Find out about what",
                   "s new in EWB this week!",
                   "|EWB",
                   " Weekly Roundup",
                   "Not displaying correctly?")

    class MLStripper(HTMLParser):
        """Collect the text nodes of a document, skipping hidden tags."""

        def __init__(self):
            # The base initialiser sets state that feed() relies on under
            # Python 3; calling only reset() is not enough there.
            # (HTMLParser is an old-style class on Python 2, so no super().)
            HTMLParser.__init__(self)
            # Keep the Python 2 parsing behaviour: entities are reported
            # separately (and ignored here) instead of being merged into
            # the text.  The boilerplate strings above presumably rely on
            # the text being split at an entity.
            self.convert_charrefs = False
            self.fed = []
            self.opentags = 0

        def handle_starttag(self, tag, attrs):
            if tag in hidden_tags:
                self.opentags += 1

        def handle_endtag(self, tag):
            if tag in hidden_tags:
                self.opentags -= 1

        def handle_data(self, d):
            if self.opentags == 0 and d not in boilerplate:
                self.fed.append(d)

        def get_data(self):
            return ''.join(self.fed)

    stripper = MLStripper()
    stripper.feed(self.body)
    text = stripper.get_data().replace("\n", "")
    text = re.sub(r' +', ' ', text)

    # Only shorten (and add the ellipsis) when something is actually cut.
    if len(text) <= max_length:
        return text
    return truncate(text)