def test_strip_element(self):
    """ Asserts that web.strip_element() strips <p> elements from a string.
    """
    # The input nests one <p> inside another and puts an upper-case <P></P>
    # pair inside <b>; the expected value keeps only the <b> wrapper.
    # NOTE(review): the input has one space before the first <p> and another
    # before <b>, yet a single space is expected - confirm against
    # strip_element() that this literal was not whitespace-collapsed.
    v = web.strip_element(" <p><p></p>text</p> <b><P></P></b>", "p")
    self.assertEqual(v, " <b></b>")
    # Progress marker. The parenthesised single-argument form prints the same
    # text under the Python 2 print statement and the Python 3 print function.
    print("pattern.web.strip_element()")
def _plaintext(string):
    """ Returns the given MediaWiki HTML source with meta elements and wiki markup removed.

        Adapted in slightly modified form from pattern, where it lives as a class method
        (there it is called internally from MediaWikiArticle.string).

        Steps, in order:
        - strip_element() is applied for each meta <table>, <div>, <sup> and <span>
          (table of contents, info box, navbox, message box, metadata, annotations,
          disambiguation link, thumbnails, ...);
        - LaTeX math images are replaced with a "[math]" placeholder;
        - the result is passed through plaintext();
        - "[edit]" / "[bewerken]" links and the whitespace after them are removed;
        - every "[" is preceded by exactly one inserted space.

        strip_element() and plaintext() are not defined in this function;
        they must be available in the enclosing module.
    """
    # (tag, attributes) pairs handed to strip_element(), in this exact order.
    meta_elements = (
        # Meta <table> elements.
        ("table", "id=\"toc"),                   # Table of contents.
        ("table", "class=\"infobox"),            # Infobox.
        ("table", "class=\"navbox"),             # Navbox.
        ("table", "class=\"mbox"),               # Message.
        ("table", "class=\"metadata"),           # Metadata.
        # NOTE(review): ".*?" looks like a regex; confirm that strip_element()
        # interprets the attributes argument as a pattern.
        ("table", "class=\".*?wikitable"),       # Table.
        ("table", "class=\"toc"),                # Table (usually footer).
        # Meta <div> elements.
        ("div", "id=\"toc"),                     # Table of contents.
        ("div", "class=\"infobox"),              # Infobox.
        ("div", "class=\"navbox"),               # Navbox.
        ("div", "class=\"mbox"),                 # Message.
        ("div", "class=\"metadata"),             # Metadata.
        ("div", "id=\"annotation"),              # Annotations.
        ("div", "class=\"dablink"),              # Disambiguation message.
        ("div", "class=\"magnify"),              # Thumbnails.
        ("div", "class=\"thumb "),               # Thumbnail captions.
        ("div", "class=\"barbox"),               # Bar charts.
        ("div", "class=\"mw-headline"),          # Headline elements.
        ("div", "class=\"noprint"),              # Hidden from print.
        ("sup", "class=\"noprint"),
        # Absolute elements (don't know their position).
        ("div", "style=\"position:absolute"),
        # Meta <span> elements.
        ("span", "class=\"error"),
    )
    s = string
    for tag, attributes in meta_elements:
        s = strip_element(s, tag, attributes)
    # Strip math formulas, add [math] placeholder.
    s = re.sub(r"<img class=\"tex\".*?/>", "[math]", s) # LaTex math images.
    s = plaintext(s)
    # Strip [edit] link (language dependent.)
    s = re.sub(r"\[edit\]\s*", "", s)
    s = re.sub(r"\[bewerken\]\s*", "", s)
    # Insert space before inline references. The first replace() doubles the
    # space wherever "[" already followed one, so the second collapses the
    # resulting two spaces back to a single space.
    s = s.replace("[", " [").replace("  [", " [")
    # ignore lists?
    #s = " ".join([line.strip() for line in s.split('\n') if not line.strip().startswith('*')])
    return s