def test_strip_javascript(self):
    # Assert that <script> elements are stripped.
    # The element and its text must disappear entirely; the expected
    # result for this input is a single space.
    source = " <script type=\"text/javascript\">text</script> "
    stripped = web.strip_javascript(source)
    self.assertEqual(stripped, " ")
    # Progress marker naming the function that was just exercised.
    print("pattern.web.strip_javascript()")
# Resolve the page name and its title.
# A name starting with "-" is a suffix of "pattern" ("-web" becomes
# "pattern-web", a bare "-" becomes "pattern"). The three cases are mutually
# exclusive: a rewritten name always starts with "pattern".
if p.startswith("-"):
    p = "pattern" + p.rstrip("-")
    title = p.replace("-", ".")
elif p == "stop-words":
    title = "Stop words"
elif p == "mbsp-tags":
    title = "Penn Treebank II tag set"
# Download the online documentation pages.
# A single pre-formatted argument prints the same text with the Python 2
# print statement and the Python 3 print function.
print("Retrieving %s" % (url + p))
html = URL(url + p).download(cached=False)
# Parse the actual documentation, we don't need the website header, footer, navigation, search.
# by_class(...)[0] raises IndexError if the page has no "node-type-page" element.
html = Document(html).by_id("content-area").by_class("node-type-page")[0].source
html = strip_javascript(html)
for block_start, block_end in (
        ('<div id="navbar">', '/#navbar -->'),
        ('<div id="sidebar-right">', '/#sidebar-right -->'),
        ('<div id="footer">', '/#footer -->'),
        ('<a class="twitter-share-button"', '</a>')):
    html = strip_between(block_start, block_end, html)
# Link to local pages and images.
# Link to online media.
html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url)  # MBSP docs (online)
# Examples stay online. These rewrites must run before the generic offline
# rule below, which would otherwise turn them into local .html links.
for online_href in (
        r'href="/pages/(pattern-examples.*?)"',
        r'href="/pages/(using-.*?)"',
        r'href="/pages/(modeling-.*?)"'):
    html = re.sub(online_href, r'href="%s\1"' % url, html)  # examples (online)
# Every remaining page link becomes a local "<name>.html" link, keeping the
# terminating '#' (anchor) or '"' (end of attribute). Inside a character
# class '|' is a literal, so the set is just [#"].
html = re.sub(r'href="/pages/(.*?)([#"])', r'href="\1.html\2', html)  # pages (offline)
html = html.replace('src="/media/', 'src="../g/')  # images (offline)
html = html.replace('src="/sites/all/themes/clips/g/', 'src="../g/')  # images (offline)
html = html.replace('href="/media/', 'href="%smedia/' % url.replace("pages/", ""))  # downloads (online)
# Apply the simplified template + set page titles.