def test_get_text_from_non_leaf(self):
    """Search a non-leaf ``div`` with ``findAll('div', text=True)``.

    The ``div`` in the fixture holds no string of its own, only two
    ``span`` children. The test expects the search to produce exactly two
    results, the first of which equals ``u'Text 01'``.
    """
    # NOTE(review): the fixture closes its tags as </html></body>, the
    # reverse of the order they were opened in. The literal is kept
    # unchanged; confirm whether the malformed tail is intentional.
    soup = BeautifulSoup('<html><body>'
                         '<div>'
                         '<span>Text 01</span>'
                         '<span>Text 02</span>'
                         '</div>'
                         '</html></body>')
    text = soup.findAll('div', text=True)

    # assertEqual replaces the deprecated failUnless(a == b) form so that a
    # failure shows both operands instead of a bare "False is not true".
    self.assertEqual(len(text), 2)
    self.assertEqual(text[0], u'Text 01')
def clean_content(self, content):
    """Normalise raw HTML text and return it as a parsed document.

    Steps, in order:

    1. Replace line breaks, tabs, ``<br>`` tags and a fixed set of HTML
       entities with plain characters.
    2. Collapse runs of spaces and drop spaces between adjacent tags.
    3. Parse the result with ``BeautifulSoup``.
    4. Remove comment nodes and ``meta``/``link``/``style``/``script``
       elements from the parsed tree.

    Args:
        content: The markup to clean, as a string.

    Returns:
        The parsed ``BeautifulSoup`` object after clean-up, or ``None``
        when ``content`` is empty or ``None``.
    """
    if not content:
        return None

    # An ordered sequence of pairs is used instead of a dict so that the
    # replacements always run in the same order. The ampersand entities go
    # last: decoding them first would turn e.g. "&amp;quot;" into "&quot;"
    # and then into a quote, decoding the text twice.
    # NOTE(review): the entity spellings below were reconstructed from a
    # copy of this table in which the entities had been decoded to their
    # literal characters -- confirm them against the upstream source.
    replacements = (
        ('\n', ' '),
        ('\r', ''),
        ('\t', ''),
        ('<br>', ' '),
        ('<br/>', ' '),
        ('&quot;', '"'),
        ('&#34;', '"'),
        ('&rsquo;', "'"),
        ('&#39;', "'"),
        ('&apos;', "'"),
        ('&ndash;', '-'),
        ('&nbsp;', ' '),
        ('&amp;', '&'),
        ('&#38;', '&'),
    )
    for old, new in replacements:
        content = content.replace(old, new)

    # Collapse runs of spaces, then remove spaces that sit between tags.
    content = re.sub(' {2,}', ' ', content)
    content = re.sub('>( *)<', '><', content)

    soup = BeautifulSoup(content)

    # findAll returns a list, so extracting nodes while looping over it
    # does not disturb the iteration.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Drop elements that carry no visible page content.
    for tag in ('meta', 'link', 'style', 'script'):
        for element in soup.findAll(tag):
            element.extract()

    return soup
def clean_content(self, content):
    """Normalise raw HTML text and return it as a parsed document.

    Steps, in order:

    1. Replace line breaks, tabs, ``<br>`` tags and a fixed set of HTML
       entities with plain characters.
    2. Collapse runs of spaces and drop spaces between adjacent tags.
    3. Parse the result with ``BeautifulSoup``.
    4. Remove comment nodes and ``meta``/``link``/``style``/``script``
       elements from the parsed tree.

    Args:
        content: The markup to clean, as a string.

    Returns:
        The parsed ``BeautifulSoup`` object after clean-up, or ``None``
        when ``content`` is empty or ``None``.
    """
    if not content:
        return None

    # An ordered sequence of pairs is used instead of a dict so that the
    # replacements always run in the same order. The ampersand entities go
    # last: decoding them first would turn e.g. "&amp;quot;" into "&quot;"
    # and then into a quote, decoding the text twice.
    # NOTE(review): the entity spellings below were reconstructed from a
    # copy of this table in which the entities had been decoded to their
    # literal characters -- confirm them against the upstream source.
    replacements = (
        ('\n', ' '),
        ('\r', ''),
        ('\t', ''),
        ('<br>', ' '),
        ('<br/>', ' '),
        ('&quot;', '"'),
        ('&#34;', '"'),
        ('&rsquo;', "'"),
        ('&#39;', "'"),
        ('&apos;', "'"),
        ('&ndash;', '-'),
        ('&nbsp;', ' '),
        ('&amp;', '&'),
        ('&#38;', '&'),
    )
    for old, new in replacements:
        content = content.replace(old, new)

    # Collapse runs of spaces, then remove spaces that sit between tags.
    content = re.sub(' {2,}', ' ', content)
    content = re.sub('>( *)<', '><', content)

    soup = BeautifulSoup(content)

    # findAll returns a list, so extracting nodes while looping over it
    # does not disturb the iteration.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Drop elements that carry no visible page content.
    for tag in ('meta', 'link', 'style', 'script'):
        for element in soup.findAll(tag):
            element.extract()

    return soup
def check_result_url(self, url, check_string): if url in self.results_cache.keys(): return self.results_cache[url] else: elements = None try: time.sleep(5) page = self.browser.get_page(url) page = self._clean_content(page) page = BeautifulSoup(page) elements = page.findAll(True, text=re.compile(check_string.lower())) except BrowserError, e: print 'ERROR: Browser error: %s' % e except Exception, e: print 'ERROR: Error checking error: %s' % e