def text(self):

    """
    Pull the raw plain text out of the file at `self.path`, choosing the
    extraction strategy from `self.libmagic_file_type`.

    Returns:
        str: The text content, or None when the file type is reported as
        'inode/x-empty'.
    """

    mime = self.libmagic_file_type

    # Empty file - nothing to extract.
    if mime == 'inode/x-empty':
        return None

    # Plaintext - read straight from disk.
    if mime == 'text/plain':
        with open(self.path, 'r') as handle:
            contents = handle.read()
        return contents

    # HTML - delegate to the markup helper.
    if mime == 'text/html':
        return utils.html_text(self.path)

    # PDF - delegate to the PDF helper.
    if mime == 'application/pdf':
        return utils.pdf_text(self.path)

    # Any other reported type falls through to the docx helper.
    return utils.docx_text(self.path)
def test_extract_text(mock_osp):

    """
    The text wrapped by an HTML tag should come back without the markup.
    """

    # Register a one-paragraph HTML file with the mock and extract from it.
    html_path = mock_osp.add_file(content='<p>text</p>', ftype='html')
    extracted = html_text(html_path)

    assert extracted == 'text'
def test_ignore_custom_tags(mock_osp):

    """
    Content of tags named in the `excluded` argument should be left out.
    """

    markup = """ <h1>h1</h1> <h2>h2</h2> <h3>h3</h3> """
    skipped_tags = ['h1', 'h2']

    html_path = mock_osp.add_file(content=markup, ftype='html')

    # Surrounding whitespace is not under test, so trim before comparing.
    extracted = html_text(html_path, skipped_tags)
    assert extracted.strip() == 'h3'
def test_ignore_scripts_and_styles(mock_osp):

    """
    With no explicit exclusions, the contents of <script> and <style>
    tags should not appear in the extracted text.
    """

    markup = """ <style>style</style> <script>script</script> <p>text</p> """

    html_path = mock_osp.add_file(content=markup, ftype='html')

    # Surrounding whitespace is not under test, so trim before comparing.
    extracted = html_text(html_path)
    assert extracted.strip() == 'text'