def text(self):
        """
        Return the plain-text content of the file at `self.path`.

        The extraction route is picked from `self.libmagic_file_type`.

        Returns:
            str: The text content, or None when the detected type is
            'inode/x-empty'.
        """

        mime = self.libmagic_file_type

        # An empty file has nothing to extract.
        if mime == 'inode/x-empty':
            return None

        # Plain text is read directly from disk.
        if mime == 'text/plain':
            with open(self.path, 'r') as handle:
                contents = handle.read()
            return contents

        # HTML is delegated to the HTML extractor.
        if mime == 'text/html':
            return utils.html_text(self.path)

        # PDF is delegated to the PDF extractor.
        if mime == 'application/pdf':
            return utils.pdf_text(self.path)

        # Any other type falls through to the docx extractor.
        return utils.docx_text(self.path)
    def text(self):

        """
        Pull the raw plain text out of the file at `self.path`.

        `self.libmagic_file_type` decides which extractor is used.

        Returns:
            str: The text content, or None when the detected type is
            'inode/x-empty'.
        """

        kind = self.libmagic_file_type

        if kind == 'inode/x-empty':
            # Empty file: there is no text to return.
            extracted = None

        elif kind == 'text/plain':
            # Plaintext: read the file as-is.
            with open(self.path, 'r') as source:
                extracted = source.read()

        elif kind == 'text/html':
            # HTML: hand off to the HTML helper.
            extracted = utils.html_text(self.path)

        elif kind == 'application/pdf':
            # PDF: hand off to the PDF helper.
            extracted = utils.pdf_text(self.path)

        else:
            # Every remaining type is treated as docx.
            extracted = utils.docx_text(self.path)

        return extracted
Example #3
0
def test_extract_text(mock_osp):

    """
    The text wrapped by an HTML tag should come back without the markup.
    """

    markup = '<p>text</p>'
    html_path = mock_osp.add_file(content=markup, ftype='html')

    assert html_text(html_path) == 'text'
Example #4
0
def test_ignore_custom_tags(mock_osp):

    """
    Tags named in the second argument to `html_text` should be left out
    of the extracted text.
    """

    skipped_tags = ['h1', 'h2']

    markup = """
    <h1>h1</h1>
    <h2>h2</h2>
    <h3>h3</h3>
    """

    html_path = mock_osp.add_file(content=markup, ftype='html')
    extracted = html_text(html_path, skipped_tags)

    # Only the <h3> content survives once whitespace is trimmed.
    assert extracted.strip() == 'h3'
Example #5
0
def test_ignore_scripts_and_styles(mock_osp):

    """
    With no exclusions passed, the contents of <script> and <style>
    tags should not appear in the extracted text.
    """

    markup = """
    <style>style</style>
    <script>script</script>
    <p>text</p>
    """

    html_path = mock_osp.add_file(content=markup, ftype='html')
    extracted = html_text(html_path)

    # Only the <p> content survives once whitespace is trimmed.
    assert extracted.strip() == 'text'