Example #1
0
    def _is_pdf(self, http_resp):
        '''
        :param http_resp: A http response object that contains a document of
                          type HTML / PDF / WML / etc.

        :return: True if the document parameter is a string that contains a PDF
                 document.
        '''
        if http_resp.content_type in ('application/x-pdf', 'application/pdf'):
            document = http_resp.body

            #   With the objective of avoiding this bug:
            #   https://sourceforge.net/tracker/?func=detail&atid=853652&aid=2954220&group_id=170274
            #   I perform this safety check:
            if not document:
                return False

            #   Some PDF files don't end with %%EOF, they end with
            #   things like %%EOF\n , or %%EOF\r, or %%EOF\r\n.
            #   So... just to be sure I search in the last 12 characters.
            if document.startswith('%PDF-') and '%%EOF' in document[-12:]:
                try:
                    text = pdf_to_text(document)
                except Exception:
                    return False
                else:
                    return text != u''

        return False
Example #2
0
 def _get_pdf_content(self, document_str):
     '''
     Extract the text of a PDF document and break it into words.

     :param document_str: The PDF document, handed over unchanged to
                          pdf_to_text().

     :return: A list with the whitespace-separated tokens of the text
              returned by pdf_to_text().
     '''
     return pdf_to_text(document_str).split()
Example #3
0
File: pdf.py Project: weisst/w3af
 def _get_pdf_content(self, document_str):
     '''
     Convert a PDF document to text and return the words found in it.

     :param document_str: The PDF document, passed as-is to pdf_to_text().

     :return: A list of words, obtained by splitting the text returned by
              pdf_to_text() on whitespace.
     '''
     words = pdf_to_text(document_str).split()
     return words
Example #4
0
 def test_pdf_to_text_no_pdf(self):
     # Input that is not a PDF document is expected to produce an empty
     # string from pdf_to_text().
     self.assertEqual('', pdf_to_text('hello world'))
Example #5
0
 def test_pdf_to_text(self):
     # Read the sample through a context manager so the file handle is
     # closed deterministically. Binary mode is used because a PDF is not
     # text: it avoids newline translation on platforms where text mode
     # differs from binary mode.
     with open(self.SIMPLE_SAMPLE, 'rb') as sample_file:
         document = sample_file.read()

     text = pdf_to_text(document)

     # Both words are expected in the text extracted from the sample.
     self.assertIn('Hello', text)
     self.assertIn('World', text)