def _is_pdf(self, http_resp): ''' :param http_resp: A http response object that contains a document of type HTML / PDF / WML / etc. :return: True if the document parameter is a string that contains a PDF document. ''' if http_resp.content_type in ('application/x-pdf', 'application/pdf'): document = http_resp.body # With the objective of avoiding this bug: # https://sourceforge.net/tracker/?func=detail&atid=853652&aid=2954220&group_id=170274 # I perform this safety check: if not document: return False # Some PDF files don't end with %%EOF, they end with # things like %%EOF\n , or %%EOF\r, or %%EOF\r\n. # So... just to be sure I search in the last 12 characters. if document.startswith('%PDF-') and '%%EOF' in document[-12:]: try: text = pdf_to_text(document) except Exception: return False else: return text != u'' return False
def _get_pdf_content(self, document_str):
    '''
    Extract the text of a PDF document and break it into words.

    :param document_str: The PDF document, passed unchanged to
                         pdf_to_text.
    :return: A list containing the words in the PDF, obtained by
             splitting the extracted text on whitespace.
    '''
    return pdf_to_text(document_str).split()
def test_pdf_to_text_no_pdf(self):
    # Input that is not a PDF document is expected to produce an empty
    # string from pdf_to_text.
    self.assertEqual('', pdf_to_text('hello world'))
def test_pdf_to_text(self):
    # Read the sample through a context manager so the file handle is
    # closed as soon as its content has been read, instead of being left
    # open until garbage collection. Binary mode hands the raw bytes of
    # the sample to pdf_to_text without any newline translation.
    with open(self.SIMPLE_SAMPLE, 'rb') as sample_file:
        sample_content = sample_file.read()

    text = pdf_to_text(sample_content)

    # Both words must be present in the extracted text.
    self.assertIn('Hello', text)
    self.assertIn('World', text)