Ejemplo n.º 1
0
    def test_non_ocr_pdf(self):
        """
            Access a valid PDF which hasn't been OCR'd.

            The test expects the extractor to return zero entries for
            such a file.
        """
        file_name = 'non_ocr_file.pdf'

        text_extractor = TextExtractor(
            source_file=file_name,
            source_directory=TextExtractorTest.test_directory,
            working_directory='/tmp',
            testing=True)

        actual_results = text_extractor.get_file_contents_as_array()

        # assertEqual, not the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(len(actual_results), 0)
Ejemplo n.º 2
0
    def test_valid_pdf(self):
        """
            Access a PDF fixture that contains extractable text.

            The extracted contents are expected to come back as a list of
            lines, each keeping its trailing newline, followed by a final
            blank line.
        """
        expected_results = [
            'Test 1\n',
            'Test 2\n',
            '\n',
        ]
        file_name = 'test_file1.pdf'

        text_extractor = TextExtractor(
            source_file=file_name,
            source_directory=TextExtractorTest.test_directory,
            working_directory='/tmp',
            testing=True)

        actual_results = text_extractor.get_file_contents_as_array()

        # assertEqual, not the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(expected_results, actual_results)