def setUp(self):
    """Prepare the extractor, the fixture paths and the extracted article.

    Sets ``self.extractor`` to a new ``PDFTextExtractor``, stores the
    normalized paths of the ``scanned.pdf``, ``corrupt.pdf`` and
    ``article.pdf`` fixtures, and stores the result of extracting the
    article fixture in ``self.document``.
    """
    self.extractor = PDFTextExtractor()

    here = dirname(__file__)

    def locate(relative):
        # Fixture paths are written relative to this file's directory.
        return normpath(join(here, relative))

    self.scanned = locate('../../../../tests/'
                          'fixtures/extraction/scanned.pdf')
    self.corrupt = locate('../../../../tests/'
                          'fixtures/extraction/corrupt.pdf')
    self.article = locate('../../../../tests/'
                          'fixtures/extraction/article.pdf')

    # The article is extracted once here; the result is kept on the instance.
    self.document = self.extractor.extract(self.article)
class TestPDFTextExtractor(unittest.TestCase):
    """Tests for ``PDFTextExtractor.extract`` against the PDF fixtures.

    Uses ``assertRaises`` / ``assertEqual`` rather than the deprecated
    ``failUnless*`` aliases, which were removed from ``unittest`` in
    Python 3.12.
    """

    @staticmethod
    def _fixture_path(name):
        """Return the normalized path of the extraction fixture *name*."""
        return normpath(join(dirname(__file__),
                             '../../../../tests/fixtures/extraction',
                             name))

    def setUp(self):
        """Create the extractor, resolve fixture paths, extract the article."""
        self.extractor = PDFTextExtractor()
        self.scanned = self._fixture_path('scanned.pdf')
        self.corrupt = self._fixture_path('corrupt.pdf')
        self.article = self._fixture_path('article.pdf')
        # Extracted up front; the metadata and content tests read it.
        self.document = self.extractor.extract(self.article)

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_extract_non_existent_file(self):
        """Extracting a path that does not exist raises ``IOError``."""
        self.assertRaises(IOError, self.extractor.extract, 'some_file.pdf')

    def test_extract_scanned_file(self):
        """Extracting the scanned fixture raises ``ExtractionError``."""
        self.assertRaises(ExtractionError, self.extractor.extract,
                          self.scanned)

    def test_extract_corrupt_file(self):
        """Extracting the corrupt fixture raises ``ExtractionError``."""
        self.assertRaises(ExtractionError, self.extractor.extract,
                          self.corrupt)

    def test_metadata_extraction(self):
        """The article's ``Title`` and ``CreationDate`` metadata are read."""
        # assertEqual reports both values on failure, unlike a bare
        # truth check on ``a == b``.
        self.assertEqual(self.document.get_metadata_field('Title'),
                         'PII: S0925-2312(00)00293-9')
        self.assertEqual(self.document.get_metadata_field('CreationDate'),
                         '20001019095743')

    def test_content_extraction(self):
        """Two known phrases each occur exactly once in the content."""
        content = self.document.content
        self.assertEqual(
            content.count('In this paper we discuss'
                          ' the use of boundary methods'),
            1)
        self.assertEqual(
            content.count('Army Research Lab '
                          'Programming Environment and Training program'),
            1)