def extract(self, input_file):
    """Read a file and return its whole content wrapped in a Document.

    Args:
        input_file: File to read. It is first passed through
            ``self._check_input_file`` and the returned value is the one
            actually opened.

    Returns:
        A new ``Document`` whose ``content`` attribute holds everything
        read from the file.
    """
    input_file = self._check_input_file(input_file)
    document = Document()
    # The context manager closes the handle even when read() raises, which
    # the previous manual open()/close() pair did not guarantee.
    with open(input_file) as source:
        document.content = source.read()
    return document
class TestPDFTextExtractor(unittest.TestCase):
    """Unit tests for metadata and content handling of ``Document``."""

    def setUp(self):
        # Every test starts from a fresh, empty document.
        self.document = Document()

    def test_metadata_fields(self):
        """A metadata field that was set can be read back unchanged."""
        self.document.set_metadata_field('Name', 'Document name')
        self.assertEqual(self.document.get_metadata_field('Name'),
                         'Document name')

    def test_available_metadata(self):
        """``available_metadata`` lists each field that was set exactly once."""
        self.document.set_metadata_field('Name', 'Document name')
        self.document.set_metadata_field('CreationDate', 'Today')
        fields = self.document.available_metadata
        self.assertEqual(len(fields), 2)
        self.assertEqual(fields.count('Name'), 1)
        self.assertEqual(fields.count('CreationDate'), 1)

    def test_content(self):
        """Text assigned to ``content`` can be read back unchanged."""
        self.document.content = "Some text content"
        self.assertEqual(self.document.content, "Some text content")
class TestPDFTextExtractor(unittest.TestCase):
    """Unit tests for metadata and content handling of ``Document``."""

    def setUp(self):
        # Every test starts from a fresh, empty document.
        self.document = Document()

    def test_metadata_fields(self):
        """A metadata field that was set can be read back unchanged."""
        self.document.set_metadata_field('Name', 'Document name')
        self.assertEqual(self.document.get_metadata_field('Name'),
                         'Document name')

    def test_available_metadata(self):
        """``available_metadata`` lists each field that was set exactly once."""
        self.document.set_metadata_field('Name', 'Document name')
        self.document.set_metadata_field('CreationDate', 'Today')
        fields = self.document.available_metadata
        self.assertEqual(len(fields), 2)
        self.assertEqual(fields.count('Name'), 1)
        self.assertEqual(fields.count('CreationDate'), 1)

    def test_content(self):
        """Text assigned to ``content`` can be read back unchanged."""
        self.document.content = "Some text content"
        self.assertEqual(self.document.content, "Some text content")
def extract(self, input_file):
    """Extract metadata and content from a PDF file into a Document.

    Runs the external tool configured in ``self._pdf_extraction_tool``,
    captures what it writes to standard output, parses that output with
    BeautifulSoup and fills a new ``Document`` from the parsed result.

    Args:
        input_file: File to process. It is first passed through
            ``self._check_input_file`` and the returned value is the one
            handed to the extraction tool.

    Returns:
        A new ``Document`` populated by ``self._extract_metadata`` and
        ``self._extract_content``.

    Raises:
        ExtractionError: If the extraction tool cannot be started, or if
            it produces no output.
    """
    input_file = self._check_input_file(input_file)
    # Extraction command and its options. They may be parametrized in the
    # future
    # NOTE(review): the flags look like pdftotext's (quiet, first/last page,
    # output encoding, HTML metadata, output to stdout) -- confirm against
    # the tool actually configured.
    command = [self._pdf_extraction_tool, '-q', '-f', '1', '-l', '2',
               '-enc', 'ASCII7', '-htmlmeta', input_file, '-']
    try:
        pop = subprocess.Popen(command, stdout=subprocess.PIPE)
    except subprocess.CalledProcessError as cpe:
        log.error('Error executing PDF text extraction tool. Return code: '  #@UndefinedVariable
                  + repr(cpe.returncode))
        # Without a process there is nothing to read; fail explicitly
        # instead of falling through to an unbound ``pop``.
        raise ExtractionError('Error executing PDF text extraction tool')
    except OSError:
        log.error('PDF extraction tool not found')  #@UndefinedVariable
        # Previously execution continued and crashed with an
        # UnboundLocalError on ``pop``; report the real cause instead.
        raise ExtractionError('PDF extraction tool not found')

    stdout = pop.communicate()[0]
    if not stdout:
        # An empty output is treated as an unreadable input file.
        raise ExtractionError('Corrupted file')

    parser = BeautifulSoup(stdout)
    document = Document()
    self._extract_metadata(parser, document)
    self._extract_content(parser, document)
    return document
def setUp(self):
    """Give the test case a newly created ``Document`` to work on."""
    fresh_document = Document()
    self.document = fresh_document