def extract(self, input_file):
    """Read a text file into a new ``Document``.

    The path is first passed through ``self._check_input_file`` and the
    value it returns is the one that gets opened.

    :param input_file: path of the file to read.
    :return: a ``Document`` whose ``content`` is the full text of the file.
    """
    input_file = self._check_input_file(input_file)

    document = Document()
    # The context manager closes the handle even when read() raises, which
    # a manual open()/close() pair does not guarantee.
    with open(input_file) as source:
        document.content = source.read()

    return document
Example #2
0
class TestPDFTextExtractor(unittest.TestCase):
    """Unit tests for the metadata and content accessors of ``Document``."""

    def setUp(self):
        """Create a new ``Document`` and store it on ``self.document``."""
        self.document = Document()

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_metadata_fields(self):
        """A metadata field that was set can be read back unchanged."""
        self.document.set_metadata_field('Name', 'Document name')
        # assertEqual replaces the removed failUnless alias and reports both
        # values when the comparison fails.
        self.assertEqual(
            self.document.get_metadata_field('Name'), 'Document name')

    def test_available_metadata(self):
        """``available_metadata`` lists every field that was set, once."""
        self.document.set_metadata_field('Name', 'Document name')
        self.document.set_metadata_field('CreationDate', 'Today')
        fields = self.document.available_metadata
        self.assertEqual(len(fields), 2)
        self.assertEqual(fields.count('Name'), 1)
        self.assertEqual(fields.count('CreationDate'), 1)

    def test_content(self):
        """Text assigned to ``content`` can be read back unchanged."""
        self.document.content = "Some text content"
        self.assertEqual(self.document.content, "Some text content")
class TestPDFTextExtractor(unittest.TestCase):
    """Unit tests for the metadata and content accessors of ``Document``."""

    def setUp(self):
        """Create a new ``Document`` and store it on ``self.document``."""
        self.document = Document()

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_metadata_fields(self):
        """A metadata field that was set can be read back unchanged."""
        self.document.set_metadata_field('Name', 'Document name')
        # assertEqual replaces the removed failUnless alias and reports both
        # values when the comparison fails.
        self.assertEqual(self.document.get_metadata_field('Name'),
                         'Document name')

    def test_available_metadata(self):
        """``available_metadata`` lists every field that was set, once."""
        self.document.set_metadata_field('Name', 'Document name')
        self.document.set_metadata_field('CreationDate', 'Today')
        fields = self.document.available_metadata
        self.assertEqual(len(fields), 2)
        self.assertEqual(fields.count('Name'), 1)
        self.assertEqual(fields.count('CreationDate'), 1)

    def test_content(self):
        """Text assigned to ``content`` can be read back unchanged."""
        self.document.content = "Some text content"
        self.assertEqual(self.document.content, "Some text content")
    def extract(self, input_file):
        """Extract the metadata and content of a PDF file into a ``Document``.

        Runs the external tool named by ``self._pdf_extraction_tool`` on the
        file and parses what it writes to standard output with BeautifulSoup.

        :param input_file: path of the PDF file; it is passed through
            ``self._check_input_file`` before use.
        :return: a ``Document`` filled in by ``self._extract_metadata`` and
            ``self._extract_content``.
        :raises ExtractionError: if the tool cannot be started, or if it
            writes nothing to its standard output.
        """
        input_file = self._check_input_file(input_file)
        # Extraction command and its options. They may be parametrized in the
        # future
        command = [self._pdf_extraction_tool, '-q', '-f', '1', '-l', '2',
                   '-enc', 'ASCII7', '-htmlmeta', input_file, '-']
        try:
            pop = subprocess.Popen(command, stdout=subprocess.PIPE)
        except OSError:
            log.error('PDF extraction tool not found') #@UndefinedVariable
            # Without a process there is nothing to read from; carrying on
            # would only fail later on the unbound ``pop`` name.
            raise ExtractionError('PDF extraction tool not found')

        stdout = pop.communicate()[0]
        # Popen never raises CalledProcessError, so the exit status has to be
        # read from the process once it has finished.
        if pop.returncode != 0:
            log.error('Error executing PDF text extraction tool. Return code: ' #@UndefinedVariable
                      + repr(pop.returncode))
        if not stdout:
            raise ExtractionError('Corrupted file')

        parser = BeautifulSoup(stdout)
        document = Document()
        self._extract_metadata(parser, document)
        self._extract_content(parser, document)

        return document
 def setUp(self):
     """Create a new ``Document`` and store it on ``self.document``."""
     self.document = Document()
Example #6
0
 def setUp(self):
     """Create a new ``Document`` and store it on ``self.document``."""
     self.document = Document()