def testGetTablesImagesFromProtectPDF(self):
    """Check that a protected PDF yields False for tables and images.

    Both getTablesMatrix() and getImageItemList() are expected to return
    False for the './data/test_protect.pdf' fixture.
    """
    # Read the fixture as binary and close the handle deterministically;
    # a PDF is not text, and the original left the file object open.
    with open('./data/test_protect.pdf', 'rb') as pdf_file:
        data = pdf_file.read()
    pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
    tables_matrix = pdfgranulator.getTablesMatrix()
    image_list = pdfgranulator.getImageItemList()
    self.assertEqual(tables_matrix, False)
    self.assertEqual(image_list, False)
def testGetImageItemList(self):
    """Check that getImageItemList() returns the expected image list.

    The './data/test.pdf' fixture is expected to produce 12 entries whose
    first element is the image file name, starting with '001-pag001.png'
    and ending with '012-pag004.jpg'.
    """
    # Read the fixture as binary and close the handle deterministically.
    with open('./data/test.pdf', 'rb') as pdf_file:
        data = pdf_file.read()
    pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
    image_list = pdfgranulator.getImageItemList()
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(image_list[0][0], '001-pag001.png')
    self.assertEqual(image_list[-1][0], '012-pag004.jpg')
    self.assertEqual(len(image_list), 12)
def testGetTable(self):
    """Check that getTable() returns the HTML of the table with a given id."""
    # Read the fixture as binary and close the handle deterministically.
    with open('./data/granulate_test.pdf', 'rb') as pdf_file:
        data = pdf_file.read()
    pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
    table = pdfgranulator.getTable('Tabela 1 - pag 1')
    # NOTE(review): the e-mail cells below look like redacted placeholder
    # text -- confirm they match what the fixture really produces.
    expected_html = (
        '<html><body><h1> Tabela 1 - pag 1 </h1><table>'
        '<tr><td> Name </td><td> Phone </td><td> Email </td></tr>'
        '<tr><td> Hugo </td><td> +55 (22) 8888-8888 </td><td> [email protected] </td></tr>'
        '<tr><td> Rafael </td><td>+55 (22) 9999-9999 </br>+55 (22) 9999-9999 '
        '</br>+55 (22) 9999-9999 </br></td><td> [email protected] </td></tr></table></body></html>'
    )
    self.assertEqual(table, expected_html)
def testGetTablesMatrix(self):
    """Check that getTablesMatrix() returns every table keyed by its id.

    Each value is a list of rows; a cell holding several lines is itself
    a list of strings (see the 'Rafael' phone cell).
    """
    # Read the fixture as binary and close the handle deterministically.
    with open('./data/granulate_test.pdf', 'rb') as pdf_file:
        data = pdf_file.read()
    pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
    tables_matrix = pdfgranulator.getTablesMatrix()
    # NOTE(review): the e-mail cells look like redacted placeholder text --
    # confirm they match what the fixture really produces.
    expected_matrix = {
        'Tabela 2: Soccer Teams - pag 2': [
            ['Name', 'Country'],
            ['Goytacaz', 'Brazil'],
            ['PSG', 'France'],
            ['Chelsea', 'England '],
            ['Barcelona', 'Spain']],
        'Tabela 1 - pag 1': [
            ['Name', 'Phone', 'Email'],
            ['Hugo', '+55 (22) 8888-8888', '*****@*****.**'],
            ['Rafael',
             ['+55 (22) 9999-9999',
              '+55 (22) 9999-9999',
              '+55 (22) 9999-9999'],
             '*****@*****.**']],
        'Table 1: Prices table from Mon Restaurant - pag 1': [
            ['Product', 'Price'],
            ['Pizza', 'R$ 25,00'],
            ['Petit Gateau', 'R$ 10,00'],
            ['Feijoada', 'R$ 30,00']],
    }
    self.assertEqual(tables_matrix, expected_matrix)
def granulateFile(self, data, source_format="odt"):
    """Granulate a whole document into a list of encoded grains.

    When ``source_format`` is "pdf" (case-insensitive), ``data`` is decoded
    with ``decodestring`` and handed to PDFGranulator; every listed table is
    fetched and encoded as one grain, followed by the image-list grains.
    Any other format is opened through ``self._getOOGranulator`` and yields
    the document's tables followed by its images, each encoded.

    Returns:
        A list of ``encodestring``-encoded grains. For a PDF that produced
        no grain at all, the string "This PDF is protect or has no grains".

    Raises:
        NotImplementedError: if anything fails on the non-PDF path.
    """
    if source_format.lower() == "pdf":
        pdfgranulator = PDFGranulator(self._path_tmp_dir, decodestring(data),
                                      'pdf', **self.kw)
        grains = []
        table_list = pdfgranulator.getTableItemList()
        # This sentinel string presumably comes back for protected or
        # table-less PDFs; in that case no table grain is produced.
        if table_list != 'PDF Protect or have no Table Item List':
            grains = [encodestring(pdfgranulator.getTable(item))
                      for item in table_list]
        images = pdfgranulator.getImageItemList()
        if images != False:
            # XXX - encodestring cant convert list
            # NOTE(review): str(images) is encoded one character at a time,
            # which is almost certainly not the intended grain format. Kept
            # as-is because the intended output cannot be told from here.
            grains += [encodestring(char) for char in str(images)]
        # XXX - if has no grains
        if not grains:
            return "This PDF is protect or has no grains"
        return grains

    try:
        document = self._getOOGranulator(data, source_format)
        grains = [document.getTable(table[0], 'odt')
                  for table in document.getTableItemList()]
        # Append the fetched image contents. The previous code added the
        # loop variable instead, dropping every image and failing with a
        # NameError when the document had no images.
        grains += [document.getImage(image[0])
                   for image in document.getImageItemList()]
        return [encodestring(grain) for grain in grains]
    except Exception:
        # Callers rely on NotImplementedError for any failure on this path.
        raise NotImplementedError
def testGetTableItemList(self):
    """Check that getTableItemList() returns the ids of all tables."""
    # Read the fixture as binary and close the handle deterministically.
    with open('./data/granulate_test.pdf', 'rb') as pdf_file:
        data = pdf_file.read()
    pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
    table_list = pdfgranulator.getTableItemList()
    expected_ids = [
        'Tabela 2: Soccer Teams - pag 2',
        'Tabela 1 - pag 1',
        'Table 1: Prices table from Mon Restaurant - pag 1',
    ]
    self.assertEqual(table_list, expected_ids)