Esempio n. 1
0
 def testGetTablesImagesFromProtectPDF(self):
   """Test that a protected PDF yields False for both tables and images.

   Builds a PDFGranulator from ./data/test_protect.pdf and checks that the
   results of getTablesMatrix() and getImageItemList() both compare equal
   to False.
   """
   # Read the PDF as binary and close the handle deterministically; a bare
   # open(...).read() leaves the file open until garbage collection.
   with open('./data/test_protect.pdf', 'rb') as pdf_file:
     data = pdf_file.read()
   pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
   tables_matrix = pdfgranulator.getTablesMatrix()
   image_list = pdfgranulator.getImageItemList()
   self.assertEqual(tables_matrix, False)
   self.assertEqual(image_list, False)
Esempio n. 2
0
 def testGetImageItemList(self):
   """Test that getImageItemList() returns the expected list of images.

   Uses ./data/test.pdf and checks the first entry's name, the last
   entry's name and the total number of entries (12).
   """
   # Read the PDF as binary and close the handle deterministically; a bare
   # open(...).read() leaves the file open until garbage collection.
   with open('./data/test.pdf', 'rb') as pdf_file:
     data = pdf_file.read()
   pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
   image_list = pdfgranulator.getImageItemList()
   # assertEqual replaces the deprecated assertEquals alias, matching the
   # spelling used by the other tests.
   self.assertEqual(image_list[0][0], '001-pag001.png')
   self.assertEqual(image_list[-1][0], '012-pag004.jpg')
   self.assertEqual(len(image_list), 12)
Esempio n. 3
0
 def testGetTable(self):
   """Test that getTable() returns the HTML of the table with a given id.

   Uses ./data/granulate_test.pdf and requests the table identified by
   'Tabela 1 - pag 1'.
   """
   # Read the PDF as binary and close the handle deterministically; a bare
   # open(...).read() leaves the file open until garbage collection.
   with open('./data/granulate_test.pdf', 'rb') as pdf_file:
     data = pdf_file.read()
   pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
   table = pdfgranulator.getTable('Tabela 1 - pag 1')
   # NOTE(review): the e-mail text in the expected HTML looks obfuscated in
   # this copy of the test; confirm it against the fixture before relying
   # on it.
   expected_html = (
     '<html><body><h1> Tabela 1 - pag 1 </h1><table>'
     '<tr><td> Name </td><td> Phone </td><td> Email </td></tr>'
     '<tr><td> Hugo </td><td> +55 (22) 8888-8888 </td><td> [email protected] </td></tr>'
     '<tr><td> Rafael </td><td>+55 (22) 9999-9999 </br>+55 (22) 9999-9999 '
     '</br>+55 (22) 9999-9999 </br></td><td> [email protected] </td></tr></table></body></html>')
   self.assertEqual(table, expected_html)
Esempio n. 4
0
 def testGetTablesMatrix(self):
   """Test that getTablesMatrix() returns a matrix with all tables.

   Uses ./data/granulate_test.pdf. The expected value is a dict keyed by
   table id, each value being a list of rows; a cell holding several lines
   is itself a list of strings.
   """
   # Read the PDF as binary and close the handle deterministically; a bare
   # open(...).read() leaves the file open until garbage collection.
   with open('./data/granulate_test.pdf', 'rb') as pdf_file:
     data = pdf_file.read()
   pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
   tables_matrix = pdfgranulator.getTablesMatrix()
   # NOTE(review): the e-mail cells look masked in this copy of the test;
   # confirm them against the fixture before relying on them.
   expected_matrix = {
     'Tabela 2: Soccer Teams - pag 2': [
       ['Name', 'Country'],
       ['Goytacaz', 'Brazil'],
       ['PSG', 'France'],
       ['Chelsea', 'England '],
       ['Barcelona', 'Spain'],
     ],
     'Tabela 1 - pag 1': [
       ['Name', 'Phone', 'Email'],
       ['Hugo', '+55 (22) 8888-8888', '*****@*****.**'],
       ['Rafael',
        ['+55 (22) 9999-9999', '+55 (22) 9999-9999', '+55 (22) 9999-9999'],
        '*****@*****.**'],
     ],
     'Table 1: Prices table from Mon Restaurant - pag 1': [
       ['Product', 'Price'],
       ['Pizza', 'R$ 25,00'],
       ['Petit Gateau', 'R$ 10,00'],
       ['Feijoada', 'R$ 30,00'],
     ],
   }
   self.assertEqual(tables_matrix, expected_matrix)
Esempio n. 5
0
  def granulateFile(self, data, source_format="odt"):
    """Granulate a document into its encoded tables and images.

    This function allows BD NSI's project to completely granulate a
    document file.

    Keyword arguments:
    data -- document content; in the PDF branch it is passed through
            decodestring() before being handed to PDFGranulator
    source_format -- format of data, compared case-insensitively with
                     "pdf"; any other value is handled through
                     self._getOOGranulator()

    Returns a list of encodestring() results. For a PDF that produces no
    grains, the string "This PDF is protect or has no grains" is returned
    instead.

    Raises NotImplementedError when any step of the non-PDF branch fails.
    """
    if source_format.lower() == "pdf":
      pdfgranulator = PDFGranulator(self._path_tmp_dir, decodestring(data), 'pdf',
                                    **self.kw)
      table_list = pdfgranulator.getTableItemList()
      grains = []
      # getTableItemList() signals "nothing to extract" with this sentinel
      # string instead of a list.
      if table_list != 'PDF Protect or have no Table Item List':
        grains = [encodestring(pdfgranulator.getTable(item))
                  for item in table_list]
      images = pdfgranulator.getImageItemList()
      if images != False:
        # XXX - encodestring cant convert list
        # NOTE(review): this encodes the textual repr of the image list one
        # character at a time, which is kept as-is to preserve the current
        # return value; the image content itself is never fetched here.
        grains += [encodestring(char) for char in str(images)]

      # XXX - if has no grains
      if grains == []:
        return "This PDF is protect or has no grains"
      return grains

    else:
      try:
        document = self._getOOGranulator(data, source_format)
        grains = [document.getTable(table[0], 'odt')
                  for table in document.getTableItemList()]
        # Extend with every fetched image. The previous code extended with
        # the loop variable (the last image-list entry) instead of the
        # collected images, and hit a NameError on an empty image list.
        grains += [document.getImage(image[0])
                   for image in document.getImageItemList()]
        return [encodestring(grain) for grain in grains]
      except Exception:
        # Callers still get NotImplementedError for any failure, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        raise NotImplementedError
Esempio n. 6
0
 def testGetTableItemList(self):
   """Test that getTableItemList() returns the list of table ids.

   Uses ./data/granulate_test.pdf and compares the result, order
   included, with the three expected ids.
   """
   # Read the PDF as binary and close the handle deterministically; a bare
   # open(...).read() leaves the file open until garbage collection.
   with open('./data/granulate_test.pdf', 'rb') as pdf_file:
     data = pdf_file.read()
   pdfgranulator = PDFGranulator(self.tmp_url, data, 'pdf', **self.kw)
   table_list = pdfgranulator.getTableItemList()
   expected_ids = [
     'Tabela 2: Soccer Teams - pag 2',
     'Tabela 1 - pag 1',
     'Table 1: Prices table from Mon Restaurant - pag 1',
   ]
   self.assertEqual(table_list, expected_ids)