def getThumbnailsDocument(self):
    """
        Render a thumbnail of the document and return its content.

        Runs the external 'evince-thumbnailer' tool (size 128) on the
        document file located in the working folder, loads the generated
        'thumbnail.png' into memory and deletes that temporary file.

        Returns a StringIO wrapping the raw bytes of the PNG file.
        Raises IOError (from open) when no thumbnail file was produced.
    """
    # Imported locally so this method does not depend on the file's
    # top-level import block.
    import subprocess

    source_path = os.path.join(self.__pathFolder,
                               self.Document.getFilename())
    thumbnail_path = self.__pathFolder + '/thumbnail.png'

    try:
        # An argument list (no shell) keeps file names containing spaces,
        # quotes or shell metacharacters from breaking or hijacking the
        # command line.
        subprocess.call(['evince-thumbnailer', '-s', '128',
                         source_path, thumbnail_path])
    except OSError:
        # The tool could not be started. os.system() used to ignore this
        # silently as well; the open() below then reports the missing
        # thumbnail with the same IOError callers already got before.
        pass

    # PNG is binary data, hence 'rb'.
    thumbnail_file = open(thumbnail_path, 'rb')
    try:
        file_content = StringIO(thumbnail_file.read())
    finally:
        # Always release the handle and drop the temporary file, even
        # when reading fails.
        thumbnail_file.close()
        os.remove(thumbnail_path)
    return file_content
converterObj = ExecuteConverter.ExecuteConverter() converterObj.extractTables(pdfFile, outputXMLFolder) tableListStr = converterObj.getTableList() except Exception, e: return tableList i = 0 for table in tableListStr: # generate table name i += 1 tableId = "Table" + str(i) + ".html" # finally, the Grain is created en added to the list grainObj = Grain(graintype='table') grainObj.setId(tableId) grainObj.setContent(StringIO(table)) grainObj.setMimetype("text/html") tableList.append(grainObj) return tableList ### Public Methods ### def getThumbnailsDocument(self): """ Extracts the metadata from pdf files using 'convert' tool """ os.system( 'evince-thumbnailer -s 128 "' + os.path.join(self.__pathFolder, self.Document.getFilename()) + '" ' + self.__pathFolder + '/thumbnail.png') file_content = StringIO(
def __getTableDocumentList(self):
    """
        Extract the tables from a document and return a list of Grain
        instances.

        For every 'table:table' element of the parsed content a new
        document is built from an empty template: the table node, the
        'style:style' nodes whose name is among the table attributes and
        the embedded images referenced by the table are copied into it.
        The text of the nodes right before and right after the table is
        joined and used as caption of the grain.

        Returns a list of Grain objects (graintype 'table'); the list is
        empty when no table grain was produced.
    """
    table_list = []
    # create an empty template
    template_str = self.__createNewOOoDocument()
    tables = self.__parseContent.getElementsByTagName('table:table')
    stylesDoc = self.__parseContent.getElementsByTagName('style:style')

    for t in tables:
        styles = self.__getAttributesR(t)
        table_name = t.getAttribute('table:name')

        # collect the package paths of the images used inside the table
        imgHrefs = []
        for img in t.getElementsByTagName("draw:image"):
            if not img.hasAttribute("xlink:href"):
                continue
            path = img.getAttribute('xlink:href')
            if "ObjectReplacements" in path:
                # remove the "./" of the path that could be
                # "./ObjectReplacements/Object 2"
                path = path.replace("./", "")
            elif path.startswith(("http://", "https://")):
                # image linked from a website: it is not stored inside
                # the package, so there is nothing to copy
                continue
            # an image referenced several times must be copied only once,
            # otherwise the archive gets duplicated entries
            if path not in imgHrefs:
                imgHrefs.append(path)

        # extract legend: text of the nodes surrounding the table
        objGran = Grain(graintype='table')
        leg = []
        for sibling in (t.previousSibling, t.nextSibling):
            if sibling is None:
                continue
            if sibling.hasChildNodes():
                leg.append(''.join(
                    self.__getTextChildNodesTable(sibling, text=[])))
            else:
                leg.append(self.__getNodeText(sibling))
        # join the strings to make a single legend
        caption = ' '.join([i for i in leg if i is not None])
        objGran.setCaption(caption)

        # Creating an empty File (in-memory copy of the template)
        new_table = StringIO()
        new_table.write(template_str)
        # The archive is opened once and closed once; every member is
        # appended through the same handle.
        template_odt = zipfile.PyZipFile(new_table, 'a')
        try:
            doc = parseString(template_odt.read('content.xml'))
            office_text = doc.getElementsByTagName('office:text')[0]
            # copy the table node from a document to a new table grain
            office_text.appendChild(doc.importNode(t, True))

            # copy the styles used by the table
            used_styles = [sty for sty in stylesDoc
                           if sty.getAttribute('style:name') in styles]
            if used_styles:
                office_automatic_styles = doc.getElementsByTagName(
                    'office:automatic-styles')[0]
                for sty in used_styles:
                    office_automatic_styles.appendChild(
                        doc.importNode(sty, True))

            # copy the images from the source package
            for image in imgHrefs:
                template_odt.writestr(str(image),
                                      self.__zipFile.read(image))

            # NOTE(review): the template already holds a 'content.xml'
            # member, so this appends a second member with the same name
            # instead of replacing the first one -- confirm consumers
            # pick the last entry.
            template_odt.writestr('content.xml',
                                  doc.toxml().encode('utf-8'))
        finally:
            template_odt.close()

        # NOTE(review): assumes the whole grain registration is guarded
        # by the table name (tables without 'table:name' are left out)
        # -- confirm against the original layout.
        if table_name:
            #objGran.setId(plone_utils.normalizeString(table_name))
            objGran.setId(table_name)
            objGran.setContent(new_table)
            objGran.setMimetype("application/vnd.oasis.opendocument.text")
            table_list.append(objGran)

    return table_list