def __getImageDocumentList(self):
        """
            Extract the images from a document and return a list of Grain instances.

            Every 'draw:image' element of the parsed content is inspected. An
            image becomes a Grain (graintype 'image') when:
              - its 'xlink:href' points inside the package, i.e. contains
                "Pictures" or "ObjectReplacements";
              - its name ends in .png, .gif or .jpg (case-insensitive);
              - no Grain with the same id has been collected before.
            Images referenced through an http(s) URL are skipped, since they
            cannot be read from the package.

            The caption is built from the text collected on the node that
            follows the parent of the 'draw:image' element, when there is one.

            Returns a list of Grain instances (empty when nothing was extracted).
        """
        image_list = []
        # the image references are kept in the draw:image elements
        for item in self.__parseContent.getElementsByTagName('draw:image'):
            if not item.hasAttribute("xlink:href"):
                continue
            path = item.getAttribute('xlink:href')

            # Linked images from a website must be ruled out first: a URL such
            # as "http://host/Pictures/a.png" would otherwise be taken for a
            # package member and fail when read from the zip file.
            if path.startswith("http://") or path.startswith("https://"):
                continue

            name = None
            if "Pictures" in path:
                # keep only the file name: "Pictures/abc.png" -> "abc.png"
                name = path.replace("Pictures/", "")
            elif "ObjectReplacements" in path:
                name = path.replace("./ObjectReplacements/", "")
                # removes the "./" of the path that could be "./ObjectReplacements/Object 2"
                path = path.replace("./", "")
            if name is None:
                continue

            # only the known raster image extensions are extracted
            extension = os.path.splitext(name)[1]
            if extension.lower() not in ('.png', '.gif', '.jpg'):
                continue

            # the same image can be referenced several times; keep it once
            if name in [image.getId() for image in image_list]:
                continue

            objGran = Grain(graintype='image')
            nChild = item.parentNode.nextSibling
            if nChild:
                text = []
                # TEXT_NODE is an integer constant: compare by value, not identity
                if nChild.nodeType == nChild.TEXT_NODE:
                    text.append(nChild.data)
                pieces = self.__getTextChildNodesImage(nChild, text)
                objGran.setCaption(''.join([t for t in pieces if t is not None]))
            imagefile = StringIO(self.__zipFile.read(path))
            objGran.setId(name)
            objGran.setContent(imagefile)
            image_list.append(objGran)
        return image_list
Exemple #2
0
        outputXMLFolder = os.path.join(self.__pathFolder, "outputXMLFolder")
        try:
            converterObj = ExecuteConverter.ExecuteConverter()
            converterObj.extractTables(pdfFile, outputXMLFolder)
            tableListStr = converterObj.getTableList()
        except Exception, e:
            return tableList

        i = 0
        for table in tableListStr:
            # generate table name
            i += 1
            tableId = "Table" + str(i) + ".html"
            # finally, the Grain is created and added to the list
            grainObj = Grain(graintype='table')
            grainObj.setId(tableId)
            grainObj.setContent(StringIO(table))
            grainObj.setMimetype("text/html")
            tableList.append(grainObj)

        return tableList

    ### Public Methods ###

    def getThumbnailsDocument(self):
        """
            Extracts the metadata from pdf files using 'convert' tool
        """
        os.system(
            'evince-thumbnailer -s 128 "' +
            os.path.join(self.__pathFolder, self.Document.getFilename()) +
Exemple #3
0
        outputXMLFolder = os.path.join(self.__pathFolder,"outputXMLFolder")
        try:
            converterObj = ExecuteConverter.ExecuteConverter()
            converterObj.extractTables(pdfFile, outputXMLFolder)
            tableListStr = converterObj.getTableList()
        except Exception, e:
            return tableList

        i = 0
        for table in tableListStr:
            # generate table name
            i+=1
            tableId = "Table" + str(i) + ".html"
            # finally, the Grain is created and added to the list
            grainObj = Grain(graintype='table')
            grainObj.setId(tableId)
            grainObj.setContent(StringIO(table))
            grainObj.setMimetype("text/html")
            tableList.append(grainObj)

        return tableList

    ### Public Methods ###

    def getThumbnailsDocument(self):
        """
            Generate a thumbnail of the document with the 'evince-thumbnailer'
            tool (size option 128) and load the resulting PNG into memory.

            The thumbnail is written to 'thumbnail.png' inside the working
            folder, read back into a StringIO and then deleted from disk.
        """
        # local import so this method does not depend on the module import block
        import subprocess

        source_path = os.path.join(self.__pathFolder, self.Document.getFilename())
        thumbnail_path = self.__pathFolder + '/thumbnail.png'

        # The arguments are passed as a list (no shell), so quotes, spaces or
        # shell metacharacters in the file or folder name cannot alter the command.
        try:
            subprocess.call(['evince-thumbnailer', '-s', '128', source_path, thumbnail_path])
        except OSError:
            # The tool could not be started. As with the former shell call, the
            # failure surfaces below when the missing thumbnail is opened.
            pass

        # PNG data is binary; the handle is always closed, even if read() fails
        thumbnail_file = open(thumbnail_path, 'rb')
        try:
            file_content = StringIO(thumbnail_file.read())
        finally:
            thumbnail_file.close()
        # NOTE(review): file_content is neither stored nor returned in the
        # visible code - confirm whether a trailing statement is missing here.
        os.remove(thumbnail_path)
    def __getTableDocumentList(self):
        """
            Extract the tables from a document and return a list of Grain instances.

            For every 'table:table' element that has a 'table:name', a new
            document is built from the empty template:
              - the table node is copied into the template's 'office:text';
              - the 'style:style' nodes whose name is used by the table are
                copied into 'office:automatic-styles';
              - the package images referenced inside the table are copied
                from the source zip file.
            Images linked through an http(s) URL are not copied, since they
            cannot be read from the package.

            The caption is made of the text of the nodes placed right before
            and right after the table, joined with a space.

            Returns a list of Grain instances (graintype 'table', mimetype
            "application/vnd.oasis.opendocument.text"); empty when the
            document has no named table.
        """
        table_list = []
        # empty template document; its content is opened as a zip archive below
        template_str = self.__createNewOOoDocument()
        tables = self.__parseContent.getElementsByTagName('table:table')
        stylesDoc = self.__parseContent.getElementsByTagName('style:style')
        for t in tables:
            table_name = t.getAttribute('table:name')
            if not table_name:
                # a table without a name never becomes a grain: skip it
                # before building a document that would be thrown away
                continue
            styles = self.__getAttributesR(t)

            # collect the package images referenced inside the table
            imgHrefs = []
            for img in t.getElementsByTagName("draw:image"):
                if not img.hasAttribute("xlink:href"):
                    continue
                path = img.getAttribute('xlink:href')
                # happens when it has an image from a website: nothing to copy
                if path.startswith("http://") or path.startswith("https://"):
                    continue
                if "ObjectReplacements" in path:
                    # remove the "./" of the path that could be "./ObjectReplacements/Object 2"
                    path = path.replace("./", "")
                imgHrefs.append(path)

            # extract legend: text of the previous and of the next sibling
            leg = []
            for sibling in (t.previousSibling, t.nextSibling):
                if sibling is None:
                    continue
                if sibling.hasChildNodes():
                    leg.append(''.join(self.__getTextChildNodesTable(sibling, text=[])))
                else:
                    leg.append(self.__getNodeText(sibling))
            # join the strings to make a single legend
            caption = ' '.join([piece for piece in leg if piece is not None])

            objGran = Grain(graintype='table')
            objGran.setCaption(caption)

            # start from a fresh copy of the empty template
            new_table = StringIO()
            new_table.write(template_str)
            template_odt = zipfile.PyZipFile(new_table, 'a')
            try:
                doc = parseString(template_odt.read('content.xml'))
            finally:
                template_odt.close()

            # copy the table node from a document to a new table grain
            office_text = doc.getElementsByTagName('office:text')[0]
            office_text.appendChild(doc.importNode(t, True))

            # copy the styles used by the table; the target element is only
            # looked up when there is at least one style to copy
            used_styles = [sty for sty in stylesDoc
                           if sty.getAttribute('style:name') in styles]
            if used_styles:
                automatic_styles = doc.getElementsByTagName('office:automatic-styles')[0]
                for sty in used_styles:
                    automatic_styles.appendChild(doc.importNode(sty, True))

            # write the images and the new content in a single pass
            # NOTE(review): 'content.xml' is appended to an archive that
            # already holds an entry with that name, so the archive ends up
            # with two 'content.xml' entries - confirm readers pick the last.
            template_odt = zipfile.PyZipFile(new_table, 'a')
            try:
                for image in imgHrefs:
                    template_odt.writestr(str(image), self.__zipFile.read(image))
                template_odt.writestr('content.xml', doc.toxml().encode('utf-8'))
            finally:
                template_odt.close()

            #objGran.setId(plone_utils.normalizeString(table_name))
            objGran.setId(table_name)
            objGran.setContent(new_table)
            objGran.setMimetype("application/vnd.oasis.opendocument.text")
            table_list.append(objGran)
        return table_list