def create_image_grains_list(self, imageList, timeList):
    """
    Build one image Grain per entry of `imageList`.

    Each image is saved as PNG into a fresh in-memory buffer and wrapped in
    a Grain whose id is "shot<index>.png". The grain description is the
    string form of the entry of `timeList` at the same index, so `timeList`
    must be at least as long as `imageList` (an IndexError is raised
    otherwise).

    Returns the list of grains, in the order of `imageList`.
    """
    grains = []
    for index, image in enumerate(imageList):
        grain_id = "shot" + str(index) + ".png"
        # Save the image into memory; the buffer becomes the grain content.
        png_buffer = StringIO()
        image.save(png_buffer, "PNG")
        grain = Grain(id=grain_id, content=png_buffer, graintype='image')
        grain.description = str(timeList[index])
        grains.append(grain)
    return grains
def granulate(self):
    """
    Granulate the SVG file held in `self.__svgfile` into SVG grains.

    `self.max` is forwarded to `__setImgsPerFile` together with the number
    of element tags, and the value that helper returns is kept in
    `self.__remainder`.

    Returns:
        On success, a dict ``{'file_list': [Grain, ...]}`` whose grains are
        named ``svg1.svg``, ``svg2.svg``, ... in the order of `self.__list`.
        If the SVG cannot be parsed (ExpatError), an error message is
        printed and an empty list is returned instead; this historical
        return shape is kept so existing callers keep working.
    """
    max_files = self.max
    try:
        preparer = PrepareSVG()
        xml = preparer.removeUse(self.__svgfile.getvalue())
    except ExpatError:
        print("\nERROR: Could not parse the svg file!\n")
        return []

    doc = xml.documentElement
    # number of element tags (except block tags)
    num_tags = self.__countTags(xml)
    # stores the remainder
    self.__remainder = self.__setImgsPerFile(doc, max_files, num_tags)
    self.__svgTree = self.__createTreeBased(xml)
    self.__getAllDefinitions(xml)

    for child in doc.childNodes:
        self.__visitNode(self.__svgTree.documentElement, child)

    # if the last things aren't written
    if self.__lastAdded:
        self.__writeSvgTree()

    xml.unlink()
    self.__svgTree.unlink()

    # Number the grains by position. Looking the position up with
    # list.index() was quadratic and handed the same id to every grain
    # whose content equals an earlier one.
    grain_list = []
    for position, grain in enumerate(self.__list):
        grain_list.append(
            Grain(id='svg%s.svg' % (position + 1),
                  content=grain,
                  mimetype='image/svg+xml',
                  graintype='svg'))
    return {'file_list': grain_list}
def __getImageDocumentList(self):
    """
    Extract the images from a document and return a list of Grain instances.

    Every `draw:image` element of the parsed content is inspected. Only
    hrefs that point inside the package ("Pictures/..." or
    "ObjectReplacements/...") and whose name ends in .png, .gif or .jpg are
    turned into grains; each image name is emitted at most once. When the
    node following the image's parent exists, the text collected from it
    is stored as the grain caption.

    Returns a (possibly empty) list of image grains.
    """
    image_list = []
    # the image references are kept in the draw:image tags
    for item in self.__parseContent.getElementsByTagName('draw:image'):
        if not item.hasAttribute("xlink:href"):
            continue
        path = item.getAttribute('xlink:href')

        # Images referenced from a website are not stored in the package,
        # so there is nothing to read for them.
        if path.startswith(("http://", "https://")):
            continue

        name = None
        if "Pictures" in path:
            name = path.replace("Pictures/", "")
        elif "ObjectReplacements" in path:
            name = path.replace("./ObjectReplacements/", "")
            # removes the "./" of the path that could be
            # "./ObjectReplacements/Object 2"
            path = path.replace("./", "")
        if name is None:
            continue

        # checks the image extension
        extension = os.path.splitext(name)[1]
        if extension.lower() not in ('.png', '.gif', '.jpg'):
            continue

        # verifies if the image is already in the list
        if name in [image.getId() for image in image_list]:
            continue

        objGran = Grain(graintype='image')
        next_node = item.parentNode.nextSibling
        if next_node:
            text = []
            if next_node.nodeType == next_node.TEXT_NODE:
                text.append(next_node.data)
            fragments = self.__getTextChildNodesImage(next_node, text)
            caption = ''.join([t for t in fragments if t is not None])
            objGran.setCaption(caption)

        imagefile = StringIO(self.__zipFile.read(path))
        objGran.setId(name)
        objGran.setContent(imagefile)
        image_list.append(objGran)
    return image_list
def create_video_grains_list(self):
    """
    Wrap every file found in `self.temporaryPathGrain` in a Grain.

    The directory entries are processed in sorted (lexicographic) order.
    The i-th entry is loaded into an in-memory buffer named
    "video_grain<i>.ogv" and wrapped in a Grain of graintype 'nsifile'.

    Returns the list of grains (empty when the directory is empty).
    """
    returnList = []
    video_names = sorted(os.listdir(self.temporaryPathGrain))
    for i, video in enumerate(video_names):
        filename = "video_grain" + str(i) + ".ogv"
        video_path = os.path.join(self.temporaryPathGrain, video)
        # Video data is binary: read it as bytes and make sure the file
        # handle is released even if the read fails.
        with open(video_path, "rb") as video_file:
            content = StringIO(video_file.read())
        content.name = filename
        content.filename = filename
        obj = Grain(id=filename, content=content, graintype='nsifile')
        returnList.append(obj)
    return returnList
def __getImageDocumentList(self):
    """
    Retrieves images from a PDF document.

    Runs the external `pdfimages -j` tool on the document stored in
    `self.__pathFolder`, drops the images that `comparaImage` flags as
    repeated, converts .ppm/.pbm images to PNG (falling back to the raw
    file when the conversion fails) and wraps every remaining file in an
    image Grain.

    Returns a list of image grains; the list is empty when `os.system`
    reports status 256 for the `pdfimages` call.
    """
    pdf_path = os.path.join(self.__pathFolder, self.Document.getFilename())
    # NOTE(review): the command line is assembled by string concatenation;
    # a filename containing a double quote or shell metacharacters would
    # break it. Consider subprocess with an argument list.
    status = os.system('pdfimages -j "' + pdf_path + '" ' +
                       self.__pathFolder + '/imagegrain')
    if status == 256:
        # Presumably exit code 1 of pdfimages (status is the POSIX wait
        # status); the original code hinted at a PDF missing its
        # mandatory %EOF ending. TODO confirm.
        return []

    # Lists the content of the temporary folder where the files are in.
    images = os.listdir(self.__pathFolder)
    images.remove(self.Document.getFilename())

    # Uses an algorithm that discards identical images
    resultImgListDict = comparaImage(self.__pathFolder)
    # Remove the repeated images
    for imgDict in resultImgListDict:
        if imgDict.get('flag') is True:
            images.remove(imgDict.get('filename'))

    image_list = []
    for image in images:
        image_path = os.path.join(self.__pathFolder, image)
        base, ext = os.path.splitext(image)
        if ext.lower() in ['.ppm', '.pbm']:
            # convert the images .ppm or .pbm to files .png
            try:
                content = StringIO()
                PIL.Image.open(image_path).save(content, "PNG")
                image = base + ".png"
            except Exception:
                # Conversion failed: keep the original file untouched
                # (and its original name) as the grain content.
                with open(image_path, "rb") as fileImage:
                    content = StringIO(fileImage.read())
        else:
            # XXX-In the variable 'images' is coming a directory, it
            # generates the error when trying to open directory as file.
            try:
                fileImage = open(image_path, "rb")
            except IOError as error:
                print(error)
                continue
            with fileImage:
                content = StringIO(fileImage.read())
        image_list.append(
            Grain(id=image, content=content, graintype='image'))
    # The success path used to fall off the end and return None although
    # the failure path returns a list.
    return image_list
def extractRegion(self):
    """
    Build an SVG Grain from the rectangular region (x, y, w, h).

    The current document is loaded as an SVG image through `self.tool`, an
    empty SVG named "new_<filename>" is created next to it, and
    `selectGrainsInRegion` is asked to fill the new image from the region
    described by `self.x`, `self.y`, `self.w` and `self.h`.

    Returns a Grain of graintype 'svg' (mimetype image/svg+xml) whose
    content is the content file of the resulting image.
    """
    tool = self.tool

    # Source image, built from the document data.
    source_file = tool.makeNewSvgStringIO(self.document.getFilename(),
                                          self.document.getData())
    source_svg = tool.makeNewSvgImage(source_file)

    # Empty destination image that receives the selection.
    target_file = tool.makeNewSvgStringIO(
        "new_" + self.document.getFilename(), StringIO.StringIO(''))
    target_svg = tool.makeNewSvgImage(target_file)

    origin = Point(self.x, self.y)
    selection = tool.selectGrainsInRegion(Box(origin, self.w, self.h),
                                          source_svg, target_svg)
    return Grain(content=selection.getContentFile(),
                 mimetype="image/svg+xml",
                 graintype='svg')
pdfFile = os.path.join(self.__pathFolder, self.Document.getFilename()) outputXMLFolder = os.path.join(self.__pathFolder, "outputXMLFolder") try: converterObj = ExecuteConverter.ExecuteConverter() converterObj.extractTables(pdfFile, outputXMLFolder) tableListStr = converterObj.getTableList() except Exception, e: return tableList i = 0 for table in tableListStr: # generate table name i += 1 tableId = "Table" + str(i) + ".html" # finally, the Grain is created en added to the list grainObj = Grain(graintype='table') grainObj.setId(tableId) grainObj.setContent(StringIO(table)) grainObj.setMimetype("text/html") tableList.append(grainObj) return tableList ### Public Methods ### def getThumbnailsDocument(self): """ Extracts the metadata from pdf files using 'convert' tool """ os.system( 'evince-thumbnailer -s 128 "' +
pdfFile = os.path.join(self.__pathFolder,self.Document.getFilename()) outputXMLFolder = os.path.join(self.__pathFolder,"outputXMLFolder") try: converterObj = ExecuteConverter.ExecuteConverter() converterObj.extractTables(pdfFile, outputXMLFolder) tableListStr = converterObj.getTableList() except Exception, e: return tableList i = 0 for table in tableListStr: # generate table name i+=1 tableId = "Table" + str(i) + ".html" # finally, the Grain is created en added to the list grainObj = Grain(graintype='table') grainObj.setId(tableId) grainObj.setContent(StringIO(table)) grainObj.setMimetype("text/html") tableList.append(grainObj) return tableList ### Public Methods ### def getThumbnailsDocument(self): """ Extracts the metadata from pdf files using 'convert' tool """ os.system('evince-thumbnailer -s 128 "' + os.path.join(self.__pathFolder,self.Document.getFilename()) + '" ' + self.__pathFolder +'/thumbnail.png') file_content = StringIO(open(self.__pathFolder +'/thumbnail.png').read())
def __getTableDocumentList(self):
    """
    Extract the tables from a document and return a list of Grain instances.

    For every `table:table` element a new document is created from the
    empty template returned by `__createNewOOoDocument`: the table node is
    imported into its `office:text`, the styles whose names are returned by
    `__getAttributesR` are copied into `office:automatic-styles`, and the
    package-local images referenced by the table are copied from the
    source archive. The text of the nodes right before and right after the
    table is joined into the grain caption.

    Returns a (possibly empty) list of table grains. The grain id is only
    set when the table has a `table:name`.
    """
    table_list = []
    # create an empty template
    template_str = self.__createNewOOoDocument()
    tables = self.__parseContent.getElementsByTagName('table:table')
    stylesDoc = self.__parseContent.getElementsByTagName('style:style')

    for t in tables:
        styles = self.__getAttributesR(t)
        table_name = t.getAttribute('table:name')

        # Collect the hrefs of the images that live inside the package.
        imgHrefs = []
        for img in t.getElementsByTagName("draw:image"):
            if not img.hasAttribute("xlink:href"):
                continue
            path = img.getAttribute('xlink:href')
            # Images referenced from a website are not in the archive, so
            # trying to copy them would fail; skip them.
            if path.startswith(("http://", "https://")):
                continue
            if "ObjectReplacements" in path:
                # remove the "./" of the path that could be
                # "./ObjectReplacements/Object 2"
                path = path.replace("./", "")
            imgHrefs.append(path)

        # extract legend from the nodes surrounding the table
        objGran = Grain(graintype='table')
        leg = []
        for sibling in (t.previousSibling, t.nextSibling):
            if sibling is None:
                continue
            if sibling.hasChildNodes():
                fragments = self.__getTextChildNodesTable(sibling, text=[])
                # None fragments are ignored, as in the final join below.
                leg.append(''.join(
                    [part for part in fragments if part is not None]))
            else:
                leg.append(self.__getNodeText(sibling))
        # join the strings to make a single legend
        caption = ' '.join([entry for entry in leg if entry is not None])
        objGran.setCaption(caption)

        # Creating an empty file from the template
        new_table = StringIO()
        new_table.write(template_str)
        template_odt = zipfile.PyZipFile(new_table, 'a')
        doc = parseString(template_odt.read('content.xml'))
        template_odt.close()

        # copy the table node from a document to a new table grain
        office_text = doc.getElementsByTagName('office:text')[0]
        office_text.appendChild(doc.importNode(t, True))

        for sty in stylesDoc:
            if sty.getAttribute('style:name') in styles:
                # Looked up only when a style matches, so a template
                # without automatic styles is not an error by itself.
                automatic_styles = doc.getElementsByTagName(
                    'office:automatic-styles')[0]
                automatic_styles.appendChild(doc.importNode(sty, True))

        # Append the images and the updated content.xml in one session
        # instead of reopening the archive for every entry.
        template_odt = zipfile.PyZipFile(new_table, 'a')
        try:
            for image in imgHrefs:
                template_odt.writestr(str(image),
                                      self.__zipFile.read(image))
            template_odt.writestr('content.xml',
                                  doc.toxml().encode('utf-8'))
        finally:
            template_odt.close()

        if table_name:
            objGran.setId(table_name)
        objGran.setContent(new_table)
        objGran.setMimetype("application/vnd.oasis.opendocument.text")
        table_list.append(objGran)
    return table_list