def __split_models(self, xmlDoc):
    """Split an XML document into one document per model.

    Generator.  ``xmlDoc`` is parsed, its first ``Models`` child is
    detached, and for every child of that ``Models`` element a deep copy
    of the remaining document is produced that contains a fresh
    ``Models`` element (same attributes) holding only that one child.

    xmlDoc -- the XML document text to split

    Yields ``(model_id, tree)`` tuples, where ``model_id`` is the
    ``id`` attribute of the model and ``tree`` is the root element of the
    single-model document.  Nothing is yielded when the document has no
    ``Models`` element or the element has no children.

    Raises KeyError when a model has no ``id`` attribute.
    """
    elem = XML(xmlDoc)
    models = elem.find("Models")
    # Compare against None explicitly: the truth value of an Element
    # reflects its number of children and testing it is deprecated.
    if models is None:
        # TODO return error
        return

    # Detach the full model list so that each deep copy of the document
    # starts without any model in it.
    elem.remove(models)

    # Iterate over a snapshot: appending a child to another parent must
    # not be able to disturb the iteration over the original container.
    for model in list(models):
        to_return = copy.deepcopy(elem)
        new_models = Element("Models")
        new_models.attrib.update(models.attrib)
        new_models.append(model)
        to_return.append(new_models)
        yield (model.attrib['id'], to_return)
def parsePodcast(podcastXML, config, filemode=None):
    """Parse a podcast RSS document into a PodcastInfo object.

    podcastXML -- the RSS document text
    config     -- object providing ``podcastDownloadPath``, the root
                  directory under which the podcast directory lives
    filemode   -- when equal to ``"title"`` the local file name of an
                  item is derived from the item title, otherwise from the
                  last path segment of the enclosure URL (query string
                  removed)

    Returns the populated PodcastInfo, or None when the document has no
    ``channel`` element or the channel has no ``item``.

    Display titles are prefixed with ``'>> '`` (local, finished),
    ``'<> '`` (local, unfinished) or ``'>? '`` (a file exists on disk but
    the item is not known locally), and suffixed with the size in Mo when
    a size is known.
    """
    podcastNode = XML(podcastXML)
    channelNode = podcastNode.find('channel')
    if channelNode is None:
        # No channel means no usable information at all.
        return None

    podcastInfo = PodcastInfo()

    ### TODO handle when title is empty, use the program title
    titlePodcast = channelNode.findtext('title', '')
    if titlePodcast:
        titlePodcastAscii = titlePodcast.encode('ascii', 'ignore')
        titlePodcastAscii = getCroppedFilename(titlePodcastAscii)
        titlePodcastAscii = cleanString(titlePodcastAscii)
    else:
        titlePodcast = 'UNDEFINED'
        titlePodcastAscii = 'UNDEFINED'

    ## TODO support the podcast named the same way... add a hashcode after
    ## the title, and a main podcast.xml file at the root
    # the target local directory
    targetDirectory = os.path.join(config.podcastDownloadPath,
                                   titlePodcastAscii)

    podcastInfo.title = titlePodcast
    podcastInfo.description = channelNode.findtext('description', '')
    podcastInfo.titleAscii = titlePodcastAscii
    podcastInfo.targetDirectory = targetDirectory

    # Get the local info, and a link on the element node, to be modified
    # later
    podcastLocalInfo = getPodcastLocalItems(podcastInfo)

    if channelNode.find('item') is None:
        # return empty mark
        return None

    # First pass: build the item list, reusing the locally known item
    # whenever one has the same enclosure URL.
    for itemNode in channelNode.getiterator('item'):
        title = itemNode.findtext('title', '')
        descr = itemNode.findtext('description', '')

        enclosureNode = itemNode.find('enclosure')
        if enclosureNode is None:
            continue

        # the url can be a redirect, urllib follows this link for download
        url = enclosureNode.get('url')
        if url is None:
            continue

        # Search the local items for this URL; when several match, the
        # last one is used.
        foundLocalItem = None
        for podlocalitem in podcastLocalInfo.itemsInfo:
            if podlocalitem.url == url:
                foundLocalItem = podlocalitem
        if foundLocalItem is not None:
            podcastInfo.itemsInfo.append(foundLocalItem)
            continue

        podItem = PodcastItem()
        podItem.title = title
        podItem.description = descr
        podItem.url = url
        # type is not always defined, can be tested with urllib when
        # downloaded
        podItem.type = enclosureNode.get('type')
        # length is only informative, the exact size is found later
        # during download
        podItem.length = enclosureNode.get('length')
        podcastInfo.itemsInfo.append(podItem)

    # TODO ?? return when ! config.podcastDownload

    # Second pass: compute display titles, file names and file locations.
    for podItem in podcastInfo.itemsInfo:
        # the display title
        title = podItem.title

        if podItem.isLocal:
            # when local, don't process filenames; check that the file
            # exists and flag its download state in the title
            if not os.path.exists(podItem.fileLocation):
                podItem.isLocal = False
                podItem.flagfinish = False
            elif podItem.flagfinish:
                title = '>> ' + title
            else:
                title = '<> ' + title
        else:
            # TODO test that valid name, no special char, len sufficient,
            # and not always the same name,
            # TODO OR add something in the MEDIA xml file
            ## TODO when a podcast title is many times the same, use a
            ## hashcode after the title filename
            ### PROBLEM with the path length limit: when the file name is
            ### too long, names may collide
            if filemode == "title":
                filename = podItem.title
                # NOTE(review): bare attribute access with no effect,
                # kept as found -- possibly meant to be an assignment.
                podcastInfo.useTitleForName
            else:
                filename = getLastStringPart(
                    podItem.url.encode('ascii', 'ignore'), '/')
                filename = getBeforeStringPart(filename, '?')

            ### TODO test if this filename already exists in the list..??
            ### -> if YES mark it using a poditem flag, and use title...
            filename = getCroppedFilename(filename)  # 42-4
            filename = cleanString(filename)

            # set the properties; os.path.join keeps the separator
            # consistent with the one used for targetDirectory
            podItem.filename = filename
            podItem.fileLocation = os.path.join(targetDirectory, filename)

            # When the file exists but has no entry in the xml, flag it
            if os.path.exists(podItem.fileLocation):
                title = '>? ' + title

        # process size
        size = 0
        if podItem.size != 0:
            size = podItem.size
        elif podItem.length:
            try:
                size = round(int(podItem.length) / 1000000.0, 1)
            except ValueError:
                # the length attribute is not a number: size stays unknown
                size = 0

        # set size and title
        if size != 0:
            podItem.size = size
            title = title + ' (' + str(size) + 'Mo)'

        podcastInfo.itemFilenames.append(podItem.filename)
        podcastInfo.titles2display.append(title)

    # search in the folder if already downloaded files are available and
    # not referenced in the xml
    appendLocalMedia(podcastInfo, podcastLocalInfo)

    # return the podcast info with items
    return podcastInfo
def parsePodcast(podcastXML, config, filemode=None):
    """Parse a podcast RSS document into a PodcastInfo object.

    podcastXML -- the RSS document text
    config     -- object providing ``podcastDownloadPath``, the root
                  directory under which the podcast directory lives
    filemode   -- when equal to ``"title"`` the local file name of an
                  item is derived from the item title, otherwise from the
                  last path segment of the enclosure URL (query string
                  removed)

    Returns the populated PodcastInfo, or None when the document has no
    ``channel`` element or the channel has no ``item``.

    Display titles are prefixed with ``'>> '`` (local, finished),
    ``'<> '`` (local, unfinished) or ``'>? '`` (a file exists on disk but
    the item is not known locally), and suffixed with the size in Mo when
    a size is known.
    """
    podcastNode = XML(podcastXML)
    channelNode = podcastNode.find('channel')
    if channelNode is None:
        # No channel means no usable information at all.
        return None

    podcastInfo = PodcastInfo()

    ### TODO handle when title is empty, use the program title
    titlePodcast = channelNode.findtext('title', '')
    if titlePodcast:
        titlePodcastAscii = titlePodcast.encode('ascii', 'ignore')
        titlePodcastAscii = getCroppedFilename(titlePodcastAscii)
        titlePodcastAscii = cleanString(titlePodcastAscii)
    else:
        titlePodcast = 'UNDEFINED'
        titlePodcastAscii = 'UNDEFINED'

    ## TODO support the podcast named the same way... add a hashcode after
    ## the title, and a main podcast.xml file at the root
    # the target local directory
    targetDirectory = os.path.join(config.podcastDownloadPath,
                                   titlePodcastAscii)

    podcastInfo.title = titlePodcast
    podcastInfo.description = channelNode.findtext('description', '')
    podcastInfo.titleAscii = titlePodcastAscii
    podcastInfo.targetDirectory = targetDirectory

    # Get the local info, and a link on the element node, to be modified
    # later
    podcastLocalInfo = getPodcastLocalItems(podcastInfo)

    if channelNode.find('item') is None:
        # return empty mark
        return None

    # First pass: build the item list, reusing the locally known item
    # whenever one has the same enclosure URL.
    for itemNode in channelNode.getiterator('item'):
        title = itemNode.findtext('title', '')
        descr = itemNode.findtext('description', '')

        enclosureNode = itemNode.find('enclosure')
        if enclosureNode is None:
            continue

        # the url can be a redirect, urllib follows this link for download
        url = enclosureNode.get('url')
        if url is None:
            continue

        # Search the local items for this URL; when several match, the
        # last one is used.
        foundLocalItem = None
        for podlocalitem in podcastLocalInfo.itemsInfo:
            if podlocalitem.url == url:
                foundLocalItem = podlocalitem
        if foundLocalItem is not None:
            podcastInfo.itemsInfo.append(foundLocalItem)
            continue

        podItem = PodcastItem()
        podItem.title = title
        podItem.description = descr
        podItem.url = url
        # type is not always defined, can be tested with urllib when
        # downloaded
        podItem.type = enclosureNode.get('type')
        # length is only informative, the exact size is found later
        # during download
        podItem.length = enclosureNode.get('length')
        podcastInfo.itemsInfo.append(podItem)

    # TODO ?? return when ! config.podcastDownload

    # Second pass: compute display titles, file names and file locations.
    for podItem in podcastInfo.itemsInfo:
        # the display title
        title = podItem.title

        if podItem.isLocal:
            # when local, don't process filenames; check that the file
            # exists and flag its download state in the title
            if not os.path.exists(podItem.fileLocation):
                podItem.isLocal = False
                podItem.flagfinish = False
            elif podItem.flagfinish:
                title = '>> ' + title
            else:
                title = '<> ' + title
        else:
            # TODO test that valid name, no special char, len sufficient,
            # and not always the same name,
            # TODO OR add something in the MEDIA xml file
            ## TODO when a podcast title is many times the same, use a
            ## hashcode after the title filename
            ### PROBLEM with the path length limit: when the file name is
            ### too long, names may collide
            if filemode == "title":
                filename = podItem.title
                # NOTE(review): bare attribute access with no effect,
                # kept as found -- possibly meant to be an assignment.
                podcastInfo.useTitleForName
            else:
                filename = getLastStringPart(
                    podItem.url.encode('ascii', 'ignore'), '/')
                filename = getBeforeStringPart(filename, '?')

            ### TODO test if this filename already exists in the list..??
            ### -> if YES mark it using a poditem flag, and use title...
            filename = getCroppedFilename(filename)  # 42-4
            filename = cleanString(filename)

            # set the properties; os.path.join keeps the separator
            # consistent with the one used for targetDirectory
            podItem.filename = filename
            podItem.fileLocation = os.path.join(targetDirectory, filename)

            # When the file exists but has no entry in the xml, flag it
            if os.path.exists(podItem.fileLocation):
                title = '>? ' + title

        # process size
        size = 0
        if podItem.size != 0:
            size = podItem.size
        elif podItem.length:
            try:
                size = round(int(podItem.length) / 1000000.0, 1)
            except ValueError:
                # the length attribute is not a number: size stays unknown
                size = 0

        # set size and title
        if size != 0:
            podItem.size = size
            title = title + ' (' + str(size) + 'Mo)'

        podcastInfo.itemFilenames.append(podItem.filename)
        podcastInfo.titles2display.append(title)

    # search in the folder if already downloaded files are available and
    # not referenced in the xml
    appendLocalMedia(podcastInfo, podcastLocalInfo)

    # return the podcast info with items
    return podcastInfo
class EpubDocument(object):
    """Parse an ePub file and provide data about its contents.

    After construction the instance exposes, among others:

    book          -- the open ``zipfile.ZipFile``
    basepath      -- directory of the first root file inside the archive,
                     with a trailing ``/`` (empty string for the archive
                     root)
    manifest_dict -- manifest item id -> archive path
    spinerefs     -- spine documents in reading order, relative to
                     ``basepath``
    tocentries    -- one ``[label, path]`` pair per spine document; the
                     label comes from the NCX table of contents when it
                     has an entry for exactly that path, otherwise the
                     path is used as the label
    """

    _CONTAINER_NS = '{urn:oasis:names:tc:opendocument:xmlns:container}'
    _OPF_NS = '{http://www.idpf.org/2007/opf}'
    _NCX_NS = '{http://www.daisy.org/z3986/2005/ncx/}'

    def __init__(self, fname):
        """Open the ePub at ``fname`` and index its manifest, spine and TOC.

        fname -- path of the ePub file (anything ``zipfile.ZipFile``
                 accepts)

        Raises ValueError("Invalid format") when ``fname`` is not a valid
        zip archive or its container declares no root file.
        """
        # This is done according to this:
        # http://stackoverflow.com/questions/1388467/reading-epub-format
        print(("Opening:", fname))
        try:
            self.book = zipfile.ZipFile(fname, "r")
        except zipfile.BadZipfile:
            raise ValueError("Invalid format")

        # ZipFile.read opens, reads and closes the member in one call, so
        # no member handle is left open even when parsing fails.
        self.container = XML(self.book.read('META-INF/container.xml'))
        roots = self.container.findall(
            './/' + self._CONTAINER_NS + 'rootfile')
        self.roots = [root.attrib['full-path'] for root in roots]
        if not self.roots:
            # Without a root file there is no package document to read.
            raise ValueError("Invalid format")

        data = self.book.read(self.roots[0])
        self.basepath = os.path.dirname(self.roots[0]) + "/"
        if self.basepath == '/':
            self.basepath = ""
        print(("BASEPATH:", self.basepath))
        self.opf = XML(data)

        self.manifest = self.opf.find(self._OPF_NS + 'manifest')
        self.manifest_dict = {}
        for elem in self.manifest.findall(self._OPF_NS + 'item'):
            self.manifest_dict[elem.attrib['id']] = (
                self.basepath + elem.attrib['href'])

        self.spine = self.opf.find(self._OPF_NS + 'spine')

        # TOC attributes always exist, even when the book has no usable
        # table of contents.
        self.tocentries = []
        self.toc_fn = None
        self.toc = None
        self.navmap = None
        self.navpoints = []
        self.toc_id = self.spine.attrib.get('toc', None)
        if self.toc_id:
            # A toc id that the manifest does not list is treated like a
            # missing table of contents.
            self.toc_fn = self.manifest_dict.get(self.toc_id)
        if self.toc_fn:
            self._read_toc()

        self.itemrefs = self.spine.findall(self._OPF_NS + 'itemref')
        print(("IR:", self.itemrefs))
        self.spinerefs = [
            self.manifest_dict[item.attrib['idref']][len(self.basepath):]
            for item in self.itemrefs]

        self._merge_toc_into_spine()

        print((self.tocentries))
        print((self.spinerefs))

    def _read_toc(self):
        """Parse the NCX file ``self.toc_fn`` and fill ``self.tocentries``."""
        print(("TOC:", self.toc_fn))
        self.toc = XML(self.book.read(self.toc_fn))
        self.navmap = self.toc.find(self._NCX_NS + 'navMap')
        if self.navmap is None:
            return

        # FIXME: support nested navpoints (they are currently flattened)
        self.navpoints = self.navmap.findall(
            './/' + self._NCX_NS + 'navPoint')
        for navpoint in self.navpoints:
            label = None
            label_node = navpoint.find(self._NCX_NS + 'navLabel')
            if label_node is not None:
                label = label_node.findtext(self._NCX_NS + 'text')

            content = None
            content_node = navpoint.find(self._NCX_NS + 'content')
            if content_node is not None:
                content = content_node.get('src')

            # Incomplete navpoints are ignored.
            if label and content:
                self.tocentries.append([label, content])

    def _merge_toc_into_spine(self):
        """Replace ``self.tocentries`` by one entry per spine document."""
        # I found one book that has a spine but no navmap:
        # "Der schwarze Baal" from manybooks.net
        # Also another has more entries on the spine than on the navmap
        # (Dinosauria, from feedbooks).
        # So the two are merged, assuming the spine is always the more
        # complete one.
        merged = [[ref, ref] for ref in self.spinerefs]

        # Position of the first occurrence of every spine document.
        first_index = {}
        for idx, ref in enumerate(self.spinerefs):
            first_index.setdefault(ref, idx)

        for entry in self.tocentries:
            idx = first_index.get(entry[1])
            if idx is None:
                # The TOC points at something that is not a spine
                # document (for instance a path with a #fragment): keep
                # the spine entry instead of failing.
                continue
            merged[idx] = entry
        self.tocentries = merged

    def getData(self, path):
        """Return the contents of a file in the document.

        path -- path of the file relative to ``self.basepath``

        Returns the raw content read from the archive, or an empty list
        when the archive has no such file.
        """
        path = "%s%s" % (self.basepath, path)
        try:
            return self.book.read(path)
        except KeyError:
            # File missing in the zip
            return []