def __split_models(self, xmlDoc):
    """Generator that splits xmlDoc into one document per model.

    xmlDoc is parsed with XML(). Its direct "Models" child is detached and,
    for every child of that element, a pair ``(model_id, document)`` is
    yielded where:

    * ``model_id`` is the child's ``id`` attribute (KeyError if absent);
    * ``document`` is a deep copy of the root without the original "Models"
      element, plus a new "Models" element that carries the same attributes
      and contains only that one child. The child element itself is not
      copied.

    Nothing is yielded when there is no "Models" element or it is empty.
    """
    elem = XML(xmlDoc)
    models = elem.find("Models")
    # Compare with None explicitly: the truth value of an Element reflects
    # whether it has children, and testing it is deprecated.
    if models is None:
        return  # TODO return error
    elem.remove(models)
    # Iterate over a snapshot so that appending a child to another element
    # can never disturb the iteration over its original parent.
    for model in list(models):
        to_return = copy.deepcopy(elem)
        new_models = Element("Models")
        new_models.attrib.update(models.attrib)
        new_models.append(model)
        to_return.append(new_models)
        yield (model.attrib['id'], to_return)
def parsePodcast(podcastXML, config, filemode=None):
    """
    Parse a podcast feed and return all its information in a podcast object.

    podcastXML -- the feed document; it is parsed with XML()
    config     -- object whose podcastDownloadPath is the root directory under
                  which the podcast directory is located
    filemode   -- "title" to derive the file name from the item title; any
                  other value derives it from the last '/' segment of the
                  enclosure url (query string removed)

    Return None when the feed has no <channel> element or the channel has no
    <item>; otherwise return the populated PodcastInfo.

    NOTE(review): the original contract says "when error raise MyCancel";
    nothing in this function raises it directly, presumably the helpers do.
    """
    podcastInfo = PodcastInfo()
    podcastNode = XML(podcastXML)
    channelNode = podcastNode.find('channel')
    if channelNode is None:
        # No channel means no info available; report it the documented way
        # instead of failing with AttributeError on None below.
        return None

    ### TODO handle when title is empty, use the program title
    titlePodcast = channelNode.findtext('title', '')

    if not titlePodcast:
        titlePodcast = 'UNDEFINED'
        titlePodcastAscii = 'UNDEFINED'
    else:
        titlePodcastAscii = titlePodcast.encode('ascii', 'ignore')
        titlePodcastAscii = getCroppedFilename(titlePodcastAscii)
        titlePodcastAscii = cleanString(titlePodcastAscii)

    ## TODO support the podcast named the same way... add a hashcode after the title, and a main podcast.xml file at the root

    # the target local directory
    targetDirectory = os.path.join(config.podcastDownloadPath,
                                   titlePodcastAscii)

    podcastInfo.title = titlePodcast
    podcastInfo.description = channelNode.findtext('description', '')
    podcastInfo.titleAscii = titlePodcastAscii
    podcastInfo.targetDirectory = targetDirectory

    # Get the local info, and a link on the element node, to be modified later
    podcastLocalInfo = getPodcastLocalItems(podcastInfo)

    if channelNode.find('item') is None:
        # return empty mark
        return None

    # parse the item list
    for itemNode in channelNode.getiterator('item'):
        enclosureNode = itemNode.find('enclosure')
        if enclosureNode is None:
            continue

        # the url can be a redirect, urllib follows this link for download
        url = enclosureNode.get('url')
        if url is None:
            # when no url, continue
            continue

        # search in the local items if the file is already known, and reuse
        # that entry; when several local items share the url the last one wins
        foundLocalItem = None
        for podlocalitem in podcastLocalInfo.itemsInfo:
            if podlocalitem.url == url:
                foundLocalItem = podlocalitem
        if foundLocalItem is not None:
            podcastInfo.itemsInfo.append(foundLocalItem)
            continue

        podItem = PodcastItem()
        podItem.title = itemNode.findtext('title', '')
        podItem.description = itemNode.findtext('description', '')
        podItem.url = url
        # type is not always defined?!, can test with urllib when downloaded
        podItem.type = enclosureNode.get('type')
        # Length only used for information as list, exact size is found later during download
        podItem.length = enclosureNode.get('length')
        podcastInfo.itemsInfo.append(podItem)

    # TODO ?? return when ! config.podcastDownload

    # init the titles and the filename, filelocation
    for podItem in podcastInfo.itemsInfo:

        # the display title
        title = podItem.title

        # when is local, don't process filenames and add >> in the titles
        if podItem.isLocal:
            # control that the file exist and fully downloaded, when not try to download ...
            if not os.path.exists(podItem.fileLocation):
                podItem.isLocal = False
                podItem.flagfinish = False
            elif podItem.flagfinish:
                title = '>> ' + title
            else:
                title = '<> ' + title

        # when not local
        else:
            # TODO test that valid name, no special char, len > sufficient, and not always the same name,
            # TODO OR add something in the MEDIA xml file

            ## TODO when a podcast title is many time the same, use a hashcode after the title filename

            ### PROBLEM with the path length limit: when the file name is
            ### too long, cropped names may end up being equivalent

            if filemode == "title":
                filename = podItem.title
                # NOTE(review): this bare attribute access has no effect;
                # presumably an assignment was intended -- confirm.
                podcastInfo.useTitleForName
            else:
                filename = getLastStringPart(
                    podItem.url.encode('ascii', 'ignore'), '/')
                filename = getBeforeStringPart(filename, '?')

                ### TODO test if this filename already exist in the list..?? -> if YES MARK it using a poditem flag, and use title...

            filename = getCroppedFilename(filename)  # 42-4
            filename = cleanString(filename)

            # NOTE(review): hard-coded Windows separator, unlike the
            # os.path.join used for targetDirectory -- kept as is because
            # other code may rely on this exact form.
            fileLocation = targetDirectory + '\\' + filename

            # set the properties
            podItem.filename = filename
            podItem.fileLocation = fileLocation

            # When the file exist but no entry in the xml add it here
            if os.path.exists(podItem.fileLocation):
                title = '>? ' + title

        # process size
        size = 0
        if podItem.size != 0:
            size = podItem.size
        elif podItem.length:
            # The feed length is informational only, so a malformed value
            # must not abort the whole parse: just show no size.
            try:
                size = round(int(podItem.length) / 1000000.0, 1)
            except (TypeError, ValueError):
                size = 0
        # set size and title
        if size != 0:
            podItem.size = size
            title = title + ' (' + str(size) + 'Mo)'

        podcastInfo.itemFilenames.append(podItem.filename)
        podcastInfo.titles2display.append(title)

    # search in the folder if already downloaded file are available, and not referenced in the xml
    appendLocalMedia(podcastInfo, podcastLocalInfo)

    # return the podcast info with items
    return podcastInfo
def parsePodcast(podcastXML, config, filemode=None):
    """
    Parse a podcast feed and return all its information in a podcast object.

    podcastXML -- the feed document; it is parsed with XML()
    config     -- object whose podcastDownloadPath is the root directory under
                  which the podcast directory is located
    filemode   -- "title" to derive the file name from the item title; any
                  other value derives it from the last '/' segment of the
                  enclosure url (query string removed)

    Return None when the feed has no <channel> element or the channel has no
    <item>; otherwise return the populated PodcastInfo.

    NOTE(review): the original contract says "when error raise MyCancel";
    nothing in this function raises it directly, presumably the helpers do.
    """
    podcastInfo = PodcastInfo()
    podcastNode = XML(podcastXML)
    channelNode = podcastNode.find('channel')
    if channelNode is None:
        # No channel means no info available; report it the documented way
        # instead of failing with AttributeError on None below.
        return None

    ### TODO handle when title is empty, use the program title
    titlePodcast = channelNode.findtext('title', '')

    if not titlePodcast:
        titlePodcast = 'UNDEFINED'
        titlePodcastAscii = 'UNDEFINED'
    else:
        titlePodcastAscii = titlePodcast.encode('ascii', 'ignore')
        titlePodcastAscii = getCroppedFilename(titlePodcastAscii)
        titlePodcastAscii = cleanString(titlePodcastAscii)

    ## TODO support the podcast named the same way... add a hashcode after the title, and a main podcast.xml file at the root

    # the target local directory
    targetDirectory = os.path.join(config.podcastDownloadPath,
                                   titlePodcastAscii)

    podcastInfo.title = titlePodcast
    podcastInfo.description = channelNode.findtext('description', '')
    podcastInfo.titleAscii = titlePodcastAscii
    podcastInfo.targetDirectory = targetDirectory

    # Get the local info, and a link on the element node, to be modified later
    podcastLocalInfo = getPodcastLocalItems(podcastInfo)

    if channelNode.find('item') is None:
        # return empty mark
        return None

    # parse the item list
    for itemNode in channelNode.getiterator('item'):
        enclosureNode = itemNode.find('enclosure')
        if enclosureNode is None:
            continue

        # the url can be a redirect, urllib follows this link for download
        url = enclosureNode.get('url')
        if url is None:
            # when no url, continue
            continue

        # search in the local items if the file is already known, and reuse
        # that entry; when several local items share the url the last one wins
        foundLocalItem = None
        for podlocalitem in podcastLocalInfo.itemsInfo:
            if podlocalitem.url == url:
                foundLocalItem = podlocalitem
        if foundLocalItem is not None:
            podcastInfo.itemsInfo.append(foundLocalItem)
            continue

        podItem = PodcastItem()
        podItem.title = itemNode.findtext('title', '')
        podItem.description = itemNode.findtext('description', '')
        podItem.url = url
        # type is not always defined?!, can test with urllib when downloaded
        podItem.type = enclosureNode.get('type')
        # Length only used for information as list, exact size is found later during download
        podItem.length = enclosureNode.get('length')
        podcastInfo.itemsInfo.append(podItem)

    # TODO ?? return when ! config.podcastDownload

    # init the titles and the filename, filelocation
    for podItem in podcastInfo.itemsInfo:

        # the display title
        title = podItem.title

        # when is local, don't process filenames and add >> in the titles
        if podItem.isLocal:
            # control that the file exist and fully downloaded, when not try to download ...
            if not os.path.exists(podItem.fileLocation):
                podItem.isLocal = False
                podItem.flagfinish = False
            elif podItem.flagfinish:
                title = '>> ' + title
            else:
                title = '<> ' + title

        # when not local
        else:
            # TODO test that valid name, no special char, len > sufficient, and not always the same name,
            # TODO OR add something in the MEDIA xml file

            ## TODO when a podcast title is many time the same, use a hashcode after the title filename

            ### PROBLEM with the path length limit: when the file name is
            ### too long, cropped names may end up being equivalent

            if filemode == "title":
                filename = podItem.title
                # NOTE(review): this bare attribute access has no effect;
                # presumably an assignment was intended -- confirm.
                podcastInfo.useTitleForName
            else:
                filename = getLastStringPart(
                    podItem.url.encode('ascii', 'ignore'), '/')
                filename = getBeforeStringPart(filename, '?')

                ### TODO test if this filename already exist in the list..?? -> if YES MARK it using a poditem flag, and use title...

            filename = getCroppedFilename(filename)  # 42-4
            filename = cleanString(filename)

            # NOTE(review): hard-coded Windows separator, unlike the
            # os.path.join used for targetDirectory -- kept as is because
            # other code may rely on this exact form.
            fileLocation = targetDirectory + '\\' + filename

            # set the properties
            podItem.filename = filename
            podItem.fileLocation = fileLocation

            # When the file exist but no entry in the xml add it here
            if os.path.exists(podItem.fileLocation):
                title = '>? ' + title

        # process size
        size = 0
        if podItem.size != 0:
            size = podItem.size
        elif podItem.length:
            # The feed length is informational only, so a malformed value
            # must not abort the whole parse: just show no size.
            try:
                size = round(int(podItem.length) / 1000000.0, 1)
            except (TypeError, ValueError):
                size = 0
        # set size and title
        if size != 0:
            podItem.size = size
            title = title + ' (' + str(size) + 'Mo)'

        podcastInfo.itemFilenames.append(podItem.filename)
        podcastInfo.titles2display.append(title)

    # search in the folder if already downloaded file are available, and not referenced in the xml
    appendLocalMedia(podcastInfo, podcastLocalInfo)

    # return the podcast info with items
    return podcastInfo
# Example #4
# 0
class EpubDocument(object):
    """A class that parses and provides data about an ePub file.

    Attributes set by the constructor:

    book          -- the zipfile.ZipFile opened on the document (kept open
                     so getData() can read from it)
    container     -- parsed META-INF/container.xml
    roots         -- 'full-path' of every rootfile listed in the container
    basepath      -- directory of the first root file followed by "/", or ""
                     when that file sits at the archive root
    opf           -- parsed first root file
    manifest      -- the manifest element of the opf
    manifest_dict -- manifest item id -> basepath + href
    spine         -- the spine element of the opf
    itemrefs      -- the itemref elements of the spine
    spinerefs     -- path (relative to basepath) of each itemref, in order
    toc_id        -- value of the spine 'toc' attribute, or None
    toc_fn, toc, navmap -- TOC archive path, parsed TOC and its navMap
                     element; None when not available
    navpoints     -- every navPoint element below the navMap ([] if none)
    tocentries    -- one [label, path] pair per spine entry; the label is
                     the path itself when the TOC has no entry for it
    """

    _CONTAINER_NS = '{urn:oasis:names:tc:opendocument:xmlns:container}'
    _OPF_NS = '{http://www.idpf.org/2007/opf}'
    _NCX_NS = '{http://www.daisy.org/z3986/2005/ncx/}'

    def __init__(self, fname):
        """Open and index the ePub at fname.

        fname is handed to zipfile.ZipFile as is. Raises ValueError when
        the file is not a zip archive or its container lists no rootfile.
        """
        # This is done according to this:
        # http://stackoverflow.com/questions/1388467/reading-epub-format

        print(("Opening:", fname))
        try:
            self.book = zipfile.ZipFile(fname, "r")
        except zipfile.BadZipfile:
            raise ValueError("Invalid format")

        # ZipFile.read opens, reads and closes the member in one call, so no
        # member handle is ever left open.
        self.container = XML(self.book.read('META-INF/container.xml'))
        self.roots = [
            root.attrib['full-path']
            for root in self.container.findall(
                './/' + self._CONTAINER_NS + 'rootfile')
        ]
        if not self.roots:
            # Same error as for a file that is not a zip at all, instead of
            # an IndexError on the lookup below.
            raise ValueError("Invalid format")

        self.basepath = os.path.dirname(self.roots[0]) + "/"
        if self.basepath == '/':
            self.basepath = ""
        print(("BASEPATH:", self.basepath))

        self.opf = XML(self.book.read(self.roots[0]))
        self.manifest = self.opf.find(self._OPF_NS + 'manifest')
        self.manifest_dict = {
            item.attrib['id']: self.basepath + item.attrib['href']
            for item in self.manifest.findall(self._OPF_NS + 'item')
        }

        self.spine = self.opf.find(self._OPF_NS + 'spine')

        # Always define the TOC attributes, even when the spine names no TOC.
        self.tocentries = []
        self.toc_fn = None
        self.toc = None
        self.navmap = None
        self.navpoints = []
        self.toc_id = self.spine.attrib.get('toc')
        if self.toc_id:
            self._read_toc()

        self.itemrefs = self.spine.findall(self._OPF_NS + 'itemref')
        print(("IR:", self.itemrefs))
        self.spinerefs = [
            self.manifest_dict[item.attrib['idref']][len(self.basepath):]
            for item in self.itemrefs
        ]
        # I found one book that has a spine but no navmap:
        # "Der schwarze Baal" from manybooks.net
        # Also another has more entries on the spine than on the navmap
        # (Dinosauria, from feedbooks).
        # So, we need to merge these suckers. I will assume it's not completely
        # insane and the spine is always more complete.

        # First position of every spine path (what list.index would return).
        position = {}
        for idx, ref in enumerate(self.spinerefs):
            position.setdefault(ref, idx)

        merged = [[ref, ref] for ref in self.spinerefs]
        for entry in self.tocentries:
            idx = position.get(entry[1])
            if idx is None:
                # The TOC points at something that is not a spine path (for
                # instance a path with a '#fragment'); skip it rather than
                # fail to open the whole document.
                continue
            merged[idx] = entry
        self.tocentries = merged

        print((self.tocentries))
        print((self.spinerefs))

    def _read_toc(self):
        """Load the TOC named by toc_id and collect its [label, src] pairs."""
        ncx = self._NCX_NS
        self.toc_fn = self.manifest_dict[self.toc_id]
        print(("TOC:", self.toc_fn))
        self.toc = XML(self.book.read(self.toc_fn))
        self.navmap = self.toc.find(ncx + 'navMap')
        if self.navmap is None:
            return
        # FIXME: support nested navpoints
        self.navpoints = self.navmap.findall('.//' + ncx + 'navPoint')
        for navpoint in self.navpoints:
            # findtext/get yield None for a missing node or attribute, so an
            # incomplete navPoint is dropped by the check below instead of
            # raising AttributeError/KeyError.
            label = navpoint.findtext(ncx + 'navLabel/' + ncx + 'text')
            content_node = navpoint.find(ncx + 'content')
            content = None
            if content_node is not None:
                content = content_node.get('src')
            if label and content:
                self.tocentries.append([label, content])

    def getData(self, path):
        """Return the contents of a file in the document.

        path is relative to basepath. Returns the member's bytes, or an
        empty list when the archive has no such member.
        """
        path = "%s%s" % (self.basepath, path)
        try:
            return self.book.read(path)
        except KeyError:  # File missing in the zip
            return []