Ejemplo n.º 1
0
        dims -- tuple containing normal width and height of the image
        smalldims -- tuple containing small width and height of the image
        cachexml -- if not None the WordPressItem will be loaded from this cache node
        rssxml -- if not None the WordPressItem will be loaded from this RSS XML element
        
        """
        ImageItem.__init__(self, dims, smalldims)

        if (cachexml is not None):
            self._loadfromcache(cachexml, dir)
            self._setId()
            self._loadimage(dims, smalldims)
        elif (rssxml is not None):
            title = getTextNodeValue(rssxml, el_title)
            self._rawtitle = title
            self._rawtext = getCDataNodeValue(rssxml, el_rsscontent)
            self._setUid()
            self._setId()
            self._imagename = ''

        self._formattext()

        if (rssxml is not None):
            if (self._formattedtext is not None):
                imagesrc = self._formattedtext.getFirstImage()

                if (imagesrc is not None):
                    #TBD This is not good
                    imagesrc = imagesrc.split('?')[0]
                    self._imgsrcurl = imagesrc
                    filename = textToFilename(
Ejemplo n.º 2
0
    def _readRssChannel(self, url):
        """Extract books from an OPAC RSS channel until no more books need
        to be extracted

        The feed at ``url`` is opened and parsed, then its items are walked
        in document order. The walk stops at the first item whose id equals
        ``self.newestId`` or as soon as ``self._addandcheckfunc`` returns a
        true value. Items whose library differs from ``self._library`` are
        skipped. Unless the first item already equals ``self.newestId``,
        ``self.newestId`` is set to the first item's id after the walk.

        An IOError while opening the url, or a DOMException while parsing,
        is reported with print() and the method returns without raising.

        Argument
        url -- url to the RSS channel

        Raises
        Exception -- if the feed contains no channel element

        """
        # Get rss data
        try:
            rssobj = urllib.urlopen(url)
        except IOError:
            print('Error: Could not read url ' + url)
            return

        try:
            rssdoc = parse(rssobj)
        except xml.dom.DOMException:
            print('Error: Could not read RSS')
            return
        finally:
            rssobj.close()

        channels = rssdoc.getElementsByTagName(el_channel)

        # A node list length is never negative; an empty list must be
        # detected with a zero check, otherwise channels[0] below fails
        # with an IndexError instead of the intended message.
        if channels.length == 0:
            raise Exception('No channel found in rss feed')

        items = channels[0].getElementsByTagName(el_rssitem)

        if items.length == 0:
            return

        first_id = self._getId(items.item(0))

        if first_id == self.newestId:
            print(self.id + ': No more new items')
            return

        # Extract items from rss data
        for node in items:
            if node.nodeType != node.ELEMENT_NODE:
                continue

            # Check if the element is new
            item_id = self._getId(node)
            if item_id == self.newestId:
                print(self.id + ': No more new items')
                break

            # Check if the element comes from the correct library
            desc = getCDataNodeValue(node, el_rssdesc)
            library = self._getlibrary(desc)

            if library == self._library:
                link = getTextNodeValue(node, el_link)

                # Best effort: an item that cannot be built is reported and
                # skipped. Only Exception is caught so that KeyboardInterrupt
                # and SystemExit still propagate.
                try:
                    item = self._itemclass(self.itemarg, url=link.strip())
                except Exception:
                    print('VarbergOpacHarvester: Could not create OpacBookItem')
                    continue

                if self._addandcheckfunc(item):
                    print(self.id + ": I don't need to read more items\n")
                    break

        # Remember the head of the feed so the next run can stop there
        self.newestId = first_id
Ejemplo n.º 3
0
        dims -- tuple containing normal width and height of the image
        smalldims -- tuple containing small width and height of the image
        cachexml -- if not None the WordPressItem will be loaded from this cache node
        rssxml -- if not None the WordPressItem will be loaded from this RSS XML element
        
        """
        ImageItem.__init__(self, dims, smalldims)

        if(cachexml is not None):
            self._loadfromcache(cachexml, dir)
            self._setId()
            self._loadimage(dims, smalldims)
        elif(rssxml is not None):
            title = getTextNodeValue(rssxml, el_title)
            self._rawtitle = title
            self._rawtext = getCDataNodeValue(rssxml, el_rsscontent)
            self._setUid()
            self._setId()
            self._imagename = ''
            
        self._formattext()
        
        if(rssxml is not None):
            if(self._formattedtext is not None):
                imagesrc = self._formattedtext.getFirstImage()
                
                if(imagesrc is not None):
                    #TBD This is not good
                    imagesrc = imagesrc.split('?')[0]
                    self._imgsrcurl = imagesrc
                    filename = textToFilename(self.uid) + os.path.split(imagesrc)[1]
Ejemplo n.º 4
0
    def _readRssChannel(self, url):
        """Extract books from an OPAC RSS channel until no more books need
        to be extracted

        The feed at ``url`` is opened and parsed, then its items are walked
        in document order. The walk stops at the first item whose id equals
        ``self.newestId`` or as soon as ``self._addandcheckfunc`` returns a
        true value. Items whose library differs from ``self._library`` are
        skipped. Unless the first item already equals ``self.newestId``,
        ``self.newestId`` is set to the first item's id after the walk.

        An IOError while opening the url, or a DOMException while parsing,
        is reported with print() and the method returns without raising.

        Argument
        url -- url to the RSS channel

        Raises
        Exception -- if the feed contains no channel element

        """
        # Get rss data
        try:
            rssobj = urllib.urlopen(url)
        except IOError:
            print('Error: Could not read url ' + url)
            return

        try:
            rssdoc = parse(rssobj)
        except xml.dom.DOMException:
            print('Error: Could not read RSS')
            return
        finally:
            rssobj.close()

        channels = rssdoc.getElementsByTagName(el_channel)

        # A node list length is never negative; an empty list must be
        # detected with a zero check, otherwise channels[0] below fails
        # with an IndexError instead of the intended message.
        if channels.length == 0:
            raise Exception('No channel found in rss feed')

        items = channels[0].getElementsByTagName(el_rssitem)

        if items.length == 0:
            return

        first_id = self._getId(items.item(0))

        if first_id == self.newestId:
            print(self.id + ': No more new items')
            return

        # Extract items from rss data
        for node in items:
            if node.nodeType != node.ELEMENT_NODE:
                continue

            # Check if the element is new
            item_id = self._getId(node)
            if item_id == self.newestId:
                print(self.id + ': No more new items')
                break

            # Check if the element comes from the correct library
            desc = getCDataNodeValue(node, el_rssdesc)
            library = self._getlibrary(desc)

            if library == self._library:
                link = getTextNodeValue(node, el_link)

                # Best effort: an item that cannot be built is reported and
                # skipped. Only Exception is caught so that KeyboardInterrupt
                # and SystemExit still propagate.
                try:
                    item = self._itemclass(self.itemarg, url=link.strip())
                except Exception:
                    print('VarbergOpacHarvester: Could not create OpacBookItem')
                    continue

                if self._addandcheckfunc(item):
                    print(self.id + ": I don't need to read more items\n")
                    break

        # Remember the head of the feed so the next run can stop there
        self.newestId = first_id