dims -- tuple containing normal width and height of the image smalldims -- tuple containing small width and height of the image cachexml -- if not None the WordPressItem will be loaded from this cache node rssxml -- if not None the WordPressItem will be loaded from this RSS XML element """ ImageItem.__init__(self, dims, smalldims) if (cachexml is not None): self._loadfromcache(cachexml, dir) self._setId() self._loadimage(dims, smalldims) elif (rssxml is not None): title = getTextNodeValue(rssxml, el_title) self._rawtitle = title self._rawtext = getCDataNodeValue(rssxml, el_rsscontent) self._setUid() self._setId() self._imagename = '' self._formattext() if (rssxml is not None): if (self._formattedtext is not None): imagesrc = self._formattedtext.getFirstImage() if (imagesrc is not None): #TBD This is not good imagesrc = imagesrc.split('?')[0] self._imgsrcurl = imagesrc filename = textToFilename(
def _readRssChannel(self, url):
    """Extract books from an OPAC RSS channel until no more are needed.

    The feed is fetched and parsed, then its items are walked in document
    order. Walking stops when an item whose id equals ``self.newestId`` is
    reached, or when ``self._addandcheckfunc`` returns a true value. Only
    items whose library (derived from the item description) equals
    ``self._library`` are turned into item objects.

    When the feed contains at least one item that differs from
    ``self.newestId``, ``self.newestId`` is updated to the id of the first
    item in the feed. Otherwise it is left untouched.

    Argument
    url -- url to the RSS channel

    Raises
    Exception -- if the feed contains no channel element
    """
    # Get rss data
    try:
        rssobj = urllib.urlopen(url)
    except IOError:
        print('Error: Could not read url ' + url)
        return
    # NOTE(review): if `parse` is xml.dom.minidom.parse, malformed XML
    # raises xml.parsers.expat.ExpatError, which is not caught here --
    # confirm against the file's imports.
    try:
        rssdoc = parse(rssobj)
    except xml.dom.DOMException:
        print('Error: Could not read RSS')
        return
    finally:
        rssobj.close()

    channels = rssdoc.getElementsByTagName(el_channel)
    # A node list can never have a negative length; test for "empty" so the
    # intended error is raised instead of an IndexError on channels[0].
    if channels.length < 1:
        raise Exception('No channel found in rss feed')

    items = channels[0].getElementsByTagName(el_rssitem)
    if items.length < 1:
        # Nothing to harvest; keep the previously remembered newest id.
        return

    first_id = self._getId(items.item(0))
    if first_id == self.newestId:
        # The first item is already known, so there is nothing new to read.
        print(self.id + ': No more new items')
        return

    # Extract items from rss data
    for node in items:
        if node.nodeType != node.ELEMENT_NODE:
            continue
        item_id = self._getId(node)
        # Check if the element is new
        if item_id == self.newestId:
            print(self.id + ': No more new items')
            break
        # Check if the element comes from the correct library
        desc = getCDataNodeValue(node, el_rssdesc)
        if self._getlibrary(desc) != self._library:
            continue
        link = getTextNodeValue(node, el_link)
        try:
            item = self._itemclass(self.itemarg, url=link.strip())
        except Exception:
            # Best effort: skip items that cannot be built, but let
            # KeyboardInterrupt / SystemExit propagate.
            print('VarbergOpacHarvester: Could not create OpacBookItem')
            continue
        if self._addandcheckfunc(item):
            print(self.id + ": I don't need to read more items\n")
            break

    # Remember the first item of this feed so a later read can stop there.
    self.newestId = first_id
dims -- tuple containing normal width and height of the image smalldims -- tuple containing small width and height of the image cachexml -- if not None the WordPressItem will be loaded from this cache node rssxml -- if not None the WordPressItem will be loaded from this RSS XML element """ ImageItem.__init__(self, dims, smalldims) if(cachexml is not None): self._loadfromcache(cachexml, dir) self._setId() self._loadimage(dims, smalldims) elif(rssxml is not None): title = getTextNodeValue(rssxml, el_title) self._rawtitle = title self._rawtext = getCDataNodeValue(rssxml, el_rsscontent) self._setUid() self._setId() self._imagename = '' self._formattext() if(rssxml is not None): if(self._formattedtext is not None): imagesrc = self._formattedtext.getFirstImage() if(imagesrc is not None): #TBD This is not good imagesrc = imagesrc.split('?')[0] self._imgsrcurl = imagesrc filename = textToFilename(self.uid) + os.path.split(imagesrc)[1]
def _readRssChannel(self, url):
    """Extract books from an OPAC RSS channel until no more are needed.

    The feed is fetched and parsed, then its items are walked in document
    order. Walking stops when an item whose id equals ``self.newestId`` is
    reached, or when ``self._addandcheckfunc`` returns a true value. Only
    items whose library (derived from the item description) equals
    ``self._library`` are turned into item objects.

    When the feed contains at least one item that differs from
    ``self.newestId``, ``self.newestId`` is updated to the id of the first
    item in the feed. Otherwise it is left untouched.

    Argument
    url -- url to the RSS channel

    Raises
    Exception -- if the feed contains no channel element
    """
    # Get rss data
    try:
        rssobj = urllib.urlopen(url)
    except IOError:
        print('Error: Could not read url ' + url)
        return
    # NOTE(review): if `parse` is xml.dom.minidom.parse, malformed XML
    # raises xml.parsers.expat.ExpatError, which is not caught here --
    # confirm against the file's imports.
    try:
        rssdoc = parse(rssobj)
    except xml.dom.DOMException:
        print('Error: Could not read RSS')
        return
    finally:
        rssobj.close()

    channels = rssdoc.getElementsByTagName(el_channel)
    # A node list can never have a negative length; test for "empty" so the
    # intended error is raised instead of an IndexError on channels[0].
    if channels.length < 1:
        raise Exception('No channel found in rss feed')

    items = channels[0].getElementsByTagName(el_rssitem)
    if items.length < 1:
        # Nothing to harvest; keep the previously remembered newest id.
        return

    first_id = self._getId(items.item(0))
    if first_id == self.newestId:
        # The first item is already known, so there is nothing new to read.
        print(self.id + ': No more new items')
        return

    # Extract items from rss data
    for node in items:
        if node.nodeType != node.ELEMENT_NODE:
            continue
        item_id = self._getId(node)
        # Check if the element is new
        if item_id == self.newestId:
            print(self.id + ': No more new items')
            break
        # Check if the element comes from the correct library
        desc = getCDataNodeValue(node, el_rssdesc)
        if self._getlibrary(desc) != self._library:
            continue
        link = getTextNodeValue(node, el_link)
        try:
            item = self._itemclass(self.itemarg, url=link.strip())
        except Exception:
            # Best effort: skip items that cannot be built, but let
            # KeyboardInterrupt / SystemExit propagate.
            print('VarbergOpacHarvester: Could not create OpacBookItem')
            continue
        if self._addandcheckfunc(item):
            print(self.id + ": I don't need to read more items\n")
            break

    # Remember the first item of this feed so a later read can stop there.
    self.newestId = first_id