def update(self, amount=0):
    """Look for new blog posts.

    The feed at self._url is downloaded and parsed.  Every entry is turned
    into an item and handed to self._addandcheckfunc, newest first, until
    an entry whose published value equals self.newestId is reached or
    self._addandcheckfunc returns a true value.  Afterwards self.newestId
    is set to the published value of the first entry in the feed.

    Argument
    amount -- if greater than 0, the feed is requested with
              max-results set to this value
    """
    if amount > 0:
        # amount is a number by default; convert it before concatenating.
        url = self._url + r'?redirect=false&max-results=' + str(amount)
    else:
        url = self._url

    try:
        blogobj = urllib.urlopen(url)
    except IOError:
        print('Error: could not read url ' + url)
        return

    try:
        rssdoc = parse(blogobj)
    except xml.dom.DOMException:
        print('Error: Could not read RSS')
        return
    finally:
        blogobj.close()

    nodes = rssdoc.getElementsByTagName(el_entry)
    if nodes.length == 0:
        # nodes.item(0) would be None; there is nothing to compare against.
        print('There are no items in feed ' + url)
        return

    newest = getTextNodeValue(nodes.item(0), el_published)
    if self.newestId == newest:
        print('There are no new items; newest item @ ' + self.newestId)
        return

    for node in nodes:
        # Everything from here on has been seen during an earlier update.
        if getTextNodeValue(node, el_published) == self.newestId:
            break
        try:
            bpd = BlogPostData(node)
            bpi = self._itemtype(self.itemarg, blogpostdata=bpd)
        except Exception as e:
            # Best effort: a broken entry must not stop the harvest.
            print('BlogspotHarvester: Could not create BlogspotItem; '
                  + str(e))
            continue
        if self._addandcheckfunc(bpi):
            print(self.id + ": I don't need to read more blog posts\n")
            break

    self.newestId = newest
def update(self):
    """Look for new blog posts.

    The RSS feed at self._url is downloaded and parsed.  Every item of the
    first channel is turned into a WordPressItem and handed to
    self._addandcheckfunc until an item whose publication date equals
    self.newestId is reached or self._addandcheckfunc returns a true
    value.  Afterwards self.newestId is set to the publication date of the
    first item in the channel.

    Raises Exception if the feed contains no channel element.
    """
    try:
        blogobj = urllib.urlopen(self._url)
    except IOError:
        print('Error: could not read url ' + self._url)
        return

    try:
        rssdoc = parse(blogobj)
    except xml.dom.DOMException:
        print('Error: Could not read RSS')
        return
    finally:
        blogobj.close()

    channels = rssdoc.getElementsByTagName(el_channel)
    if channels.length <= 0:
        raise Exception('No channel found in rss feed')

    items = channels[0].getElementsByTagName(el_rssitem)
    if items.length == 0:
        # items.item(0) would be None; there is nothing to compare against.
        print('There are no items in feed ' + self._url)
        return

    newest = getTextNodeValue(items.item(0), el_pubdate)
    if self.newestId == newest:
        print('There are no new items; newest item @ ' + self.newestId)
        return

    for node in items:
        # Everything from here on has been seen during an earlier update.
        if getTextNodeValue(node, el_pubdate) == self.newestId:
            break
        try:
            item = WordPressItem(self.itemarg, rssxml=node)
        except Exception as e:
            # Best effort: a broken item must not stop the harvest, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('WordpressHarvester: Could not create WordPressItem; '
                  + str(e))
            continue
        if self._addandcheckfunc(item):
            break

    self.newestId = newest
def update(self):
    """Look for new blog posts.

    The RSS feed at self._url is downloaded and parsed.  Every item of the
    first channel is turned into a WordPressItem and handed to
    self._addandcheckfunc until an item whose publication date equals
    self.newestId is reached or self._addandcheckfunc returns a true
    value.  Afterwards self.newestId is set to the publication date of the
    first item in the channel.

    Raises Exception if the feed contains no channel element.
    """
    try:
        blogobj = urllib.urlopen(self._url)
    except IOError:
        print('Error: could not read url ' + self._url)
        return

    try:
        rssdoc = parse(blogobj)
    except xml.dom.DOMException:
        print('Error: Could not read RSS')
        return
    finally:
        blogobj.close()

    channels = rssdoc.getElementsByTagName(el_channel)
    if channels.length <= 0:
        raise Exception('No channel found in rss feed')

    items = channels[0].getElementsByTagName(el_rssitem)
    if items.length == 0:
        # items.item(0) would be None; there is nothing to compare against.
        print('There are no items in feed ' + self._url)
        return

    newest = getTextNodeValue(items.item(0), el_pubdate)
    if self.newestId == newest:
        print('There are no new items; newest item @ ' + self.newestId)
        return

    for node in items:
        # Everything from here on has been seen during an earlier update.
        if getTextNodeValue(node, el_pubdate) == self.newestId:
            break
        try:
            item = WordPressItem(self.itemarg, rssxml=node)
        except Exception as e:
            # Best effort: a broken item must not stop the harvest, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('WordpressHarvester: Could not create WordPressItem; '
                  + str(e))
            continue
        if self._addandcheckfunc(item):
            break

    self.newestId = newest
def _tryGetBookInfo(self, isbn, library):
    """Try to fetch shelf and section information for a book.

    A search url is built from self._searchprefix, isbn and
    self._searchsuffix.  The links of the returned feed items are passed
    to harvestBookInfo one by one; the first one that succeeds is used to
    call self._selectShelf and to set self.section.

    Arguments
    isbn -- string inserted into the search url
    library -- passed on to harvestBookInfo

    Returns True if one of the search hits could be harvested, False if
    the feed could not be read or no hit could be harvested.
    """
    url = self._searchprefix + isbn + self._searchsuffix

    try:
        rssobj = urllib.urlopen(url)
    except IOError:
        print('Error: Could not read url ' + url)
        return False

    try:
        rssdoc = parse(rssobj)
    except xml.dom.DOMException:
        print('Error: Could not read RSS')
        return False
    finally:
        rssobj.close()

    # An empty result simply skips the loop and falls through to False.
    for node in rssdoc.getElementsByTagName(el_item):
        link = getTextNodeValue(node, el_link)
        try:
            data = harvestBookInfo(link, library)
        except Exception as e:
            # Not every exception carries a 'value' attribute; fall back to
            # the exception itself so the handler cannot fail in turn.
            print('BlogPostItem ' + self._rawtitle + ':\n '
                  + str(getattr(e, 'value', e)))
        else:
            self._selectShelf(data.shelves)
            self.section = data.section
            return True

    return False
def __init__(self, entrynode):
    """Create BlogPostData by extracting the fields of one feed entry.

    Sets self.id, self.title and self.content from the corresponding
    child elements and fills self.subjects with the term attribute of
    every category element.

    Argument
    entrynode -- XML node containing the data that will be extracted
    """
    self.subjects = []
    self.id = getTextNodeValue(entrynode, el_id)
    self.title = getTextNodeValue(entrynode, el_title)
    self.content = getTextNodeValue(entrynode, el_content)

    # One subject per category element, in document order.
    self.subjects.extend(
        category.attributes[attr_term].value
        for category in entrynode.getElementsByTagName(el_category)
    )
def _loadfromcache(self, xmlnode):
    """Restore the Item from its cache entry.

    The raw title and the uid are read from attributes of the node, the
    raw text from its child element.

    Argument
    xmlnode -- XML node describing this item
    """
    cached = xmlnode.attributes
    self._rawtitle = cached[attr_title].value
    self.uid = cached[attr_uid].value
    self._rawtext = getTextNodeValue(xmlnode, el_rawtext)
Arguments dir -- the cache directory dims -- tuple containing normal width and height of the image smalldims -- tuple containing small width and height of the image cachexml -- if not None the WordPressItem will be loaded from this cache node rssxml -- if not None the WordPressItem will be loaded from this RSS XML element """ ImageItem.__init__(self, dims, smalldims) if (cachexml is not None): self._loadfromcache(cachexml, dir) self._setId() self._loadimage(dims, smalldims) elif (rssxml is not None): title = getTextNodeValue(rssxml, el_title) self._rawtitle = title self._rawtext = getCDataNodeValue(rssxml, el_rsscontent) self._setUid() self._setId() self._imagename = '' self._formattext() if (rssxml is not None): if (self._formattedtext is not None): imagesrc = self._formattedtext.getFirstImage() if (imagesrc is not None): #TBD This is not good imagesrc = imagesrc.split('?')[0]
Arguments dir -- the cache directory dims -- tuple containing normal width and height of the image smalldims -- tuple containing small width and height of the image cachexml -- if not None the WordPressItem will be loaded from this cache node rssxml -- if not None the WordPressItem will be loaded from this RSS XML element """ ImageItem.__init__(self, dims, smalldims) if(cachexml is not None): self._loadfromcache(cachexml, dir) self._setId() self._loadimage(dims, smalldims) elif(rssxml is not None): title = getTextNodeValue(rssxml, el_title) self._rawtitle = title self._rawtext = getCDataNodeValue(rssxml, el_rsscontent) self._setUid() self._setId() self._imagename = '' self._formattext() if(rssxml is not None): if(self._formattedtext is not None): imagesrc = self._formattedtext.getFirstImage() if(imagesrc is not None): #TBD This is not good imagesrc = imagesrc.split('?')[0]
def _readRssChannel(self, url):
    """Extract books from an OPAC RSS channel until no more books need to
    be extracted

    Items of the first channel are processed in feed order.  Processing
    stops at the item whose id equals self.newestId or as soon as
    self._addandcheckfunc returns a true value.  Only items whose library
    (derived from the item description) equals self._library are turned
    into items.  If the feed contained anything new, self.newestId is set
    to the id of the first item in the channel.

    Argument
    url -- url to the RSS channel

    Raises Exception if the feed contains no channel element.
    """
    # Get rss data
    try:
        rssobj = urllib.urlopen(url)
    except IOError:
        print('Error: Could not read url ' + url)
        return

    try:
        rssdoc = parse(rssobj)
    except xml.dom.DOMException:
        print('Error: Could not read RSS')
        return
    finally:
        rssobj.close()

    channels = rssdoc.getElementsByTagName(el_channel)
    # A NodeList length is never negative, so the empty case must be
    # tested with <= to reach the intended error instead of an IndexError.
    if channels.length <= 0:
        raise Exception('No channel found in rss feed')

    items = channels[0].getElementsByTagName(el_rssitem)
    if items.length == 0:
        # Nothing to harvest; self.newestId stays as it is.
        return

    newestId = self._getId(items.item(0))
    if newestId == self.newestId:
        print(self.id + ': No more new items')
        return

    # Extract items from rss data
    for node in items:
        if node.nodeType != node.ELEMENT_NODE:
            continue

        # Check if the element is new
        if self._getId(node) == self.newestId:
            print(self.id + ': No more new items')
            break

        # Check if the element comes from the correct library
        desc = getCDataNodeValue(node, el_rssdesc)
        if self._getlibrary(desc) != self._library:
            continue

        link = getTextNodeValue(node, el_link)
        try:
            item = self._itemclass(self.itemarg, url=link.strip())
        except Exception as e:
            # Best effort: a broken item must not stop the harvest, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('VarbergOpacHarvester: Could not create OpacBookItem; '
                  + str(e))
            continue

        if self._addandcheckfunc(item):
            print(self.id + ": I don't need to read more items\n")
            break

    self.newestId = newestId
def _readRssChannel(self, url):
    """Extract books from an OPAC RSS channel until no more books need to
    be extracted

    Items of the first channel are processed in feed order.  Processing
    stops at the item whose id equals self.newestId or as soon as
    self._addandcheckfunc returns a true value.  Only items whose library
    (derived from the item description) equals self._library are turned
    into items.  If the feed contained anything new, self.newestId is set
    to the id of the first item in the channel.

    Argument
    url -- url to the RSS channel

    Raises Exception if the feed contains no channel element.
    """
    # Get rss data
    try:
        rssobj = urllib.urlopen(url)
    except IOError:
        print('Error: Could not read url ' + url)
        return

    try:
        rssdoc = parse(rssobj)
    except xml.dom.DOMException:
        print('Error: Could not read RSS')
        return
    finally:
        rssobj.close()

    channels = rssdoc.getElementsByTagName(el_channel)
    # A NodeList length is never negative, so the empty case must be
    # tested with <= to reach the intended error instead of an IndexError.
    if channels.length <= 0:
        raise Exception('No channel found in rss feed')

    items = channels[0].getElementsByTagName(el_rssitem)
    if items.length == 0:
        # Nothing to harvest; self.newestId stays as it is.
        return

    newestId = self._getId(items.item(0))
    if newestId == self.newestId:
        print(self.id + ': No more new items')
        return

    # Extract items from rss data
    for node in items:
        if node.nodeType != node.ELEMENT_NODE:
            continue

        # Check if the element is new
        if self._getId(node) == self.newestId:
            print(self.id + ': No more new items')
            break

        # Check if the element comes from the correct library
        desc = getCDataNodeValue(node, el_rssdesc)
        if self._getlibrary(desc) != self._library:
            continue

        link = getTextNodeValue(node, el_link)
        try:
            item = self._itemclass(self.itemarg, url=link.strip())
        except Exception as e:
            # Best effort: a broken item must not stop the harvest, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('VarbergOpacHarvester: Could not create OpacBookItem; '
                  + str(e))
            continue

        if self._addandcheckfunc(item):
            print(self.id + ": I don't need to read more items\n")
            break

    self.newestId = newestId