コード例 #1
0
ファイル: hippo.py プロジェクト: 9b/bookworm
    def process_feed(self):
        """Parse each feed in ``self._feed_list`` and insert unseen articles.

        Every feed link is parsed with ``feedparser.parse``. Each item is
        turned into a frame holding its title, published date, link,
        keywords, feed URL and language, plus a SHA-256 hex digest of that
        frame stored under ``'hashed'``. The frame is appended to
        ``self._articles`` and, when
        ``self._mongodb_handle._not_processed(hashed)`` is truthy, passed to
        ``self._mongodb_handle._insert_full``.

        Items that raise while being processed are logged and skipped, so a
        broken item never reuses field values left over from an earlier item.

        Note: ``self._articles`` is reset for every feed, so after the call
        it holds the frames of the last feed only.
        """
        for f in self._feed_list:
            self._link = f
            self._fburned = feedparser.parse(self._link)

            # grab the details from the burned feed
            self._furl = self._fburned['url']
            self._fversion = self._fburned['version']
            self._flang = ""

            self._log.info(
                "Processing articles for %s: %s, %s" %
                (self._furl, str(self._fversion), self._flang.strip()))

            self._articles = []

            for i in self._fburned['items']:
                self._aframe = {
                    'title': None,
                    'date': None,
                    'link': None,
                    'keywords': [],
                    'feed': None,
                    'language': None
                }

                try:
                    self._ititle = i['title']
                    self._idate = i['published']
                    self._ilink = i['link']
                    self._ctext = stripper(
                        i['summary']).get_data()  # strip out HTML
                    self._sum = summarize(
                        self._ctext, self._kword_amt)  # summarize the article
                    self._kwords = self._sum.get_most_used_words()
                except Exception as e:
                    # Report through the logger instead of stdout.
                    self._log.error(str(e))
                    # Use .get(): the missing key may be 'title' itself, and
                    # the handler must not raise a second error.
                    self._log.error("Failed to process %s" % i.get('title'))
                    # Skip the item; otherwise the code below would store the
                    # previous item's fields under a new hash.
                    continue

                self._log.debug(
                    "Processed article: \ntitle:%s \ndate:%s \nlink:%s \nkeywords:%s"
                    % (self._ititle, self._idate, self._ilink, self._kwords))

                self._aframe['title'] = self._ititle
                self._aframe['date'] = self._idate
                self._aframe['link'] = self._ilink
                self._aframe['keywords'] = self._kwords
                self._aframe['feed'] = self._furl
                self._aframe['language'] = self._flang

                # Hash the contents to check in DB. sha256 needs bytes on
                # Python 3; on Python 2 the repr text is ASCII for string
                # values, so encoding should leave the digest unchanged.
                hashed = hashlib.sha256(
                    str(self._aframe).encode('utf-8')).hexdigest()
                self._aframe['hashed'] = hashed
                self._articles.append(self._aframe)

                if self._mongodb_handle._not_processed(hashed):
                    self._log.info("Adding %s (%s)" % (self._ititle, hashed))
                    self._mongodb_handle._insert_full(self._aframe)
コード例 #2
0
ファイル: hippo.py プロジェクト: 9b/bookworm
	def process_feed(self):
		"""Parse each feed in ``self._feed_list`` and insert unseen articles.

		Every feed link is parsed with ``feedparser.parse``. Each item is
		turned into a frame holding its title, published date, link,
		keywords, feed URL and language, plus a SHA-256 hex digest of that
		frame stored under ``'hashed'``. The frame is appended to
		``self._articles`` and, when
		``self._mongodb_handle._not_processed(hashed)`` is truthy, passed to
		``self._mongodb_handle._insert_full``.

		Items that raise while being processed are logged and skipped, so a
		broken item never reuses field values left over from an earlier item.

		Note: ``self._articles`` is reset for every feed, so after the call
		it holds the frames of the last feed only.
		"""
		for f in self._feed_list:
			self._link = f
			self._fburned = feedparser.parse(self._link)

			# grab the details from the burned feed
			self._furl = self._fburned['url']
			self._fversion = self._fburned['version']
			self._flang = ""

			self._log.info("Processing articles for %s: %s, %s" % (self._furl,str(self._fversion),self._flang.strip()) )

			self._articles = []

			for i in self._fburned['items']:
				self._aframe = {'title':None,'date':None,'link':None,'keywords':[],'feed':None,'language':None}

				try:
					self._ititle = i['title']
					self._idate = i['published']
					self._ilink = i['link']
					self._ctext = stripper(i['summary']).get_data() #strip out HTML
					self._sum = summarize(self._ctext,self._kword_amt) #summarize the article
					self._kwords = self._sum.get_most_used_words()
				except Exception as e:
					# Report through the logger instead of stdout.
					self._log.error(str(e))
					# Use .get(): the missing key may be 'title' itself, and
					# the handler must not raise a second error.
					self._log.error("Failed to process %s" % i.get('title'))
					# Skip the item; otherwise the code below would store the
					# previous item's fields under a new hash.
					continue

				self._log.debug("Processed article: \ntitle:%s \ndate:%s \nlink:%s \nkeywords:%s" % (self._ititle,self._idate,self._ilink,self._kwords))

				self._aframe['title'] = self._ititle
				self._aframe['date'] = self._idate
				self._aframe['link'] = self._ilink
				self._aframe['keywords'] = self._kwords
				self._aframe['feed'] = self._furl
				self._aframe['language'] = self._flang

				# Hash the contents to check in DB. sha256 needs bytes on
				# Python 3; on Python 2 the repr text is ASCII for string
				# values, so encoding should leave the digest unchanged.
				hashed = hashlib.sha256(str(self._aframe).encode('utf-8')).hexdigest()
				self._aframe['hashed'] = hashed
				self._articles.append(self._aframe)

				if self._mongodb_handle._not_processed(hashed):
					self._log.info("Adding %s (%s)" % (self._ititle,hashed))
					self._mongodb_handle._insert_full(self._aframe)
コード例 #3
0
ファイル: embedded_hippo.py プロジェクト: 9b/bookworm
	def process_feed(self):
		"""Build a per-feed frame of articles for every entry in ``self._feed_list``.

		Each entry is split on "," into a feed link and a language, so an
		entry without exactly one comma fails the two-name unpacking. The
		link is parsed with ``feedparser.parse`` and every item is reduced
		to a dict with title, date, link and keywords, collected in
		``self._articles``. The feed URL, version, language and article
		list are then stored in ``self._frame``.

		NOTE(review): nothing in the visible lines consumes ``self._frame``;
		the method may continue beyond this excerpt -- confirm against the
		full file.
		"""

		for f in self._feed_list:
			# Fresh frame per feed; filled in at the bottom of this loop body.
			self._frame = {'feed':None,'version':None,'language':None,'articles':[]}
			# Entry format is "<link>,<language>".
			self._link,self._lang = f.split(",")
			self._lang = self._lang.strip()
			self._fburned = feedparser.parse(self._link)

			# grab the details from the burned feed
			self._furl = self._fburned['url']
			self._fversion = self._fburned['version']
			self._flang = self._lang

			self._log.debug("Processing articles for %s: %s, %s" % (self._furl,str(self._fversion),self._flang.strip()) )

			# Reset per feed: holds only the current feed's article frames.
			self._articles = []

			for i in self._fburned['items']:
				self._aframe = {'title':None,'date':None,'link':None,'keywords':[]}
				
				try:
					self._ititle = i['title']
					#self._isummary = i['summary']
					self._idate = i['published']
					self._ilink = i['link']
					self._ctext = stripper(i['summary']).get_data() #strip out HTML
					self._sum = summarize(self._ctext,self._kword_amt) #summarize the article
					self._kwords = self._sum.get_most_used_words()
				except Exception,e:
					# NOTE(review): the handler only logs and execution falls
					# through, so the assignments below read whatever
					# self._ititle / _idate / _ilink / _kwords held from an
					# earlier item (or are unset on the very first item).
					# NOTE(review): i['title'] here presumably raises again if
					# the missing key was 'title' -- confirm.
					self._log.error("Failed to process %s" % i['title'])
			
				self._log.debug("Processed article: \ntitle:%s \ndate:%s \nlink:%s \nkeywords:%s" % (self._ititle,self._idate,self._ilink,self._kwords))
				
				# Copy the extracted fields into this item's frame and collect it.
				self._aframe['title'] = self._ititle
				self._aframe['date'] = self._idate
				self._aframe['link'] = self._ilink
				self._aframe['keywords'] = self._kwords
				self._articles.append(self._aframe)

			# Assemble the feed-level frame from the details gathered above.
			self._frame['feed'] = self._furl
			self._frame['version'] = self._fversion
			self._frame['language'] = self._flang
			self._frame['articles'] = self._articles