Example n. 1
    def spide(self):
        """
        The multi-threaded version of crawl.
        """
        future_calls = [
            Future(feedparser.parse, rss_url) for rss_url in self.channels
        ]
        # block until all feeds have been fetched
        feeds = [future_obj() for future_obj in future_calls]
        entries = []
        for feed in feeds:
            entries.extend(feed["items"])
        # sorted_entries = sorted(entries, key=lambda entry: entry["date_parsed"])
        # sorted_entries.reverse() # for most recent entries first
        # for item in sorted_entries:
        minutes = 0
        # saveFeed returns 1 for a newly stored entry and 0 for a duplicate,
        # so this counts how many entries were actually new.
        for item in entries:
            minutes += self.saveFeed(item)
        crawlogger.info("Newly fetched %d feeds out of %d entries" %
                        (minutes, len(entries)))
        if len(entries):
            # Adaptive interval: the higher the share of new entries, the shorter the nap.
            newly = float(minutes) / float(len(entries))
            self.minutes = 5 * (1 - newly)
        crawlogger.info("%s : Gonna take a %d minute nap" %
                        (datetime.now(), self.minutes))
        return self.minutes
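The Future class used in spide is not shown in these examples; from the way it is called, it must start feedparser.parse in a background thread immediately and block for the result when the future object itself is called. A minimal sketch of such a helper, assuming exactly that behaviour (this class is not part of the examples above):

    import threading

    class Future(object):
        """Run func(*args, **kwargs) in a background thread.

        Calling the instance blocks until the result is available.
        """
        def __init__(self, func, *args, **kwargs):
            self._result = None
            self._thread = threading.Thread(target=self._run,
                                            args=(func, args, kwargs))
            self._thread.start()

        def _run(self, func, args, kwargs):
            self._result = func(*args, **kwargs)

        def __call__(self):
            self._thread.join()
            return self._result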
Example n. 2
    def creep(self, spider):
        """
        Run the crawler in a loop, napping self.minutes between rounds.
        """
        # We use APScheduler for some crontab-style jobs.
        # We need a timer to schedule the spider for automatic crawling.
        # On the other hand, we also need to be able to trigger a crawl manually.
        self.minutes = 5
        while self.hush:
            self.minutes = self.spide()
            self.updateChannelList()
            time.sleep(self.minutes * 60)
            crawlogger.info("%d minutes are up, waking up to crawl ...\n" % (self.minutes))
Example n. 3
    def crawl(self):
        # Single-threaded version of spide: only fetches the first channel.
        feed = feedparser.parse(self.channels[0])
        items = feed["items"]
        crawlogger.info("items : %d" % (len(items)))
        # print items[0]
        minutes = 0
        for item0 in items:
            minutes += self.saveFeed(item0)
        if len(items):
            newly = float(minutes) / float(len(items))
            # self.minutes = 5 * (1 - newly)
        return self.minutes
Example n. 4
    def updateChannelList(self):
        d = {}
        a = []
        # a.extend(google_news_rss,smzdm_rss)
        pathname = self.channels_filename
        mtime = os.stat(pathname).st_mtime
        # Only re-read the channel file when it has changed on disk.
        if self.channels_mtime != mtime:
            self.channels_mtime = mtime
            with open(pathname) as f:
                for line in f:
                    line = line.strip()
                    if "==" not in line:
                        continue  # skip blank or malformed lines
                    (key, val) = line.split("==", 1)
                    print "channel lines :%s:%s" % (key, val)
                    d[key] = val
                    a.append(val.strip())  # drop surrounding whitespace so the URL is clean
            self.channels = a
            crawlogger.info("Fetched from FS %s" % (a))
Example n. 5
    def saveFeed(self, item0):
        """
        Persist one feed entry unless an entry with the same title and link
        already exists. Returns 1 if it was saved, 0 if it was a duplicate.
        """
        # print "SUMMARY: %s\n" %(item0["summary"])
        # print "HISTORY : %s" %(item0["wiki_history"])
        # print "DIFF : %s" %(item0["wiki_diff"])
        feedstore = Newsfeed()
        feedstore.guid = "xxx0jx0jx000"  # placeholder GUID; duplicates are detected by title+link below
        feedstore.title = item0["title"]
        feedstore.link = item0["link"]
        feedstore.descript = item0["description"]
        feedstore.editStatus = 0
        feedstore.editorId = 0
        # feedstore.source = ""
        # feedstore.rssid = ""
        # feedstore.editorMemo = ""
        # skip repeated feeds
        repeated = newsfeeds.find_one({
            "title": item0["title"],
            "link": item0["link"]
        })
        if repeated:
            crawlogger.debug("Duplicate feed, skipping: %s" % (item0["title"]))
            return 0
        else:
            _id = feedstore.save()
            crawlogger.info("Newly stored: %s\n" % (_id))
            crawlogger.info("TITLE: %s" % (item0["title"]))
            crawlogger.info("LINK : %s" % (item0["link"]))
            # print "DESCP: %s\n" %(item0["description"])
            # print "GUID: %s"  %(item0["link"])
            return 1
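The duplicate check in saveFeed is a read-then-write, so two crawls running at the same time could still store the same entry twice. If newsfeeds is a MongoDB collection accessed through pymongo (only find_one is shown, so this is an assumption), the same rule can also be enforced by the database with a unique compound index; a minimal one-time setup sketch:

    import pymongo

    # Make (title, link) unique so a second insert of the same entry fails
    # with DuplicateKeyError instead of silently creating a duplicate.
    newsfeeds.create_index(
        [("title", pymongo.ASCENDING), ("link", pymongo.ASCENDING)],
        unique=True,
    )

With the index in place, saveFeed could simply attempt the save and catch pymongo.errors.DuplicateKeyError rather than calling find_one first.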