コード例 #1
0
	def on_data(self,data):
		"""Process one raw stream message and store the coupon it advertises.

		The message is decoded as JSON, its 'text' field is cleaned, and the
		result goes through a series of filters. The first filter that fails
		ends processing for this message:

		1. near-duplicate of a recently seen tweet (similarity above 70),
		2. no resolvable URL in data['entities']['urls'][0]['expanded_url'],
		3. the shortened URL is rejected by Utilities.check_url_validity,
		4. Extraction.extract_all finds no coupon code in the tweet.

		Tweets that pass are appended to "x.txt" after step 3 and inserted
		into a DataStore after step 4.

		Side effects: updates self.recent_tweets, and (when an expiry date was
		extracted) self.tweets_with_dates, self.total_expiry_time and
		self.exp_time; appends to "x.txt"; prints progress to stdout.

		Args:
			data: JSON-encoded message string.

		Returns:
			True when the coupon was inserted into the data store, None when
			the message was filtered out or an error occurred while handling
			it.
		"""
		try:
			data = json.loads(data)

			# Get Tweet. A message without a 'text' field raises KeyError here
			# and is dropped by the handler at the bottom.
			tweet = Utilities.clean_tweet(data['text'])

			# Skip tweets that are too similar to one we saw recently.
			if any(Utilities.similarity(seen,tweet) > 70 for seen in self.recent_tweets):
				return

			# Remember this tweet, evicting the oldest entry once the window
			# has grown past 50 entries.
			# NOTE(review): popitem(last=False) implies recent_tweets is an
			# OrderedDict -- confirm where it is initialised.
			if len(self.recent_tweets) > 50:
				self.recent_tweets.popitem(last=False)
			self.recent_tweets[tweet] = True

			# Get Redirected url. A missing URL entry or any failure while
			# resolving it simply drops the tweet.
			try:
				expanded_url = str(data['entities']['urls'][0]['expanded_url'])
				url_name = Utilities.get_redirected_url(expanded_url)
			except Exception:
				return

			# Get shortened url for key --> Upto 5th '/' or entire address (whichever is shorter)
			url_name = Utilities.get_shortened_url(url_name).lower()

			# Get timestamp.
			# NOTE(review): the value is not used below; the lookup is kept so
			# that a payload without 'created_at' is still rejected (KeyError).
			timestamp = str(data['created_at'])

			# Verify authenticity of website by checking if it has the word coupon
			# If it does , assume it is not a vendor site. Maybe blog, maybe coupon site
			try:
				Utilities.check_url_validity(url_name)
			except Exception:
				return

			# Keep a plain-text trace of every tweet that passed the URL filters.
			with open("x.txt","a") as f:
				f.write(tweet + '\n')
				f.write("--------------------" + '\n')

			# Extract the coupon code and its expiry from the tweet text.
			e = Extraction()
			code,date = e.extract_all(tweet)
			if not code:
				return

			if not date:
				# No expiry found: fall back to a fixed default.
				# NOTE(review): presumably seconds (183600 s = 51 h), given the
				# division by 3600 below -- confirm against Extraction.
				date = 183600
			else:
				self.tweets_with_dates += 1
				self.total_expiry_time += date
				self.exp_time.append(date/3600)
				# Single-argument print(...) gives the same output as the
				# former "print a, b" statement on Python 2 and parses on 3.
				print("%s %s" % (self.tweets_with_dates, int(numpy.median(self.exp_time))))
				print(tweet)

			print('--------------------------------------')

			key = url_name + ':::' + code

			ds = DataStore()
			#get outer url - url uptil 3 '/'s . eg - http://www.etsy.com/
			outer_url = "parent::"+Utilities.get_shortened_url(url_name,3)
			ds.insert(key,url_name,code,tweet,date,outer_url)

			return True
		except Exception:
			# Best effort: any failure while handling a single message is
			# dropped after a short pause so the caller can carry on.
			# Exception (not BaseException) lets KeyboardInterrupt and
			# SystemExit propagate so the process can still be stopped.
			time.sleep(1)