##when data has been pushed --> write it to disk try: d = json.loads(data) creation = d['created_at'] id = d['id'] text = d['text'].encode("UTF-8") name = d['user']['name'].encode("UTF_8") screen_name = d['user']['screen_name'].encode("UTF-8") lang = d['user']['lang'] #print d['coordinates'] row = creation, id, name, screen_name, lang, text csvWriter.writerow(row) except: pass return True def on_error(self, status_code): print status_code return True def on_timeout(self): print "Timeout..." return True ##start the stream streamer = Stream(auth, tweetStreamer()) streamer.new_session() streamer.filter(track=["Syria"])
def stream(terms):
    """Open a new stream session and start tracking ``terms`` without blocking.

    Args:
        terms: list of keywords handed to ``Stream.filter(track=...)``.
            When empty or ``None`` the historical default ``["Syria"]`` is
            used. Previously the argument was unconditionally overwritten
            with that constant, so callers could never choose what to track.
    """
    if not terms:
        terms = ["Syria"]
    # NOTE(review): tweetStream() is built without a queue here, whereas
    # App.start() passes one -- confirm the listener accepts zero arguments.
    session = Stream(auth, tweetStream())
    session.new_session()
    # Non-blocking --> kind of a nightmare, but the only way to still process
    # the queue. The keyword is passed via dict unpacking because ``async``
    # became a reserved word in Python 3.7; on Python 2 this is identical to
    # writing ``async=True``.
    session.filter(track=terms, **{"async": True})
class App(object):
    """CherryPy controller for one Twitter stream session.

    The exposed handlers open/close a stream session, start filtering on a
    track term, hand out queued tweets (either as an endless generator or in
    small polled batches together with aggregate analytics), and report
    status counters. Handlers that report an outcome do so with a JSON
    envelope of the form ``{"success": bool, "data": ..., "message": str}``.
    """

    # Upper bound on the number of queued tweets returned by one pollStream().
    POLL_BATCH_SIZE = 6

    def __init__(self):
        self.stream = None            # Stream object, created by start()
        self.on = False               # True while a session is open
        self.streaming = False        # True once filter() has been started
        self.trackTerm = ""           # term passed to invisibleStream()
        self.initTime = None          # datetime at which start() succeeded
        # Kept as a timedelta so every consumer can call total_seconds().
        self.timeSinceInit = datetime.timedelta(0)
        self.totalCollectedTweets = 0  # numInQueue + numOutput
        self.numOutput = 0            # tweets already handed to clients
        self.numInQueue = 0           # queue size at the last refresh
        self.sessionQueue = None      # Queue filled by the tweetStream listener
        self.analytics = analytics()

    # ------------------------------------------------------------------
    # internal helpers (not exposed through CherryPy)
    # ------------------------------------------------------------------
    def _reply(self, success, message, data=None):
        """Serialize the standard ``success`` / ``data`` / ``message`` envelope."""
        return json.dumps({"success": success, "data": data, "message": message})

    def _queueSize(self):
        """Return the current queue size, or 0 when no session queue exists."""
        queue = self.sessionQueue
        return queue.qsize() if queue is not None else 0

    def _refreshCounts(self):
        """Recompute ``numInQueue`` and ``totalCollectedTweets`` from live state."""
        self.numInQueue = self._queueSize()
        self.totalCollectedTweets = self.numInQueue + self.numOutput

    def _updateAnalytics(self, rawTweet):
        """Feed one queued item into the analytics counters.

        Items that are not valid JSON or lack the expected fields are skipped
        so that one bad item cannot abort a whole poll after it has already
        been removed from the queue.
        NOTE(review): presumably the stream can also deliver non-tweet
        messages -- confirm against the tweetStream listener.
        """
        try:
            tweet = json.loads(rawTweet)
            hashtags = tweet['entities']['hashtags']
            source = tweet['source']
            timezone = tweet['user']['time_zone']
            language = tweet['lang']
        except (ValueError, KeyError, TypeError):
            return
        self.analytics.refreshHashtags(hashtags)
        self.analytics.refreshPlatforms(source)
        self.analytics.refreshTimezones(timezone)
        self.analytics.refreshLanguages(language)

    @staticmethod
    def _ranked(counts):
        """Return ``counts.items()`` sorted by value, largest first."""
        return sorted(counts.items(), key=lambda item: item[1], reverse=True)

    # ------------------------------------------------------------------
    # session control
    # ------------------------------------------------------------------
    @cherrypy.expose
    def start(self):
        """Create the session queue and stream object and mark the app as on.

        Returns a JSON envelope; ``success`` is False when initialization
        raised (it was previously reported as True in that case).
        """
        if self.on:
            return self._reply(True, "Stream already started.")
        try:
            self.sessionQueue = Queue.Queue()
            self.stream = Stream(auth, tweetStream(self.sessionQueue))
            self.stream.new_session()
            self.on = True
            self.initTime = datetime.datetime.now()
            self.timeSinceInit = datetime.timedelta(0)
            # user sessions
            if 'count' not in cherrypy.session:
                cherrypy.session['count'] = 0
            cherrypy.session['count'] += 1
            return self._reply(True, "New stream session initialized.")
        except Exception as e:
            self.on = False
            return self._reply(
                False, "New stream session not started because " + str(e))

    @cherrypy.expose
    def close(self):
        """Disconnect the stream, drop the queue and reset the analytics."""
        if not self.on:
            return self._reply(True, "Streaming already closed.")
        try:
            self.stream.disconnect()
            self.on = False
            self.streaming = False
            self.sessionQueue = None
            self.analytics = analytics()
            return self._reply(True, 'Stream session terminated.')
        except Exception as e:
            return self._reply(
                False, 'Stream session not terminated because ' + str(e))

    @cherrypy.expose
    def invisibleStream(self, toTrack, *args, **kwargs):
        """Start filtering the open session on ``toTrack`` in the background.

        Does nothing but acknowledge when filtering is already running.
        Fails cleanly when no session is open, instead of filtering on a
        missing or already closed stream object.
        """
        if not self.on:
            return self._reply(
                False,
                "Track stream failed because the stream session has not been started.")
        try:
            if not self.streaming:
                # ``async`` is passed via dict unpacking: identical on
                # Python 2, and still parses where ``async`` is a keyword.
                self.stream.filter(track=[toTrack], **{"async": True})
                self.streaming = True
                self.trackTerm = toTrack
                self.timeSinceInit = datetime.datetime.now() - self.initTime
            return self._reply(True, "Stream is streaming.")
        except Exception as e:
            return self._reply(False, "Track stream failed because " + str(e))

    # ------------------------------------------------------------------
    # data delivery
    # ------------------------------------------------------------------
    @cherrypy.expose
    def trueStream(self, *args, **kwards):
        """Stream the connection as a continuous sequence of queued tweets.

        Returns a generator that blocks on the session queue and yields each
        item as a string, counting it as output once the consumer asks for
        the next one. Returns a failure envelope when not streaming.

        NOTE(review): the original had a ``self.stream.disconnect()`` after
        the infinite loop; it was unreachable and has been removed, as has a
        try/except that could not observe errors raised lazily inside the
        generator.
        """
        if not self.streaming:
            return self._reply(
                False,
                "Track stream failed because the application is not streaming.")

        def getTweetsFromQueue():
            while True:
                yield str(self.sessionQueue.get())
                self.numOutput += 1

        return getTweetsFromQueue()

    @cherrypy.expose
    def pollStream(self):
        """Publish up to ``POLL_BATCH_SIZE`` queued tweets plus analytics as JSON.

        Each dequeued item is returned raw in ``data`` and fed into the
        analytics counters; the response also carries the ranked analytics
        tables and a ``status`` section. Returns a failure envelope when no
        session queue exists (before start() or after close()).

        NOTE(review): the original carried a commented-out step that resolved
        each tweet's ``user.location`` against the "Places" table and the
        Google Places API, building its SQL by string concatenation. If that
        step is re-enabled, use parameterized queries. The database
        connection below exists only for that step; consider removing it if
        geocoding stays disabled.
        """
        queue = self.sessionQueue
        if queue is None:
            return self._reply(
                False, "Poll failed because no stream session is active.")

        conn = psycopg2.connect(connectString)
        try:
            batch = []
            while len(batch) < self.POLL_BATCH_SIZE:
                # get_nowait() instead of qsize()+get(): another consumer may
                # empty the queue in between, which would block this request.
                try:
                    rawTweet = queue.get_nowait()
                except Queue.Empty:
                    break
                batch.append(rawTweet)  # puts the tweet into the raw output
                self._updateAnalytics(rawTweet)

            self.numOutput += len(batch)
            # Refresh after draining so the totals reflect the live queue
            # rather than the size recorded by an earlier status request.
            self._refreshCounts()
            elapsed = self.getTimeSinceInit()

            out = {
                "success": True,
                "data": batch,
                "timezones": self._ranked(self.analytics.timezones),
                "platforms": self._ranked(self.analytics.platforms),
                "languages": self._ranked(self.analytics.languages),
                'hashtags': self._ranked(self.analytics.hashtags),
                "locations": self._ranked(self.analytics.locations),
                "status": {
                    "timeSinceInit": str(elapsed),
                    "streaming": str(self.streaming),
                    "connected": str(self.on),
                    "streamStarted": str(self.initTime),
                    "inQueue": str(self.numInQueue),
                    "processed": str(self.numOutput),
                    "totalCollected": str(self.totalCollectedTweets),
                    'elapsedSeconds': elapsed.total_seconds(),
                    'avgTPS': self.getAvgTPS(),
                },
                "message": "Request fulfilled successfully. Data is still streaming.",
            }
            return json.dumps(out)
        finally:
            # Close the connection even when serialization or analytics fail.
            conn.close()

    def getAvgTPS(self):
        """Return collected tweets per elapsed second, rounded to 2 decimals.

        Returns 0.0 when no time has elapsed, avoiding a division by zero.
        """
        elapsedSeconds = self.getTimeSinceInit().total_seconds()
        if elapsedSeconds <= 0:
            return 0.0
        return round(self.totalCollectedTweets / elapsedSeconds, 2)

    @cherrypy.expose
    def dashboard(self):
        """Serve ``dashboard.html`` from the current working directory."""
        return open("dashboard.html")

    # ------------------------------------------------------------------
    # status reporting
    # ------------------------------------------------------------------
    @cherrypy.expose
    def returnStatus(self):
        """Return an HTML fragment summarizing server and stream state."""
        self._refreshCounts()
        lines = [
            "Server State: " + str(cherrypy.engine.state),
            "Streaming Connected: " + str(self.on),
            "Currently Streaming? " + str(self.streaming),
            "Tracking Term: " + str(self.trackTerm),
            "Start Time: " + str(self.initTime),
            "Time since initialization: " + str(self.getTimeSinceInit()),
            "Queue size: " + str(self.numInQueue),
            "Printed tweets: " + str(self.numOutput),
            'Total collected tweets: ' + str(self.totalCollectedTweets),
        ]
        return "".join(line + "<br />" for line in lines)

    def getTimeSinceInit(self):
        """Return the time elapsed since start() as a ``timedelta``.

        Returns a zero timedelta while no session is open. This used to be
        the int 0, which broke callers that invoke ``total_seconds()``.
        """
        if not self.on:
            return datetime.timedelta(0)
        self.timeSinceInit = datetime.datetime.now() - self.initTime
        print(self.timeSinceInit)
        return self.timeSinceInit

    @cherrypy.expose
    def getNumInQueue(self):
        """Return the current queue size as a string."""
        self._refreshCounts()
        return str(self.numInQueue)

    @cherrypy.expose
    def getNumPrinted(self):
        """Return the number of tweets already handed out, as a string."""
        return str(self.numOutput)

    @cherrypy.expose
    def getTotalCollected(self):
        """Return queued plus already-output tweets, as a string."""
        self._refreshCounts()
        return str(self.totalCollectedTweets)

    @cherrypy.expose
    def getSession(self, key=None):
        """Return one session value (or the whole session dict) as a string.

        Returns ``"None"`` when ``key`` is given but absent from the session.
        """
        if key is None:
            return str(cherrypy.session.__dict__)
        if key in cherrypy.session:
            return str(cherrypy.session[key])
        return str(None)
def streamFunction(track):
    """Open a new stream session and filter it on the single term ``track``.

    NOTE(review): filter() is called without the async flag, so the final
    message is presumably printed only once filtering returns -- confirm.
    """
    session = Stream(auth, tweetStream())
    session.new_session()
    terms = [track]
    session.filter(track=terms)
    print("Running thread.")