def _incident_size(inc):
    """ Return how many tweets are attached to inc, stray tweets included """
    return len(inc.tweets) + len(inc.stray_tweets)


def receive_tweet(incidents, search_queue, tweet):
    """
    Take a tweet and process it, possibly adding it to an incident or creating
    a new one out of it

    incidents    -- list of incidents. It is scanned from the end and the scan
                    stops at the first inactive incident, so it is expected to
                    be sorted by inactive time. A newly created incident is
                    appended to it.
    search_queue -- queue of incidents waiting for a backfill; must provide
                    put(), get() and empty().
    tweet        -- the incoming tweet. Retweets are ignored.

    Side effects: merges the tweet into the db session, updates the histogram
    and, when at least search_interval seconds have passed since the previous
    backfill, backfills one queued incident, commits, and updates the
    module-level last_search_time.
    """
    # only last_search_time is rebound here; search_interval is just read
    global last_search_time

    logger.info("Received %s" % tweet)

    # disregard retweets
    if tweet.retweet_of is not None:
        return

    tweet = db.merge(tweet)

    offered = False
    for inc in reversed(incidents):
        # stop when we reach the inactive incidents.
        # this way depends on the incidents list being sorted by inactive time
        # TODO: refresh the tracked users when a new incident has become
        # inactive since we last added a tweet (not implemented yet)
        if not inc.active():
            break

        # try to add the tweet to an incident. the loop keeps going after a
        # match, so more than one incident may accept the same tweet.
        if inc.offer_tweet(tweet):
            logger.info("Found %s for %s" % (inc, tweet))
            # TODO: check if it's okay to just discard this tweet because
            # incidents will find the tweet themselves when they backfill
            offered = True
            # NOTE(review): assumes offer_tweet added exactly one tweet to
            # the incident -- confirm
            newlen = _incident_size(inc)
            update_histo(newlen - 1, newlen)

    # make incidents for any tweets unrelated to current incidents
    if not offered and incident_tweet(tweet):
        inc = Incident(tweet)
        incidents.append(inc)
        update_histo(0, 1)
        search_queue.put(inc)
        logger.info("Created incident for %s" % tweet)

    # every search-interval seconds, backfill the oldest-updated incident.
    # skip when nothing is queued: a blocking get() on an empty queue would
    # hang here. last_search_time is left alone in that case so the backfill
    # happens as soon as an incident is queued.
    backfill_due = time() - last_search_time >= search_interval
    if backfill_due and not search_queue.empty():
        next_incident = search_queue.get()
        logger.info("Doing backfill on %s" % next_incident)

        oldlen = _incident_size(next_incident)
        next_incident.backfill_tweets()
        db.commit()
        update_histo(oldlen, _incident_size(next_incident))

        # active is a method and has to be called: the bare attribute is
        # always truthy, which would re-queue inactive incidents forever
        if next_incident.active():
            search_queue.put(next_incident)
        last_search_time = time()

    logger.info("%s incidents: %s" % (len(incidents), get_histo()))
def query_twitter(how_long=0, interval=5):
    """
    Interface function: repeatedly search twitter and store the results in
    the db

    how_long -- number of seconds to keep polling. A falsy value (the default,
                0) means poll indefinitely.
    interval -- seconds to sleep between searches. The default comes from the
                rate limit: 180 requests per 15 min = one every 5 sec.
    """
    reset_location_cache()

    start = time()

    # make sure we don't create duplicates.
    # keeping track of this ourselves saves many db hits
    last_tweet_id = 0

    # if we don't specify how_long, go indefinitely
    while not how_long or time() - start < how_long:
        tweets = search(search_terms, last_tweet_id)
        if not tweets:
            # if we dont get anything back, sleep and try again
            sleep(interval)
            continue

        # if a retrieved tweet has a loc/user with a matching ID already in the
        # db, that loc/user is updated instead of a new one added, bc of merge
        try:
            db.add_all([db.merge(tweet_to_Tweet(t)) for t in tweets])
            db.commit()
        except OperationalError:
            # best effort: log the failure and undo the partial transaction so
            # the session can be used again on the next pass. last_tweet_id is
            # not advanced, so the same tweets are fetched and retried.
            # NOTE(review): assumes db is a SQLAlchemy-style session that
            # needs rollback() after a failed commit -- confirm
            logger.exception("Could not store %s tweets, rolling back"
                             % len(tweets))
            db.rollback()
        else:
            # NOTE(review): assumes search() returns the newest tweet first
            # -- confirm
            last_tweet_id = tweets[0]['id_str']

        sleep(interval)