def test_stream():
    """Smoke-test the streaming API: the first 50 tweets for the query
    "obama" must each carry an ``id_str`` and ``text`` field."""
    client = Twarc()
    seen = 0
    for tweet in client.stream("obama"):
        assert tweet['id_str']
        assert tweet['text']
        seen += 1
        # Stop after a fixed sample; the stream itself is unbounded.
        if seen == 50:
            break
    # Guards against the stream terminating before 50 tweets arrived.
    assert seen == 50
class TwitterStreamKafka(object):  # WORKING TWITTER HOSE
    """Forwards tweets matching a set of search terms from the Twitter
    streaming API into the "betweezered" Kafka topic.

    Assumes module-level ``producer`` (Kafka producer) and ``localConfig``
    (Twitter credentials) are defined elsewhere in the file.
    """

    def __init__(self, search_terms):
        """
        :param search_terms: iterable of track terms to filter the stream on.
        """
        logging.info("initializing TwitterStream Kafka")
        # globals to all instances
        self.t = Twarc(localConfig.client_key,
                       localConfig.client_secret,
                       localConfig.access_token,
                       localConfig.access_token_secret)
        self.search_terms = search_terms

    # method to capture twitter stream
    def captureStream(self):
        """Consume the filter stream and publish each tweet, JSON-encoded,
        to Kafka. Blocks until the stream ends or raises."""
        # Twitter's filter endpoint takes a comma-separated track string.
        for tweet in self.t.stream(",".join(self.search_terms)):
            # FIX: the return value was previously bound to an unused
            # local (`result`); drop the dead assignment.
            producer.send_messages("betweezered", json.dumps(tweet))
class TwitterHarvester(BaseHarvester):
    """Harvester that dispatches ``twitter_search`` / ``twitter_filter``
    messages to a twarc client and accumulates tweet counts plus expanded
    URL / media URL lists into ``self.harvest_result``."""

    def __init__(self, process_interval_secs=1200, mq_config=None, debug=False):
        BaseHarvester.__init__(self, mq_config=mq_config,
                               process_interval_secs=process_interval_secs,
                               debug=debug)
        # Created lazily per harvest from the message's credentials.
        self.twarc = None

    def harvest_seeds(self):
        """Entry point: build a twarc client, then dispatch on message type.

        Raises:
            KeyError: if the message's "type" is not a known harvest type.
        """
        # Create a twarc
        self._create_twarc()
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        else:
            # FIX: was a bare `raise KeyError` with no context; include the
            # offending type so failures are diagnosable from the log.
            raise KeyError("Unknown harvest type: {}".format(harvest_type))

    def _create_twarc(self):
        """Instantiate the twarc client from credentials in the message."""
        credentials = self.message["credentials"]
        self.twarc = Twarc(credentials["consumer_key"],
                           credentials["consumer_secret"],
                           credentials["access_token"],
                           credentials["access_token_secret"])

    def search(self):
        """Run a search per seed token. In incremental mode, only fetch
        tweets newer than the stored since_id and persist the new
        high-water mark afterwards."""
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            query = seed.get("token")
            # Get since_id from state_store
            since_id = self.state_store.get_state(
                __name__, "{}.since_id".format(query)) if incremental else None
            max_tweet_id = self._process_tweets(
                self.twarc.search(query, since_id=since_id))
            log.debug("Searching on %s since %s returned %s tweets.",
                      query, since_id,
                      self.harvest_result.summary.get("tweet"))
            # Update state store
            if incremental and max_tweet_id:
                self.state_store.set_state(
                    __name__, "{}.since_id".format(query), max_tweet_id)

    def filter(self):
        """Consume the streaming filter API for the single seed's track term.

        Exactly one seed is expected; asserted as a precondition.
        """
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"]
        self._process_tweets(self.twarc.stream(track))

    def _process_tweets(self, tweets):
        """Record summary counts and entity URLs for each tweet.

        :param tweets: iterable of tweet dicts (search results or stream).
        :returns: the highest tweet id seen, or None if no tweets processed.
        Stops early when ``self.stop_event`` is set.
        """
        max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Processed %s tweets", count)
            if self.stop_event.is_set():
                log.debug("Stopping since stop event set.")
                break
            # Items without "text" are skipped — presumably non-tweet stream
            # messages (limit notices, deletes); verify against twarc output.
            if "text" in tweet:
                with self.harvest_result_lock:
                    # FIX: was `max(max_tweet_id, tweet.get("id"))`, which
                    # raises TypeError on Python 3 for the first tweet
                    # (None is not orderable against an int).
                    tweet_id = tweet.get("id")
                    if tweet_id is not None and (max_tweet_id is None
                                                 or tweet_id > max_tweet_id):
                        max_tweet_id = tweet_id
                    self.harvest_result.increment_summary("tweet")
                    entities = tweet["entities"]
                    if "urls" in entities:
                        for url in entities["urls"]:
                            self.harvest_result.urls.append(
                                url["expanded_url"])
                    if "media" in entities:
                        for media in entities["media"]:
                            self.harvest_result.urls.append(
                                media["media_url"])
        return max_tweet_id