class TwitterHarvester(BaseHarvester):
    """Harvest tweets through twarc and tally them from the resulting WARC files.

    The "type" field of the harvest message selects one of four modes:
    ``twitter_search``, ``twitter_filter``, ``twitter_sample`` or
    ``twitter_user_timeline``.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None,
                 debug=False, connection_errors=5, http_errors=5, debug_warcprox=False,
                 tries=3):
        """Initialise the base harvester and remember the twarc retry limits.

        ``connection_errors`` and ``http_errors`` are passed to ``Twarc`` when
        the client is created; every other argument is forwarded unchanged to
        ``BaseHarvester.__init__``.
        """
        BaseHarvester.__init__(
            self, working_path, mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        # The twarc client is created per harvest in _create_twarc().
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors

    def harvest_seeds(self):
        """Create a twarc client and run the harvest selected by the message type.

        Raises:
            KeyError: if the message "type" is not a supported harvest type.
        """
        # Create a twarc
        self._create_twarc()

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        handlers = {
            "twitter_search": self.search,
            "twitter_filter": self.filter,
            "twitter_sample": self.sample,
            "twitter_user_timeline": self.user_timeline,
        }
        handler = handlers.get(harvest_type)
        if handler is None:
            raise KeyError("Unknown harvest type: {}".format(harvest_type))
        handler()

    def _create_twarc(self):
        """Build ``self.twarc`` from the message credentials, in extended tweet mode."""
        credentials = self.message["credentials"]
        self.twarc = Twarc(credentials["consumer_key"],
                           credentials["consumer_secret"],
                           credentials["access_token"],
                           credentials["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors,
                           tweet_mode="extended")

    def search(self):
        """Harvest the results of the single search seed.

        When the "incremental" option is set, the stored since_id for this
        search is passed to twarc.
        """
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(self._search_id())) if incremental else None

        query, geocode = self._search_parameters()
        self._harvest_tweets(self.twarc.search(query, geocode=geocode, since_id=since_id))

    def _search_parameters(self):
        """Return ``(query, geocode)`` for the first seed.

        The seed token is either a dict with optional "query" and "geocode"
        keys, or a bare value that is used as the query.
        """
        token = self.message["seeds"][0]["token"]
        if isinstance(token, dict):
            return token.get("query"), token.get("geocode")
        return token, None

    def _search_id(self):
        """Return the identifier used in the state-store key for this search.

        The identifier is the query, the geocode, or "query:geocode" when both
        are present. If neither is set the result is an empty string.
        """
        query, geocode = self._search_parameters()
        return ":".join(part for part in (query, geocode) if part)

    def filter(self):
        """Harvest the filter stream described by the single seed's token."""
        assert len(self.message.get("seeds", [])) == 1

        token = self.message["seeds"][0]["token"]
        self._harvest_tweets(
            self.twarc.filter(track=token.get("track"),
                              follow=token.get("follow"),
                              locations=token.get("locations"),
                              lang=token.get("language"),
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the sample stream until the stop event is set."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest the timeline of every seed, resolving user ids and screen names.

        A seed needs a screen name ("token") or a user id ("uid"). Resolved
        ids are recorded in ``result.uids`` and changed screen names in
        ``result.token_updates``. Seeds whose account lookup does not return
        "OK" get a warning appended to ``result.warnings`` and are skipped.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                result, user = self._lookup_user(screen_name, "screen_name")
                if result == "OK":
                    user_id = user["id_str"]
                    self.result.uids[seed_id] = user_id
                else:
                    msg = u"User id not found for {} because account is {}".format(
                        screen_name, self._result_to_reason(result))
                    # No exception is being handled here, so log a plain warning.
                    log.warning(msg)
                    self.result.warnings.append(
                        Msg("token_{}".format(result), msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                result, user = self._lookup_user(user_id, "user_id")
                if result == "OK":
                    new_screen_name = user["screen_name"]
                    if new_screen_name and new_screen_name != screen_name:
                        self.result.token_updates[seed_id] = new_screen_name
                else:
                    msg = u"User {} (User ID: {}) not found because account is {}".format(
                        screen_name, user_id, self._result_to_reason(result))
                    log.warning(msg)
                    self.result.warnings.append(
                        Msg("uid_{}".format(result), msg, seed_id=seed_id))
                    # Clear the id so the timeline is not harvested for this seed.
                    user_id = None

            if user_id:
                # Get since_id from state_store
                since_id = self.state_store.get_state(
                    __name__,
                    "timeline.{}.since_id".format(user_id)) if incremental else None
                self._harvest_tweets(
                    self.twarc.timeline(user_id=user_id, since_id=since_id))

    def _lookup_user(self, id, id_type):
        """Look up one user and classify the outcome.

        Args:
            id: the screen name or user id to look up.
            id_type: the request parameter name, "screen_name" or "user_id".

        Returns:
            ``(result, user)`` where result is "OK", "unauthorized",
            "not_found" or "suspended", and user is the decoded response
            (None when the request failed).

        Raises:
            requests.exceptions.HTTPError: for any HTTP error other than the
                recognised not-found and suspended responses.
        """
        url = "https://api.twitter.com/1.1/users/show.json"
        params = {id_type: id}

        # USER_DELETED: 404 and {"errors": [{"code": 50, "message": "User not found."}]}
        # USER_PROTECTED: 200 and user object with "protected": true
        # USER_SUSPENDED: 403 and {"errors":[{"code":63,"message":"User has been suspended."}]}
        result = "OK"
        user = None
        try:
            resp = self.twarc.get(url, params=params, allow_404=True)
            user = resp.json()
            if user.get("protected"):
                result = "unauthorized"
        except requests.exceptions.HTTPError as e:
            try:
                resp_json = e.response.json()
            except ValueError:
                # JSON decode errors are ValueError subclasses; surface the
                # HTTP error rather than the decode failure.
                raise e
            status_code = e.response.status_code
            if status_code == 404 and self._has_error_code(resp_json, 50):
                result = "not_found"
            elif status_code == 403 and self._has_error_code(resp_json, 63):
                result = "suspended"
            else:
                raise
        return result, user

    @staticmethod
    def _has_error_code(resp, code):
        """Return True if any entry of ``resp["errors"]`` has a code in ``code``.

        ``code`` may be a single int or a collection of ints. A payload with
        no "errors" entry yields False.
        """
        codes = (code,) if isinstance(code, int) else code
        return any(error.get("code") in codes for error in resp.get("errors") or [])

    @staticmethod
    def _result_to_reason(result):
        """Translate a lookup result code into the wording used in warnings."""
        reasons = {"unauthorized": "protected", "suspended": "suspended"}
        return reasons.get(result, "not found or deleted")

    def _harvest_tweets(self, tweets):
        """Consume ``tweets``, counting each one, until exhausted or stopped."""
        for count, _ in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def process_warc(self, warc_filepath):
        """Process a WARC file using the iterator that matches the harvest type.

        Raises:
            KeyError: if the message "type" is not a supported harvest type.
        """
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type in ("twitter_filter", "twitter_sample"):
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError("Unknown harvest type: {}".format(harvest_type))

    def process_search_warc(self, warc_filepath):
        """Count the tweets in a search WARC and advance the stored since_id.

        The state store is only written for incremental harvests, and only
        when the largest tweet id seen exceeds the stored value.
        """
        incremental = self.message.get("options", {}).get("incremental", False)
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(self._search_id())) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(
                __name__, u"{}.since_id".format(self._search_id()), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Count the tweets in a timeline WARC and track since_id per user.

        For incremental harvests each tweet raises the stored
        "timeline.<user id>.since_id" value to at least that tweet's id.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            # Only items carrying tweet text are counted.
            if "text" in tweet or "full_text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet from ``warc_iter`` and return the largest tweet id.

        Returns None when the iterator yields no tweets.
        """
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet or "full_text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, _):
        """Record one processed tweet in the harvest statistics."""
        self.result.increment_stats("tweets")
class TwitterHarvester(BaseHarvester):
    """Harvest tweets through twarc and extract URLs from the resulting WARC files.

    The "type" field of the harvest message selects one of four modes:
    ``twitter_search``, ``twitter_filter``, ``twitter_sample`` or
    ``twitter_user_timeline``. Harvest options decide which URLs (media, web
    resources, user profile images) are collected while processing tweets.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None,
                 debug=False, connection_errors=5, http_errors=5, debug_warcprox=False,
                 tries=3):
        """Initialise the base harvester, twarc retry limits and extract flags.

        ``connection_errors`` and ``http_errors`` are passed to ``Twarc`` when
        the client is created; every other argument is forwarded unchanged to
        ``BaseHarvester.__init__``.
        """
        BaseHarvester.__init__(
            self, working_path, mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        # The twarc client is created per harvest in _create_twarc().
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extract options; overwritten from the message in harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Create a twarc client, read extract options and run the harvest.

        Raises:
            KeyError: if the message "type" is not a supported harvest type.
        """
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        options = self.message.get("options", {})
        self.extract_media = options.get("media", False)
        self.extract_web_resources = options.get("web_resources", False)
        self.extract_user_profile_images = options.get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        handlers = {
            "twitter_search": self.search,
            "twitter_filter": self.filter,
            "twitter_sample": self.sample,
            "twitter_user_timeline": self.user_timeline,
        }
        handler = handlers.get(harvest_type)
        if handler is None:
            raise KeyError("Unknown harvest type: {}".format(harvest_type))
        handler()

    def _create_twarc(self):
        """Build ``self.twarc`` from the credentials in the harvest message."""
        credentials = self.message["credentials"]
        self.twarc = Twarc(credentials["consumer_key"],
                           credentials["consumer_secret"],
                           credentials["access_token"],
                           credentials["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest the results of the single search seed.

        When the "incremental" option is set, the stored since_id for the
        query is passed to twarc.
        """
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the filter stream described by the single seed's token."""
        assert len(self.message.get("seeds", [])) == 1

        token = self.message["seeds"][0]["token"]
        self._harvest_tweets(
            self.twarc.filter(track=token.get("track"),
                              follow=token.get("follow"),
                              locations=token.get("locations"),
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the sample stream until the stop event is set."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest the timeline of every seed, resolving user ids and screen names.

        A seed needs a screen name ("token") or a user id ("uid"). Resolved
        ids are recorded in ``result.uids`` and changed screen names in
        ``result.token_updates``. Failed lookups and HTTP 401 responses add a
        warning to ``result.warnings``; other HTTP errors propagate.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(
                        screen_name)
                    # No exception is being handled here, so log a plain warning.
                    log.warning(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # if can't find the screen_name, ignore get timeline
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(
                        user_id)
                    log.warning(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # reset the user_id, ignore the get timeline
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__,
                        "timeline.{}.since_id".format(user_id)) if incremental else None

                    self._harvest_tweets(
                        self.twarc.timeline(user_id=user_id, since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code != 401:
                        raise
                    account = "user {} (User ID: {})".format(
                        screen_name, user_id
                    ) if screen_name else "user ID: {}".format(user_id)
                    msg = "Unauthorized for {} because account is suspended or protected".format(
                        account)
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id))

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        Returns None when no user is returned or the lookup responds with
        HTTP 404; other HTTP errors propagate.
        """
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id,)))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        Returns None when no user is returned or the lookup responds with
        HTTP 404; other HTTP errors propagate.
        """
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise
        return None

    def _harvest_tweets(self, tweets):
        """Consume ``tweets``, counting each one, until exhausted or stopped."""
        for count, _ in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Append URLs from an entities dict to ``result.urls`` per extract options."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                expanded_url = url["expanded_url"]
                # Exclude links for tweets
                if expanded_url and not status_re.match(expanded_url):
                    self.result.urls.append(expanded_url)
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Process a WARC file using the iterator that matches the harvest type.

        Raises:
            KeyError: if the message "type" is not a supported harvest type.
        """
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type in ("twitter_filter", "twitter_sample"):
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError("Unknown harvest type: {}".format(harvest_type))

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the stored since_id for the query.

        The state store is only written for incremental harvests, and only
        when the largest tweet id seen exceeds the stored value.
        """
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store. Either id may be None (no tweets / no stored
        # state), so compare through 0 instead of comparing None with an int.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(
                __name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a timeline WARC and track since_id per user.

        For incremental harvests each tweet raises the stored
        "timeline.<user id>.since_id" value to at least that tweet's id.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    # The stored value is None before the first write.
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet from ``warc_iter`` and return the largest tweet id.

        Returns None when the iterator yields no tweets.
        """
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count the tweet and collect the URLs selected by the extract options.

        Entities are read from the tweet itself plus its retweeted status, or
        else its quoted status, when present.
        """
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            user = tweet["user"]
            # Skip image fields that are absent or null rather than queueing
            # None as a URL.
            for field in ("profile_image_url", "profile_background_image_url",
                          "profile_banner_url"):
                image_url = user.get(field)
                if image_url:
                    self.result.urls.append(image_url)
class TwitterHarvester(BaseHarvester):
    """Harvest tweets through twarc and extract URLs from the resulting WARC files.

    The "type" field of the harvest message selects one of four modes:
    ``twitter_search``, ``twitter_filter``, ``twitter_sample`` or
    ``twitter_user_timeline``. Harvest options decide which URLs (media, web
    resources, user profile images) are collected while processing tweets.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None,
                 debug=False, connection_errors=5, http_errors=5, debug_warcprox=False,
                 tries=3):
        """Initialise the base harvester, twarc retry limits and extract flags.

        ``connection_errors`` and ``http_errors`` are passed to ``Twarc`` when
        the client is created; every other argument is forwarded unchanged to
        ``BaseHarvester.__init__``.
        """
        BaseHarvester.__init__(
            self, working_path, mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        # The twarc client is created per harvest in _create_twarc().
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extract options; overwritten from the message in harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Create a twarc client, read extract options and run the harvest.

        Raises:
            KeyError: if the message "type" is not a supported harvest type.
        """
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        options = self.message.get("options", {})
        self.extract_media = options.get("media", False)
        self.extract_web_resources = options.get("web_resources", False)
        self.extract_user_profile_images = options.get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        handlers = {
            "twitter_search": self.search,
            "twitter_filter": self.filter,
            "twitter_sample": self.sample,
            "twitter_user_timeline": self.user_timeline,
        }
        handler = handlers.get(harvest_type)
        if handler is None:
            raise KeyError("Unknown harvest type: {}".format(harvest_type))
        handler()

    def _create_twarc(self):
        """Build ``self.twarc`` from the credentials in the harvest message."""
        credentials = self.message["credentials"]
        self.twarc = Twarc(credentials["consumer_key"],
                           credentials["consumer_secret"],
                           credentials["access_token"],
                           credentials["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest the results of the single search seed.

        When the "incremental" option is set, the stored since_id for the
        query is passed to twarc.
        """
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the filter stream described by the single seed's token."""
        assert len(self.message.get("seeds", [])) == 1

        token = self.message["seeds"][0]["token"]
        self._harvest_tweets(
            self.twarc.filter(track=token.get("track"),
                              follow=token.get("follow"),
                              locations=token.get("locations")))

    def sample(self):
        """Harvest the sample stream."""
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        """Harvest the timeline of every seed, resolving user ids and screen names.

        A seed needs a screen name ("token") or a user id ("uid"). Resolved
        ids are recorded in ``result.uids`` and changed screen names in
        ``result.token_updates``. A failed id lookup or an HTTP 401 response
        adds a warning to ``result.warnings``; other HTTP errors propagate.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s",
                      seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    # No exception is being handled here, so log a plain warning.
                    log.warning(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # A failed lookup returns None; do not report that as a token
                # update or overwrite the screen name we already have.
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__,
                        "timeline.{}.since_id".format(user_id)) if incremental else None

                    self._harvest_tweets(
                        self.twarc.timeline(user_id=user_id, since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code != 401:
                        raise
                    msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(
                        screen_name, user_id)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        Returns None when the lookup yields no user.
        """
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        Returns None when the lookup yields no user.
        """
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        """Consume ``tweets``, counting each one, until exhausted or stopped."""
        for count, _ in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Append URLs from an entities dict to ``result.urls`` per extract options."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                expanded_url = url["expanded_url"]
                # Exclude links for tweets
                if expanded_url and not status_re.match(expanded_url):
                    self.result.urls.append(expanded_url)
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Process a WARC file using the iterator that matches the harvest type.

        Raises:
            KeyError: if the message "type" is not a supported harvest type.
        """
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type in ("twitter_filter", "twitter_sample"):
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError("Unknown harvest type: {}".format(harvest_type))

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the stored since_id for the query.

        The state store is only written for incremental harvests, and only
        when the largest tweet id seen exceeds the stored value.
        """
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store. Either id may be None (no tweets / no stored
        # state), so compare through 0 instead of comparing None with an int.
        if incremental and (max_tweet_id or 0) > (since_id or 0):
            self.state_store.set_state(
                __name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a timeline WARC and track since_id per user.

        For incremental harvests each tweet raises the stored
        "timeline.<user id>.since_id" value to at least that tweet's id.
        """
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    # The stored value is None before the first write.
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key) or 0,
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process every tweet from ``warc_iter`` and return the largest tweet id.

        Returns None when the iterator yields no tweets.
        """
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id or 0, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count the tweet and collect the URLs selected by the extract options.

        Entities are read from the tweet itself plus its retweeted status, or
        else its quoted status, when present.
        """
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            user = tweet["user"]
            # Skip image fields that are absent or null rather than queueing
            # None as a URL.
            for field in ("profile_image_url", "profile_background_image_url",
                          "profile_banner_url"):
                image_url = user.get(field)
                if image_url:
                    self.result.urls.append(image_url)