def add_users_by_screen_names(self, screen_names):
    """Resolve screen names via the Twitter API and record them as users.

    Each resolvable name is added to self['users'] keyed by the account's
    id_str (already-present entries are left untouched).  Returns the
    original-case screen names that could NOT be resolved so the caller
    can report them.

    Raises CollectionConfigException when no API keys are configured.
    """
    if 'keys' not in self:
        raise CollectionConfigException(
            'Keys are required to add users by screen name.')
    creds = self['keys']
    client = Twarc(creds['consumer_key'], creds['consumer_secret'],
                   creds['access_token'], creds['access_token_secret'])
    # Lower-cased name -> caller's original casing (leading '@' removed);
    # the API reports names that we match case-insensitively below.
    unresolved = {}
    for raw_name in screen_names:
        stripped = raw_name.lstrip('@')
        if stripped:
            unresolved[stripped.lower()] = stripped
    if 'users' not in self:
        self['users'] = {}
    matched = []
    for profile in client.user_lookup(unresolved.keys(),
                                      id_type='screen_name'):
        if profile['id_str'] not in self['users']:
            self['users'][profile['id_str']] = {
                'screen_name': profile['screen_name']
            }
        matched.append(profile['screen_name'].lower())
    # Every name the API returned is resolved; drop it from the map.
    for resolved_name in matched:
        del unresolved[resolved_name]
    return unresolved.values()
def overhear_conversation(graph=None, USERS_PATH=Path('data/users/')):
    """Harvest follower/status counts for users listed in CSVs under
    USERS_PATH and write them onto matching Rep nodes in a Neo4j graph.

    :param graph: py2neo Graph connection; constructed lazily when None.
    :param USERS_PATH: directory holding the user CSVs; also receives the
        intermediate user_stats.csv.
    """
    # Fix: the original signature used graph=Graph("bolt://...") as a
    # default argument (with an unbalanced paren), which opened the
    # database connection at import time.  Defer construction to call time.
    if graph is None:
        graph = Graph("bolt://*****:*****@ssw0rd")
    t = Twarc(credentials.CONSUMER_KEY, credentials.CONSUMER_SECRET,
              credentials.ACCESS_TOKEN, credentials.ACCESS_TOKEN_SECRET)
    # loop through csvs of usernames and grab them
    user_ids = []
    for doc in USERS_PATH.glob('*.*'):
        with open(doc) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader)  # skip the header row
            for row in csv_reader:
                user_ids.append(row[1])  # second column holds the id/handle
    # use twarc to call those usernames and grab their stats
    # (open the stats file once instead of re-opening per returned user)
    with open(USERS_PATH / 'user_stats.csv', 'a+') as csv_file:
        writer = csv.writer(csv_file, delimiter='|')
        for user in t.user_lookup(user_ids):
            writer.writerow([
                user['name'], user['screen_name'], user['followers_count'],
                user['statuses_count']
            ])
    # loop through the stats and store them in the graph
    with open(USERS_PATH / 'user_stats.csv', 'r') as file:
        csv_reader = csv.reader(file, delimiter='|')
        for row in csv_reader:
            u_param = {
                'surname': last_name(row[0]),
                'user_name': row[1],
                'follower_count': row[2],
                'statuses_count': row[3]
            }
            print(u_param)
            u_query = '''
                MATCH (r:Rep {surname: $surname})
                SET r.user_name=$user_name,
                    r.followers=$follower_count,
                    r.statuses=$statuses_count
            '''
            graph.run(u_query, u_param)
class TwitterHarvester(BaseHarvester):
    """Harvester for Twitter (search, filter, sample, user timeline).

    harvest_seeds() dispatches on the message "type" to the matching twarc
    call; responses are captured into WARCs by the surrounding machinery,
    and process_warc() later re-reads them to count tweets, extract URLs,
    and maintain incremental since_id state.

    NOTE(review): a second TwitterHarvester definition appears later in
    this file; if both live in one module the later one shadows this one --
    confirm how this file was assembled.
    """

    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        # connection_errors / http_errors are retry budgets handed to twarc
        # in _create_twarc().
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        self.twarc = None  # built per harvest in _create_twarc()
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extraction flags; overwritten from message options in
        # harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Perform the harvest described by self.message."""
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get(
            "web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get(
            "user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            # Unknown harvest type is a programming error upstream.
            raise KeyError

    def _create_twarc(self):
        """Instantiate the twarc client from the message credentials."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest a single search-query seed, resuming from the stored
        since_id when the harvest is incremental."""
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the streaming filter endpoint for a single seed; the
        stop event lets the stream be interrupted cleanly."""
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(
            self.twarc.filter(track=track,
                              follow=follow,
                              locations=locations,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest the streaming sample endpoint until the stop event
        fires."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest the timeline of each user seed.

        A seed carries a screen name ("token") and/or a user id ("uid");
        the missing half is looked up and reported back via result.uids /
        result.token_updates.  Accounts that are gone or unauthorized
        produce warnings instead of failing the harvest.
        """
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(
                        screen_name)
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # if can't find the screen_name, ignore get timeline
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(
                        user_id)
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # reset the user_id, ignore the get timeline
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(
                            user_id)) if incremental else None
                    self._harvest_tweets(
                        self.twarc.timeline(user_id=user_id,
                                            since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        account = "user {} (User ID: {})".format(
                            screen_name, user_id
                        ) if screen_name else "user ID: {}".format(user_id)
                        msg = "Unauthorized for {} because account is suspended or protected".format(
                            account)
                        log.exception(msg)
                        self.result.warnings.append(
                            Msg(CODE_TOKEN_UNAUTHORIZED, msg,
                                seed_id=seed_id))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        Returns None when the account does not exist (HTTP 404) or the
        lookup comes back empty; other HTTP errors propagate.
        """
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        Returns None when the account does not exist (HTTP 404) or the
        lookup comes back empty; other HTTP errors propagate.
        """
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _harvest_tweets(self, tweets):
        """Drain a twarc tweet iterator, counting tweets and honoring the
        stop event (WARC capture happens as a side effect of iteration)."""
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Collect URLs from a tweet's entities per the extraction flags."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(
                        url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Post-process a harvested WARC, dispatching on harvest type."""
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the stored since_id."""
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store
        # NOTE(review): max_tweet_id and since_id may each be None here;
        # the comparison relies on Python 2 None ordering -- confirm before
        # any Python 3 migration.
        if incremental and max_tweet_id > since_id:
            self.state_store.set_state(__name__, u"{}.since_id".format(query),
                                       max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a timeline WARC, advancing per-user since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    # NOTE(review): max(None, id) also relies on Python 2
                    # None ordering when no state exists yet.
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(
                        __name__, key,
                        max(self.state_store.get_state(__name__, key),
                            tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Run _process_tweet over every tweet in the WARC iterator and
        return the largest tweet id seen (None if there were none)."""
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count one tweet and extract URLs from it (and any retweeted or
        quoted status) per the extraction flags."""
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(
                tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
# NOTE(review): the two statements below are the tail of a
# writer.writerow(...) call whose opening lines fall before this chunk;
# reproduced as-is.
    'user_created_at', 'verified', 'protected'])
    writer.writerows(user_map.values())


def clean_cell(cell):
    """Return the stripped cell text, or None for empty or '-' placeholder
    cells."""
    if cell and cell != '-':
        return cell.strip()
    return None


if __name__ == '__main__':
    # Gather gallery members from the three accreditation CSVs, keyed by
    # screen name (csv_iter and t are defined elsewhere in this file).
    user_map = {}
    for user in csv_iter([
            'Senate_Press_Galleries.csv', 'Senate_Periodical_Galleries.csv',
            'Radio_and_Television.csv'
    ]):
        user_map[user['screen_name']] = user
    # Enrich each matched entry with live stats from the Twitter API.
    for user_json in t.user_lookup(screen_names=user_map.keys()):
        # NOTE(review): keys appear to be lower-cased screen names (the
        # lookup below lower-cases before matching) -- confirm in csv_iter.
        user = user_map.get(user_json['screen_name'].lower())
        if user:
            user['user_id'] = user_json['id_str']
            user['followers_count'] = user_json['followers_count']
            user['following_count'] = user_json['friends_count']
            user['tweet_count'] = user_json['statuses_count']
            user['user_created_at'] = user_json['created_at']
            user['verified'] = user_json['verified']
            user['protected'] = user_json['protected']
    # Remove entries without match
    screen_names = list(user_map.keys())
    for screen_name in screen_names:
        if 'user_id' not in user_map[screen_name]:
            del user_map[screen_name]
# NOTE(review): the two statements below are the tail of a function whose
# def falls before this chunk; reproduced as-is.
    print(result)
    return result


# Module-level script: read user ids from CSVs, fetch their stats via the
# Twitter API, and print the parameters that would be stored in the graph.
t = Twarc(credentials.CONSUMER_KEY, credentials.CONSUMER_SECRET,
          credentials.ACCESS_TOKEN, credentials.ACCESS_TOKEN_SECRET)
USERS_PATH = Path('data/users/')

user_ids = []
# loop through csvs of usernames and grab them
for doc in (USERS_PATH).glob('*.*'):
    with open(doc) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader)  # skip the header row
        for row in csv_reader:
            user_ids.append(row[1])  # second column holds the id/handle
# use twarc to call those usernames and grab their stats
for user in t.user_lookup(user_ids):
    # NOTE(review): re-opens the stats file once per user and appends, so
    # repeated runs accumulate duplicate rows.
    with open(USERS_PATH / 'user_stats.csv', 'a+') as csv_file:
        writer = csv.writer(csv_file, delimiter='|')
        writer.writerow([
            user['name'], user['screen_name'], user['followers_count'],
            user['statuses_count']
        ])
# loop through the stats and store them in the graph
with open(USERS_PATH / 'user_stats.csv', 'r') as file:
    csv_reader = csv.reader(file, delimiter='|')
    for row in csv_reader:
        u_param = {
            'surname': last_name(row[0]),
            'user_name': row[1],
            'follower_count': row[2],
            'statuses_count': row[3]
        }
        print(u_param)
def user_info_crawler(screen_name, user_dir, user_profile_f, user_profileimg_f,
                      user_tweets_f, user_clean_tweets_f):
    """Fetch and cache a user's profile, profile image, and cleaned tweets.

    Each artifact is written under user_dir and only fetched when its file
    does not already exist, so the function is safe to re-run.

    :param screen_name: Twitter screen name to crawl.
    :param user_dir: directory receiving the cached files.
    :param user_profile_f: profile JSON file name (relative to user_dir).
    :param user_profileimg_f: converted profile image file name.
    :param user_tweets_f: raw timeline file name (one JSON object per line).
    :param user_clean_tweets_f: cleaned-tweet text file name.
    :returns: the profile dict on success, or None if any step fails
        (errors are deliberately swallowed; see the except below).
    """
    try:
        t = None  # Twarc client; created lazily on first API use.

        # crawl user profile
        profile_path = os.path.join(user_dir, user_profile_f)
        if not os.path.exists(profile_path):
            t = Twarc(consumer_key, consumer_secret,
                      access_token, access_token_secret)
            user_profile_data = t.user_lookup(ids=[screen_name],
                                              id_type="screen_name")
            for user_profile in user_profile_data:
                with open(profile_path, 'w') as outfile:
                    json.dump(user_profile, outfile)

        with open(profile_path, 'r') as rf:
            user_profile_json = json.load(rf)

        # crawl user profile image
        if not os.path.exists(os.path.join(user_dir, user_profileimg_f)):
            # extract user profile image url
            user_profileimg_url = user_profile_json['profile_image_url']

            def image_converter(user_profileimg_url):
                # Download to a temp file, convert to RGB, save, clean up.
                tmp_file = '../data/user/tmp' + user_profileimg_url[-4:]
                if sys.version_info[0] == 2:
                    urllib.urlretrieve(user_profileimg_url, tmp_file)
                elif sys.version_info[0] == 3:
                    urlretrieve(user_profileimg_url, tmp_file)
                from PIL import Image
                im = Image.open(tmp_file)
                rgb_im = im.convert('RGB')
                rgb_im.save(os.path.join(user_dir, user_profileimg_f))
                os.remove(tmp_file)

            if user_profileimg_url:
                # Ask Twitter for the larger rendition of the avatar.
                user_profileimg_url = user_profileimg_url.replace(
                    '_normal', '_bigger')
                image_converter(user_profileimg_url)

        # crawl user tweets
        tweets_path = os.path.join(user_dir, user_tweets_f)
        if not os.path.exists(tweets_path):
            # Fix: the original only created the Twarc client inside the
            # profile-fetch branch, so t was unbound (NameError) whenever
            # the profile was cached but the tweets file was missing.
            if t is None:
                t = Twarc(consumer_key, consumer_secret,
                          access_token, access_token_secret)
            user_timeline_data = t.timeline(screen_name=screen_name)
            with open(tweets_path, 'a') as outfile:
                for user_timeline in user_timeline_data:
                    json.dump(user_timeline, outfile)
                    outfile.write('\n')

        # clean user tweets
        if not os.path.exists(os.path.join(user_dir, user_clean_tweets_f)):
            tweet_raw_lines = []
            with open(tweets_path, 'r') as rf:
                for line in rf:
                    tweet_raw_lines.append(json.loads(line)['full_text'])
            clean_tweets = process_raw_tweets(tweet_raw_lines)
            # Fix: dropped the redundant wf.close() -- the with block
            # already closes the file.
            with open(os.path.join(user_dir, user_clean_tweets_f), 'w') as wf:
                for tweet in clean_tweets:
                    if len(tweet) > 0:
                        wf.write(tweet + '\n')

        return user_profile_json
    except Exception as e:
        # Best-effort crawler: report and return None rather than crash.
        # print(e)
        print("Could not predict user's role. Check account info, few tweets, incorrect image format...")
class TwitterHarvester(BaseHarvester):
    """Harvester for Twitter (search, filter, sample, user timeline).

    NOTE(review): this appears to be an earlier revision of the
    TwitterHarvester defined above in this file: the streaming calls take
    no stop event, the user lookups do not tolerate HTTP 404, and warnings
    are recorded without seed ids.  Confirm which revision should survive.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60,
                 mq_config=None, debug=False, connection_errors=5,
                 http_errors=5, debug_warcprox=False, tries=3):
        # connection_errors / http_errors are retry budgets handed to twarc
        # in _create_twarc().
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox,
                               tries=tries)
        self.twarc = None  # built per harvest in _create_twarc()
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extraction flags; overwritten from message options in
        # harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Perform the harvest described by self.message."""
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            # Unknown harvest type is a programming error upstream.
            raise KeyError

    def _create_twarc(self):
        """Instantiate the twarc client from the message credentials."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest a single search-query seed, resuming from the stored
        since_id when the harvest is incremental."""
        assert len(self.message.get("seeds", [])) == 1
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest the streaming filter endpoint for a single seed.

        NOTE(review): no stop event is passed here, unlike the later
        revision of this class, so the stream cannot be interrupted via
        twarc.
        """
        assert len(self.message.get("seeds", [])) == 1
        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")
        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        """Harvest the streaming sample endpoint (no stop event passed)."""
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        """Harvest the timeline of each user seed, resolving the missing
        user id / screen name and warning on unauthorized accounts."""
        incremental = self.message.get("options", {}).get("incremental", False)
        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id)
            assert screen_name or user_id
            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # NOTE(review): if the lookup returns None this records a
                # None token update and still attempts the timeline fetch.
                if new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(__name__, "timeline.{}.since_id".format(
                        user_id)) if incremental else None
                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))
                except HTTPError as e:
                    if e.response.status_code == 401:
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        NOTE(review): unlike the later revision, HTTP errors (e.g. 404 for
        a deleted account) propagate to the caller here.
        """
        users = list(self.twarc.user_lookup(user_ids=(user_id,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["screen_name"]
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        NOTE(review): HTTP errors propagate to the caller here.
        """
        users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
        assert len(users) in (0, 1)
        if users:
            return users[0]["id_str"]
        return None

    def _harvest_tweets(self, tweets):
        """Drain a twarc tweet iterator, counting tweets and honoring the
        stop event (WARC capture happens as a side effect of iteration)."""
        # max_tweet_id = None
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Collect URLs from a tweet's entities per the extraction flags."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Post-process a harvested WARC, dispatching on harvest type."""
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            raise KeyError

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the stored since_id."""
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None
        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))
        # Update state store
        # NOTE(review): max_tweet_id and since_id may each be None; the
        # comparison relies on Python 2 None ordering.
        if incremental and max_tweet_id > since_id:
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a timeline WARC, advancing per-user since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)
        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state
                    key = "timeline.{}.since_id".format(user_id)
                    self.state_store.set_state(__name__, key,
                                               max(self.state_store.get_state(__name__, key), tweet.get("id")))
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Run _process_tweet over every tweet in the WARC iterator and
        return the largest tweet id seen (None if there were none)."""
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                max_tweet_id = max(max_tweet_id, tweet.get("id"))
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count one tweet and extract URLs from it (and any retweeted or
        quoted status) per the extraction flags."""
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])