class TweetFilterTimelineDownloadable(TweetFilter):
    """Tweet filter that accepts a Tweet only if its author's timeline is on disk.

    For every Tweet passed to :meth:`filter`, the author's timeline is
    requested from the "statuses/user_timeline" endpoint and written to
    ``<download_path>/<screen_name>.tweets``.  When the timeline cannot be
    used (HTTP 404, HTTP 401, or fewer Tweets than the threshold) an *empty*
    file is written instead, so the same user is rejected on later calls
    without another request being made.
    """

    def __init__(self, twython, download_path, minimum_tweet_threshold, logger=None):
        """Set up the timeline crawler and remember the filter settings.

        :param twython: Twython client handed to RateLimitedTwitterEndpoint.
        :param download_path: directory in which ``*.tweets`` files are stored.
        :param minimum_tweet_threshold: minimum number of returned Tweets a
            user needs for the filter to accept them.
        :param logger: passed to both the crawler and the TweetFilter base
            class (presumably the base class provides ``self._logger`` --
            confirm against TweetFilter).
        """
        self._crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger)
        self._download_path = download_path
        self._minimum_tweet_threshold = minimum_tweet_threshold
        self._twython = twython
        TweetFilter.__init__(self, logger=logger)

    @staticmethod
    def _create_empty_file(path):
        """Create (or truncate) ``path`` as an empty "do not retry" marker."""
        with open(path, "w"):
            pass

    def filter(self, json_tweet_string):
        """Return True if the Tweet author's timeline is available on disk.

        :param json_tweet_string: a Tweet serialized as JSON; it must contain
            ``['user']['screen_name']`` (a KeyError propagates otherwise).
        :returns: True when a non-empty timeline file already exists or was
            just downloaded; False when the existing file is empty, the user
            could not be retrieved (HTTP 404 / 401), or the user has fewer
            Tweets than the configured threshold.
        :raises TwythonError: for any error code other than 404 and 401.
        """
        tweet = json.loads(json_tweet_string)
        screen_name = tweet['user']['screen_name']
        path_to_tweetfile = os.path.join(self._download_path, "%s.tweets" % screen_name)

        # If file already exists for user, don't try to rescrape their timeline.
        # An empty file marks a user that was rejected on an earlier attempt.
        if os.path.exists(path_to_tweetfile):
            self._logger.info("Timeline file for '%s' already exists - will not rescrape" % screen_name)
            return os.path.getsize(path_to_tweetfile) > 0

        try:
            self._logger.info("Retrieving Tweets for user '%s'" % screen_name)
            tweets = self._crawler.get_data(screen_name=screen_name, count=200)
        except TwythonError as e:
            # Single-argument parenthesised print: same output as the print
            # statement on Python 2.
            print("TwythonError: %s" % e)
            if e.error_code == 404:
                self._logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
            elif e.error_code == 401:
                self._logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name)
            else:
                # Unhandled exception: bare raise keeps the original traceback
                # (``raise e`` would reset it on Python 2).
                raise
            self._create_empty_file(path_to_tweetfile)
            return False

        if len(tweets) < self._minimum_tweet_threshold:
            self._logger.info("User '%s' has only %d Tweets, threshold is %d" %
                              (screen_name, len(tweets), self._minimum_tweet_threshold))
            self._create_empty_file(path_to_tweetfile)
            return False

        save_tweets_to_json_file(tweets, path_to_tweetfile)
        return True
def main():
    """Download the timeline of every screen name listed in a file.

    The single positional command line argument is the screen name file.
    For each screen name the Tweets are saved to ``<screen_name>.tweets`` in
    the current working directory; names whose file already exists are
    skipped.  HTTP 404 and 401 errors are logged and the user is skipped;
    any other TwythonError is re-raised.

    NOTE(review): ``consumer_key`` and ``consumer_secret`` are not defined in
    this function -- presumably module-level settings; confirm.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('screen_name_file')
    args = parser.parse_args()

    logger = get_console_info_logger()

    # Application-only (OAuth 2) authentication
    access_token = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=access_token)
    crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger)

    screen_names = get_screen_names_from_file(args.screen_name_file)

    for screen_name in screen_names:
        tweet_filename = "%s.tweets" % screen_name
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
            continue

        try:
            logger.info("Retrieving Tweets for user '%s'" % screen_name)
            tweets = crawler.get_data(screen_name=screen_name, count=200)
        except TwythonError as e:
            # Single-argument parenthesised print: same output as the print
            # statement on Python 2.
            print("TwythonError: %s" % e)
            if e.error_code == 404:
                logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
            elif e.error_code == 401:
                logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name)
            else:
                # Unhandled exception: bare raise keeps the original traceback
                # (``raise e`` would reset it on Python 2).
                raise
        else:
            save_tweets_to_json_file(tweets, tweet_filename)
def main():
    """Fetch and store the Tweets of each user named in the screen name file.

    Command line: one positional argument, the path of the screen name file.
    Output: one ``<screen_name>.tweets`` file per user, written relative to
    the current working directory.  Users whose file already exists are not
    downloaded again.  A TwythonError with code 404 or 401 is logged as a
    warning and the user is skipped; other TwythonErrors propagate.

    NOTE(review): ``consumer_key`` / ``consumer_secret`` come from outside
    this function (presumably module-level configuration) -- confirm.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    parser = argparse.ArgumentParser(description="")
    parser.add_argument('screen_name_file')
    args = parser.parse_args()

    logger = get_console_info_logger()

    # Obtain an OAuth 2 access token, then build the client used for crawling
    token = Twython(consumer_key, consumer_secret, oauth_version=2).obtain_access_token()
    twython = Twython(consumer_key, access_token=token)
    crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger)

    for screen_name in get_screen_names_from_file(args.screen_name_file):
        # Computed once and used for both the existence check and the save
        tweet_filename = "%s.tweets" % screen_name

        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
        else:
            try:
                logger.info("Retrieving Tweets for user '%s'" % screen_name)
                tweets = crawler.get_data(screen_name=screen_name, count=200)
            except TwythonError as e:
                # Parenthesised single-argument print is equivalent to the
                # print statement on Python 2.
                print("TwythonError: %s" % e)
                if e.error_code == 404:
                    logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
                elif e.error_code == 401:
                    logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name)
                else:
                    # Unhandled exception: re-raise with the original
                    # traceback intact (``raise e`` resets it on Python 2).
                    raise
            else:
                save_tweets_to_json_file(tweets, tweet_filename)
def main():
    """Download Tweets for the listed users and, optionally, their friends/followers.

    Command line options:
      -sn  text file with one screen name per line
      -t   YAML file holding ``consumer_key`` and ``consumer_secret``
      -d   friend/follower depth (integer, default 0)

    A fresh output directory ``data/<ISO timestamp>/`` is created.  The set
    of unique screen names is saved there as ``screen_names`` and each
    user's Tweets as ``<screen_name>.tweets``.  A TwythonError with code 404
    or 401 is logged and the user skipped; other TwythonErrors propagate.
    The process exits with status 1 if the output directory cannot be
    created.
    """
    # Make stdout output UTF-8, preventing "'ascii' codec can't encode" errors
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Parse and document command line options
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('-sn', dest='screen_name_file', default="example_screen_names.txt",
                        help='A text file with one screen name per line.')
    parser.add_argument('-t', dest='token_file',
                        default=os.path.expanduser("~") + "/.trawler/default.yaml",
                        help='A configuration file with Twitter API access tokens. See example_token_file.yaml.')
    # type=int lets argparse reject a non-numeric depth with a usage error
    # instead of a ValueError traceback later on.
    parser.add_argument('-d', dest='depth', type=int, default=0,
                        help='Friend and follower depth. A value of 1 will gather all tweets for users '
                             'in the file as well as all tweets from their friends and followers. Default is 0.')
    args = parser.parse_args()

    # Set up loggers and output directory
    logger = get_console_info_logger()
    output_directory = "data/" + datetime.datetime.now().isoformat() + "/"
    try:
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)
    except OSError:
        # Report the directory that failed and exit with a failure status.
        print("Could not create directory: %s" % output_directory)
        sys.exit(1)
    logger.info("Created directory: %s" % output_directory)

    # Set up API access; the token file is closed as soon as it is parsed
    with open(args.token_file) as token_file:
        tokens = yaml.safe_load(token_file)
    access_token = Twython(tokens['consumer_key'], tokens['consumer_secret'],
                           oauth_version=2).obtain_access_token()
    twython = Twython(tokens['consumer_key'], access_token=access_token)
    crawler = RateLimitedTwitterEndpoint(twython, "statuses/user_timeline", logger)

    # Gather unique screen names
    screen_names = get_screen_names_from_file(args.screen_name_file)
    depth = args.depth
    if depth > 0:
        # don't initiate ff_finder unless we have to
        ff_finder = FindFriendFollowers(twython, logger)
        ff_screen_names = get_ff(screen_names, depth, ff_finder, logger)
        unique_screen_names = set(ff_screen_names)
    else:
        # assume the list has redundant names
        unique_screen_names = set(screen_names)
    save_screen_names_to_file(unique_screen_names, output_directory + 'screen_names')

    # Gather tweets for each of the unique screen names
    for screen_name in unique_screen_names:
        tweet_filename = output_directory + screen_name + ".tweets"
        if os.path.exists(tweet_filename):
            logger.info("File '%s' already exists - will not attempt to download Tweets for '%s'" % (tweet_filename, screen_name))
            continue

        try:
            logger.info("Retrieving Tweets for user " + screen_name + " writing to file " + tweet_filename)
            tweets = crawler.get_data(screen_name=screen_name, count=200)
        except TwythonError as e:
            # Single-argument parenthesised print: same output as the print
            # statement on Python 2.
            print("TwythonError: %s" % e)
            if e.error_code == 404:
                logger.warn("HTTP 404 error - Most likely, Twitter user '%s' no longer exists" % screen_name)
            elif e.error_code == 401:
                logger.warn("HTTP 401 error - Most likely, Twitter user '%s' no longer publicly accessible" % screen_name)
            else:
                # Unhandled exception: bare raise keeps the original traceback
                # (``raise e`` would reset it on Python 2).
                raise
        else:
            save_tweets_to_json_file(tweets, tweet_filename)