def get_followers(name):
    # open spreadsheet and add column heads
    with open_csv_w('%s_followerlist.csv' % name) as f:
        writer = csv.writer(f)
        writer.writerow([
            "id", "screen_name", "display_name", "bio", "followers_count",
            "following_count", "acct_created", "location"
        ])

        # followers_ids returns an array of the ids of all the people who follow the user
        follower_ids = api.followers_ids(screen_name=name)

        # cycle through every id in the array of followers and gather information for each one
        for follower_id in follower_ids:
            user = None
            while user is None:
                try:
                    user = api.get_user(follower_id)
                except tweepy.error.RateLimitError:
                    print("sleeping for a minute")
                    time.sleep(60)

            # write a row for this follower
            writer.writerow([
                follower_id, user.screen_name, user.name, user.description,
                user.followers_count, user.friends_count, user.created_at,
                user.location
            ])
            print(user.screen_name)
def proof_facebook_ids(fb_id):
    # construct the url for an http request to the Facebook Graph API
    base = 'https://graph.facebook.com/v2.9'
    page_id = "/%s/" % fb_id
    extra_parameters = '?access_token=%s' % access_token
    url = base + page_id + extra_parameters

    # variables for spreadsheet
    validity = ""
    processed_id = fb_id
    error_message = ""

    # retrieve data
    resp = custom_request(url)
    if resp:
        validity = True
        data = json.loads(resp)
        error_message = None
    else:
        validity = False
        r = requests.get(url)
        response = json.loads(r.text)
        error_message = response["error"]["message"]

    # prep data for csv
    id_data = [validity, processed_id, error_message]

    # write the csv
    with open_csv_w('../output/cleaned_ids.csv') as f:
        writer = csv.writer(f)
        writer.writerow(id_data)
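# `custom_request` is imported from elsewhere in this project and is not shown
# in these snippets. A minimal sketch of what it is assumed to do (fetch a URL
# and return the response body on success, or None on any error) might look
# like the following; the retry count and timeout are illustrative assumptions,
# not values taken from the original helper.
import requests

def custom_request(url, retries=3, timeout=10):
    """Return the response body for `url`, or None if the request fails."""
    for _ in range(retries):
        try:
            r = requests.get(url, timeout=timeout)
            if r.status_code == 200:
                return r.text
        except requests.RequestException:
            pass
    return None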
def get_all_tweets(screen_name):
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(TWITTER_C_KEY, TWITTER_C_SECRET)
    auth.set_access_token(TWITTER_A_KEY, TWITTER_A_SECRET)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % (oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name,
                                       count=200,
                                       max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print("...%s tweets downloaded so far" % (len(alltweets)))

    # transform the tweepy tweets into a 2D array that will populate the csv | you can comment out data you don't need
    outtweets = [[
        tweet.id_str, tweet.created_at, tweet.favorite_count,
        tweet.retweet_count, tweet.retweeted, tweet.source, tweet.text
    ] for tweet in alltweets]

    # write the csv
    with open_csv_w('../output/%s_tweets.csv' % screen_name) as f:
        writer = csv.writer(f)
        writer.writerow([
            "id", "created_at", "favorites", "retweets", "retweeted",
            "source", "text"
        ])
        writer.writerows(outtweets)
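# Example call: the handle below is a placeholder, not one used by the
# original script. Running the file this way would pull one account's
# timeline and write ../output/<handle>_tweets.csv.
if __name__ == '__main__':
    get_all_tweets("placeholder_handle")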
def get_userinfo(name):
    # set user to be the screen_name
    user = api.get_user(screen_name=name)

    # create row
    userinfo = [
        user.id, user.screen_name, user.name, user.description,
        user.followers_count, user.friends_count, user.favourites_count,
        user.statuses_count, user.created_at, user.lang, user.location,
        user.protected, user.verified
    ]
    print(userinfo)

    # write the csv
    with open_csv_w('../output/userinfo.csv') as f:
        writer = csv.writer(f)
        writer.writerows([userinfo])
def scrapeFacebookPageFeedStatus(group_id, access_token):
    with open_csv_w('../output/%s_facebook_statuses.csv' % group_id) as file:
        w = csv.writer(file)
        w.writerow([
            "status_id", "status_message", "status_author", "link_name",
            "status_type", "status_link", "status_published",
            "num_reactions", "num_comments", "num_shares", "num_likes",
            "num_loves", "num_wows", "num_hahas", "num_sads", "num_angrys"
        ])

        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Facebook Page: %s\n" % (group_id, scrape_starttime))

        statuses = getFacebookPageFeedData(group_id, access_token, 100)

        while has_next_page:
            for status in statuses['data']:

                # Ensure it is a status with the expected metadata
                if 'reactions' in status:
                    w.writerow(processFacebookPageFeedStatus(status,
                                                             access_token))

                # output progress occasionally to make sure code is not
                # stalling
                num_processed += 1
                if num_processed % 100 == 0:
                    print("%s Statuses Processed: %s" %
                          (num_processed, datetime.datetime.now()))

            # if there is no next page, we're done.
            if 'paging' in statuses.keys():
                statuses = json.loads(request_until_succeed(
                    statuses['paging']['next']))
            else:
                has_next_page = False

    print("\nDone!\n%s Statuses Processed in %s" %
          (num_processed, datetime.datetime.now() - scrape_starttime))
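# `getFacebookPageFeedData`, `processFacebookPageFeedStatus`, and
# `request_until_succeed` are defined elsewhere in this project. As a rough
# sketch of the assumed behavior of `request_until_succeed` (keep retrying a
# Graph API URL until it returns HTTP 200, then hand back the response body
# as text for json.loads), it might look like this; the 5-second backoff is
# an illustrative assumption.
import time
import datetime
from urllib.request import urlopen

def request_until_succeed(url):
    """Fetch `url`, retrying every few seconds until the request succeeds."""
    while True:
        try:
            response = urlopen(url)
            if response.getcode() == 200:
                return response.read().decode('utf-8')
        except Exception as e:
            print(e)
            print("Error for URL %s: %s" % (url, datetime.datetime.now()))
        time.sleep(5)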
def get_userinfo(name):
    # set user to be the screen_name
    user = api.get_user(screen_name=name)

    # create row
    userinfo = [
        name, user.name, user.description, user.followers_count,
        user.friends_count, user.created_at, user.location
    ]
    print(userinfo)

    # write the csv
    with open_csv_w('userinfo.csv') as f:
        writer = csv.writer(f)
        writer.writerows([userinfo])
def limit_handled(cursor):
    # pause when the Twitter API rate limit is hit, then keep going
    while True:
        try:
            yield cursor.next()
        except tweepy.error.TweepError:
            print("waiting 15 minutes for Twitter to let me get more tweets")
            time.sleep(15 * 60)


# counter for console messages
counter = 0

# search terms
# find a full list of conventions here: https://dev.twitter.com/rest/public/search#query-operators
searchterm = "#MuellerReport"

# Open/Create a file to append data
csvFile = open_csv_w('../output/%s-result.csv' % searchterm)

# Use csv Writer
csvWriter = csv.writer(csvFile)

# these are the headers of your csv
csvWriter.writerow(
    ["id", "authorname", "created_at", "favorites", "retweets", "text"])

# loop to put tweets into the csv
for tweet in limit_handled(
        tweepy.Cursor(
            api.search,
            q=searchterm,
            # note that Twitter only makes available a sample of tweets from the last 7 days: https://dev.twitter.com/rest/public/search
            # point of time you want the search to start
            since="2019-01-10",
            # point of time you want the search to end
            'channelId': channelId,
            'channelTitle': channelTitle
        }
        rows.append(video_data_row)
        # csv_writer.writerow(video_data_row)
    else:
        print('no more posts!')
        has_another_page = False


if __name__ == '__main__':
    for date_string in date_strings:
        gatherAPIdata(date_string['start_date'], date_string['end_date'])

    # make a new csv into which we will write all the rows
    with open_csv_w('../output/youtube-video-search-results.csv') as csvfile:
        # these are the header names:
        fieldnames = [
            'publishedAt', 'title', 'description', 'kind', 'videoID',
            'channelId', 'channelTitle'
        ]
        # this creates your csv
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # this writes in the first row, which are the headers
        writer.writeheader()
        # this loops through your rows (the array you set at the beginning and have updated throughout)
        for row in rows:
            # this takes each row and writes it into your csv
            writer.writerow(row)
import tweepy

from utils import open_csv_w

# import authentication credentials
from secrets import TWITTER_C_KEY, TWITTER_C_SECRET, TWITTER_A_KEY, TWITTER_A_SECRET

# authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(TWITTER_C_KEY, TWITTER_C_SECRET)
auth.set_access_token(TWITTER_A_KEY, TWITTER_A_SECRET)
api = tweepy.API(auth)

# returns the 20 most recent statuses/posts of the user and the user's friends
home_timeline = api.home_timeline()
with open_csv_w('home_timeline.txt') as f:
    for tweet in home_timeline:
        print(tweet.text, file=f)

# returns the 20 most recent mentions, including retweets
mentions_timeline = api.mentions_timeline()
with open_csv_w('mentions_timeline.txt') as f:
    for tweet in mentions_timeline:
        print(tweet.text, file=f)

# returns the 20 most recent posts of the authenticating user
my_timeline = api.user_timeline()
with open_csv_w('my_timeline.txt') as f:
    for tweet in my_timeline:
        print(tweet.text, file=f)

# returns the 20 most recent posts of the given user_id
user_timeline = api.user_timeline(user_id=1153916176453513216)
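# `open_csv_w` comes from this project's utils module and is not shown in
# these snippets. A minimal sketch of what it is assumed to do (open a file
# for writing with UTF-8 encoding and newline='' so csv.writer does not emit
# blank lines on Windows) might look like this; the actual helper may differ.
def open_csv_w(path):
    """Open `path` for writing, ready to be handed to csv.writer()."""
    return open(path, 'w', encoding='utf-8', newline='')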
    if resp:
        validity = True
        data = json.loads(resp)
        error_message = None
    else:
        validity = False
        r = requests.get(url)
        response = json.loads(r.text)
        error_message = response["error"]["message"]

    # prep data for csv
    id_data = [validity, processed_id, error_message]

    # write the csv
    with open_csv_w('../output/cleaned_ids.csv') as f:
        writer = csv.writer(f)
        writer.writerow(id_data)


# run the proofer
if __name__ == '__main__':
    # set array of IDs you want to proof
    fb_ids = []

    # write the header row of the results csv
    with open_csv_w('../output/cleaned_ids.csv') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "valid", "error"])

    # iterate through all the ids
    for fb_id in fb_ids:
        proof_facebook_ids(fb_id)
            'favoriteCount': favoriteCount,
            'commentCount': commentCount,
            'topicCategories': topicCategories
        }
        rows.append(row)
    else:
        print(video_id + " is not a valid ID")


if __name__ == '__main__':
    for video_id in video_ids:
        get_video_data(video_id)

    # make a new csv into which we will write all the rows
    with open_csv_w('../output/youtube-video-information.csv') as csvfile:
        # these are the header names:
        fieldnames = [
            'youtube_id', 'publishedAt', 'channelId', 'channelTitle',
            'title', 'description', 'tags', 'viewCount', 'likeCount',
            'dislikeCount', 'favoriteCount', 'commentCount',
            'topicCategories'
        ]
        # this creates your csv
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # this writes in the first row, which are the headers
        writer.writeheader()
        # this loops through your rows (the array you set at the beginning and have updated throughout)
        for row in rows:
            # this takes each row and writes it into your csv
            writer.writerow(row)
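# `get_video_data` is defined earlier in this file and not shown here. As a
# rough sketch of the kind of request it is assumed to make, the YouTube Data
# API v3 `videos` endpoint returns the snippet, statistics, and topicDetails
# parts that the fieldnames above draw from. The API_KEY name, the helper
# name, and the error handling below are illustrative assumptions, not the
# original code.
import requests

API_KEY = 'YOUR_YOUTUBE_API_KEY'  # placeholder

def fetch_video_json(video_id):
    """Return the raw API item for one video id, or None if it is not found."""
    url = 'https://www.googleapis.com/youtube/v3/videos'
    params = {
        'part': 'snippet,statistics,topicDetails',
        'id': video_id,
        'key': API_KEY,
    }
    resp = requests.get(url, params=params).json()
    # an empty `items` list means the id was not a valid video
    return resp['items'][0] if resp.get('items') else None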
            'viewCount': viewCount,
            'subscriberCount': subscriberCount,
            'videoCount': videoCount,
            'commentCount': commentCount
        }
        rows.append(row)
    else:
        print(channel_id + " is not a valid ID")


if __name__ == '__main__':
    for channel_id in channel_ids:
        get_channel_data(channel_id)

    # make a new csv into which we will write all the rows
    with open_csv_w('../output/youtube-channel-information.csv') as csvfile:
        # these are the header names:
        fieldnames = [
            'youtube_id', 'publishedAt', 'title', 'description',
            'viewCount', 'subscriberCount', 'videoCount', 'commentCount'
        ]
        # this creates your csv
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # this writes in the first row, which are the headers
        writer.writeheader()
        # this loops through your rows (the array you set at the beginning and have updated throughout)
        for row in rows:
            # this takes each row and writes it into your csv
            writer.writerow(row)
def limit_handled(cursor):
    # pause when the Twitter API rate limit is hit, then keep going
    while True:
        try:
            yield cursor.next()
        except tweepy.error.TweepError:
            print("waiting 15 minutes for Twitter to let me get more tweets")
            time.sleep(15 * 60)


# counter for console messages
counter = 0

# search terms
# find a full list of conventions here: https://dev.twitter.com/rest/public/search#query-operators
searchterm = "\"Ben Smith\""

# Open/Create a file to append data
csvFile = open_csv_w('%s-result.csv' % searchterm)

# Use csv Writer
csvWriter = csv.writer(csvFile)

# these are the headers of your csv
csvWriter.writerow(
    ["id", "authorname", "created_at", "favorites", "retweets", "text"])

# loop to put tweets into the csv
for tweet in limit_handled(
        tweepy.Cursor(
            api.search,
            q=searchterm,
            # note that Twitter only makes available a sample of tweets from the last 7 days: https://dev.twitter.com/rest/public/search
            # point of time you want the search to start
            since="2017-01-10",
            # point of time you want the search to end
def scrapeFacebookPageFeedComments(page_id, access_token):
    with open_csv_w('../output/%s_facebook_comments.csv' % page_id) as file:
        w = csv.writer(file)
        w.writerow([
            "comment_id", "status_id", "parent_id", "comment_message",
            "comment_author", "comment_published", "comment_likes"
        ])

        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Comments From Posts: %s\n" %
              (page_id, scrape_starttime))

        with open('../output/%s_facebook_statuses.csv' % page_id, 'r',
                  encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)

            for status in reader:
                has_next_page = True

                comments = getFacebookCommentFeedData(status['status_id'],
                                                      access_token, 100)

                while has_next_page and comments is not None:
                    for comment in comments['data']:
                        w.writerow(
                            processFacebookComment(comment,
                                                   status['status_id']))

                        # also collect replies to this comment, if any
                        if 'comments' in comment:
                            has_next_subpage = True

                            subcomments = getFacebookCommentFeedData(
                                comment['id'], access_token, 100)

                            while has_next_subpage:
                                for subcomment in subcomments['data']:
                                    w.writerow(
                                        processFacebookComment(
                                            subcomment,
                                            status['status_id'],
                                            comment['id']))

                                    num_processed += 1
                                    if num_processed % 1000 == 0:
                                        print("%s Comments Processed: %s" %
                                              (num_processed,
                                               datetime.datetime.now()))

                                if 'paging' in subcomments:
                                    if 'next' in subcomments['paging']:
                                        subcomments = json.loads(
                                            request_until_succeed(
                                                subcomments['paging']['next']))
                                    else:
                                        has_next_subpage = False
                                else:
                                    has_next_subpage = False

                        # output progress occasionally to make sure code is not
                        # stalling
                        num_processed += 1
                        if num_processed % 1000 == 0:
                            print("%s Comments Processed: %s" %
                                  (num_processed, datetime.datetime.now()))

                    if 'paging' in comments:
                        if 'next' in comments['paging']:
                            comments = json.loads(
                                request_until_succeed(
                                    comments['paging']['next']))
                        else:
                            has_next_page = False
                    else:
                        has_next_page = False

    print("\nDone!\n%s Comments Processed in %s" %
          (num_processed, datetime.datetime.now() - scrape_starttime))
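# `processFacebookComment` and `getFacebookCommentFeedData` are defined
# elsewhere in this project. A rough sketch of what `processFacebookComment`
# is assumed to return (one list matching the header row written above) could
# look like this; the exact field handling in the original may differ.
def processFacebookComment(comment, status_id, parent_id=''):
    """Flatten one Graph API comment object into a csv row."""
    return [
        comment['id'],                            # comment_id
        status_id,                                # status_id
        parent_id,                                # parent_id
        comment.get('message', ''),               # comment_message
        comment.get('from', {}).get('name', ''),  # comment_author
        comment.get('created_time', ''),          # comment_published
        comment.get('like_count', 0),             # comment_likes
    ]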
def get_all_tweets(screen_name):
    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(TWITTER_C_KEY, TWITTER_C_SECRET)
    auth.set_access_token(TWITTER_A_KEY, TWITTER_A_SECRET)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % (oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name,
                                       count=200,
                                       max_id=oldest)

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print("...%s tweets downloaded so far" % (len(alltweets)))

    # transform the tweepy tweets into a 2D array that will populate the csv | you can comment out data you don't need
    outtweets = [[
        tweet.id_str, tweet.created_at, tweet.favorite_count,
        tweet.retweet_count, tweet.retweeted, tweet.source, tweet.text,
        tweet.geo, tweet.lang, tweet.is_quote_status, tweet.user.name,
        tweet.user.screen_name, tweet.user.location, tweet.user.description,
        tweet.user.protected, tweet.user.followers_count,
        tweet.user.friends_count, tweet.user.listed_count,
        tweet.user.created_at, tweet.user.favourites_count,
        tweet.user.utc_offset, tweet.user.time_zone, tweet.user.geo_enabled,
        tweet.user.verified, tweet.user.statuses_count, tweet.user.lang
    ] for tweet in alltweets]

    # write the csv
    with open_csv_w('%s_tweets.csv' % screen_name) as f:
        writer = csv.writer(f)
        writer.writerow([
            "id", "created_at", "favorites", "retweets", "retweeted",
            "source", "text", "geolocation", "language", "is_quote_status",
            "username", "user_screen_name", "user_location",
            "user_description", "user_protected", "user_followers_count",
            "user_friends_count", "user_listed_count", "user_created_at",
            "user_favourites_count", "user_utc_offset", "user_time_zone",
            "user_geo_enabled", "user_verified", "user_statuses_count",
            "user_lang"
        ])
        writer.writerows(outtweets)