def paginate_user_timeline(self, username, earliest_date):
    """Page backwards through a user's timeline and write the tweets to CSV.

    Keeps requesting older pages (via ``max_id``) until
    ``WriteTwitter.write_tweets`` reports the last page has been reached,
    rotating to another app when the rate limit is exhausted.

    :param username: Twitter screen name whose timeline is crawled.
    :param earliest_date: cutoff forwarded to ``write_tweets``.
    """
    endpoint = 'timeline'
    # 200 tweets/request * 16 requests covers the API's ~3200-tweet
    # timeline history limit.
    self.__update_current_app(endpoint, 16)
    header = "user_id,tweet_id,created_at,text,coordinates,lang,retweet_count,favorite_count,in_reply_to_tweet_id,in_reply_to_user_id,place"
    w = wt.WriteTwitter('data/chapters/tweets/' + username + '_tweets.csv', header)
    # NOTE(review): 'trim_user' looks like a redacted placeholder; the API
    # expects 'true'/'false' — confirm the intended value.
    args = {'screen_name': username, 'count': 200, 'trim_user': '******'}
    next_max_id = -999
    counter = 0
    is_last = False
    while not is_last:
        if counter > 0:
            # Continue paging below the lowest tweet id seen so far.
            args['max_id'] = next_max_id
        r = self.app[self.current_app_index].request('statuses/user_timeline', args)
        # Read the rate-limit header through the public, case-insensitive
        # headers mapping instead of the private _store attribute.
        self.status[self.current_app_index][endpoint] = int(r.headers['x-rate-limit-remaining'])
        if r.status_code != 200:
            print(r.status_code)
            # Failed page: switch app, reserving quota for the pages left.
            self.__update_current_app(endpoint, 16 - counter)
        else:
            next_max_id, is_last = w.write_tweets(r, earliest_date)
            counter += 1
        if self.status[self.current_app_index][endpoint] == 0:
            # Current app has no quota left for even one more request.
            self.__update_current_app(endpoint, 1)
    w.close()
def paginate_search(self, max_id, hashtags):
    """Page backwards through search results for a set of hashtags.

    Builds a percent-encoded "#a OR #b OR ..." query and keeps lowering
    ``max_id`` while the current app still has rate-limit budget.

    :param max_id: tweet id to start paging below (also names the output file).
    :param hashtags: iterable of hashtag strings WITHOUT the leading '#'.
    :returns: the lowest ``max_id`` reached, so a later call can resume.
    """
    endpoint = 'search'
    self.__update_current_app(endpoint, 300)
    header = "user_id,tweet_id,created_at,text,coordinates,lang,retweet_count,favorite_count,in_reply_to_tweet_id,in_reply_to_user_id,place"
    w = wt.WriteTwitter('data/hashtags/' + str(max_id) + '_tweets.csv', header)
    # Pre-encoded query: %23 = '#', %20 = ' ' -> "#a OR #b OR ...".
    html_hashtags = "%20OR%20%23".join(hashtags)
    html_hashtags = "%23" + html_hashtags
    args = {'q': html_hashtags, 'count': 100, 'max_id': max_id, 'include_entities': 'false'}
    # Far-past cutoff so write_tweets never stops on the date condition.
    earliest_date = time.strptime('2000-01-01', '%Y-%m-%d')
    while self.status[self.current_app_index][endpoint] > 1:
        args['max_id'] = max_id
        r = self.app[self.current_app_index].request('search/tweets', args)
        # Public headers mapping instead of the private _store attribute.
        self.status[self.current_app_index][endpoint] = int(r.headers['x-rate-limit-remaining'])
        if r.status_code != 200:
            print('WARNING: status not 200!!!')
            self.__update_current_app(endpoint, 300)
        else:
            # The is_last flag is unused here; paging stops on quota instead.
            max_id, _ = w.write_tweets(r, earliest_date)
    w.close()
    return max_id
def paginate_anon_users(self, anon_map):
    """Hydrate anonymised users in batches of 100 and write them to CSV.

    :param anon_map: path to a CSV of "numeric_twitter_id,anon_id" rows
        (no header), mapping real ids to their anonymised ids.
    """
    header = 'anon_id,created_at,description,lang,location,time_zone,utc_offset,statuses_count,favourites_count,followers_count,friends_count,listed_count,contributors_enabled,protected,verified'
    with open(anon_map, 'r') as id_file:
        all_ids = id_file.read().splitlines()
    n_ids = len(all_ids)
    # Real Twitter id -> anonymised id. int() handles arbitrarily large
    # ids, so the Python-2-only long() is unnecessary; also avoid
    # shadowing the builtin name 'map'.
    id_map = dict()
    for row in all_ids:
        tokens = row.split(',')
        id_map[int(tokens[0])] = int(tokens[1])
    # Materialise the keys as a list: dict.keys() is a non-sliceable view
    # on Python 3, and the batching below slices it.
    int_all_ids = list(id_map.keys())
    new_counter = 0
    file_name = 'data/anon_users_hashtag_tweets.csv'
    w = wt.WriteTwitter(file_name, header, datetime.date.today().strftime('%Y-%m-%d'))
    while new_counter < n_ids:
        old_counter = new_counter
        # The users/lookup endpoint accepts at most 100 ids per request.
        new_counter = min(new_counter + 100, n_ids)
        user_ids = ','.join(str(x) for x in int_all_ids[old_counter:new_counter])
        # Retry the network-backed hydration call until it succeeds.
        success = False
        while not success:
            success = self.ta.get_hydrated_anon_users(user_ids, w, id_map)
    w.close()
def get_followers_of_followers(self):
    """Crawl followers for every user listed in the pending crawl lists.

    Processes each ``crawlLists/in_list_NNNN`` file in order, writing one
    output CSV per list plus success/failure log entries, then renames the
    input file with a ``_done`` suffix so reruns skip it.
    """
    input_files = glob.glob(os.getcwd() + '/crawlLists/in_list_[0-9][0-9][0-9][0-9]')
    input_files.sort()
    date_string = datetime.date.today().strftime('%Y-%m-%d')
    for this_file in input_files:
        n_successful = 0
        n_failed = 0
        # The glob above matches FOUR trailing digits; slicing only the
        # last three (as the original did) would collide e.g. in_list_0123
        # with in_list_1123.
        file_number = this_file[-4:]
        w = wt.WriteTwitter('data/followersOfFollowers/out_list_' + file_number + '.csv', '', date_string)
        with open(this_file, 'r') as file_reader:
            # file.read(file_reader) was a Python-2-only unbound-method
            # call; read through the file object directly.
            all_lines = file_reader.read().splitlines()
        for line in all_lines:
            try:
                success = self.ta.paginate_followers(line, True, True, w, file_number)
            except Exception:
                # Best-effort crawl: record the failure and keep going.
                success = False
                w.write_to_errlog(file_number, line, 'UNKNOWN')
            if success:
                n_successful += 1
            else:
                n_failed += 1
        w.write_to_log(file_number, n_successful, n_failed)
        w.close()
        # Same-directory rename: no need to shell out to 'mv', and this
        # avoids quoting issues in the shell command string.
        os.rename(this_file, this_file + '_done')
def __paginate_hydrated_users(self, in_list, has_header):
    """Hydrate the user ids listed in *in_list* in batches of 100.

    Writes the hydrated users to ``data/users/hydrated_out_list_NN.csv``
    and renames the input file with a ``_done`` suffix when finished.

    :param in_list: path to a file of one user id per line; its last two
        characters are used as the output file number.
    :param has_header: when True, the first line of the file is skipped.
    """
    file_number = in_list[-2:]
    header = 'user_id,name,screen_name,created_at,description,lang,location,url,time_zone,utc_offset,statuses_count,favourites_count,followers_count,friends_count,listed_count,contributors_enabled,protected,verified'
    with open(in_list, 'r') as id_file:
        if has_header:
            # Discard the column-header row.
            id_file.readline()
        all_ids = id_file.read().splitlines()
    n_ids = len(all_ids)
    new_counter = 0
    file_name = 'data/users/hydrated_out_list_' + file_number + '.csv'
    w = wt.WriteTwitter(file_name, header, datetime.date.today().strftime('%Y-%m-%d'))
    while new_counter < n_ids:
        old_counter = new_counter
        # The users/lookup endpoint accepts at most 100 ids per request.
        new_counter = min(new_counter + 100, n_ids)
        user_ids = ",".join(all_ids[old_counter:new_counter])
        # Retry the network-backed hydration call until it succeeds.
        success = False
        while not success:
            success = self.ta.get_hydrated_users(user_ids, w)
    w.close()
    # Same-directory rename: safer and cheaper than shelling out to 'mv'.
    os.rename(in_list, in_list + '_done')
def paginate_retweeters(self, tweet_id, retweet_count):
    """Collect the ids of users who retweeted *tweet_id*, one page per cursor.

    NOTE(review): the original body was non-functional — it referenced the
    undefined names ``max_id``, ``header`` and ``r`` (the request lines were
    commented out). This reconstruction follows the cursor pattern implied
    by the args dict and the other paginate_* methods; confirm the endpoint
    name against the live API wrapper.

    :param tweet_id: id of the tweet whose retweeters are fetched.
    :param retweet_count: expected number of retweeters, used to budget
        requests (100 ids per page).
    """
    endpoint = 'retweeters'
    # Py2 integer division would truncate retweet_count / 100 BEFORE
    # ceil(), making ceil a no-op; integer ceiling avoids that.
    self.__update_current_app(endpoint, (retweet_count + 99) // 100)
    # Retweeter output is a bare id list, so no CSV header is needed
    # (same convention as the followers writers).
    w = wt.WriteTwitter('data/retweets/' + str(tweet_id) + '_tweets.csv', '')
    next_cursor = -1
    args = {'id': tweet_id, 'cursor': next_cursor, 'stringify_ids': 'true'}
    # cursor == 0 signals the API has no further pages.
    while self.status[self.current_app_index][endpoint] > 0 and next_cursor != 0:
        args['cursor'] = next_cursor
        r = self.app[self.current_app_index].request('statuses/retweeters/ids', args)
        self.status[self.current_app_index][endpoint] -= 1
        if r.status_code != 200:
            print('WARNING: status not 200!!!')
        else:
            next_cursor = w.write_retweeters(r)
    w.close()
def get_chapter_followers(self):
    """Download the follower list of every chapter account to its own CSV."""
    for chapter in get_all_chapter_handles():
        writer = wt.WriteTwitter('data/followers/' + chapter + '_followers.csv', '')
        self.ta.paginate_followers(chapter, False, False, writer, '')
        writer.close()