コード例 #1
0
    def paginate_user_timeline(self, username, earliest_date):
        """Page through a user's timeline and write the tweets to a CSV file.

        Requests ``statuses/user_timeline`` in pages of 200 tweets and hands
        every successful response to ``WriteTwitter.write_tweets`` until that
        call reports the last page. Output goes to
        ``data/chapters/tweets/<username>_tweets.csv``.

        Args:
            username: Screen name of the account; also used in the output
                file name.
            earliest_date: Passed unchanged to ``write_tweets`` (presumably
                the cut-off date that ends the pagination -- confirm against
                WriteTwitter).
        """
        endpoint = 'timeline'
        self.__update_current_app(endpoint, 16)

        header = "user_id,tweet_id,created_at,text,coordinates,lang,retweet_count,favorite_count,in_reply_to_tweet_id,in_reply_to_user_id,place"
        w = wt.WriteTwitter('data/chapters/tweets/' + username + '_tweets.csv', header)

        # NOTE(review): the 'trim_user' value looks masked/redacted in this
        # copy of the file -- confirm the intended value.
        args = {'screen_name': username,
                'count': 200,
                'trim_user': '******'}

        next_max_id = -999  # placeholder, only sent once a page has been written
        pages_written = 0
        is_last = False
        try:
            # NOTE(review): a status that never becomes 200 keeps this loop
            # running forever -- consider bailing out on permanent errors.
            while not is_last:
                # Only successful pages are counted, so a failed first request
                # is retried without sending the placeholder max_id.
                if pages_written > 0:
                    args['max_id'] = next_max_id

                r = self.app[self.current_app_index].request('statuses/user_timeline', args)
                # Use the public header mapping rather than its private `_store`.
                self.status[self.current_app_index][endpoint] = int(r.headers['x-rate-limit-remaining'])

                if r.status_code != 200:
                    print(r.status_code)
                    # Ask for the pages still missing, never less than 1 (the
                    # smallest amount this method requests below).
                    self.__update_current_app(endpoint, max(16 - pages_written, 1))
                else:
                    next_max_id, is_last = w.write_tweets(r, earliest_date)
                    pages_written += 1

                if self.status[self.current_app_index][endpoint] == 0:
                    self.__update_current_app(endpoint, 1)
        finally:
            # Close the writer even when a request or write raises.
            w.close()
コード例 #2
0
    def paginate_search(self, max_id, hashtags):
        """Search tweets for any of the given hashtags and write them to CSV.

        Builds an OR query over ``hashtags`` (each prefixed with a URL-encoded
        '#'), then keeps requesting ``search/tweets`` while the current app
        has more than one call left for the 'search' endpoint. Results are
        written to ``data/hashtags/<max_id>_tweets.csv``, named after the
        ``max_id`` passed in.

        Args:
            max_id: Tweet id sent as ``max_id`` with the first request; it is
                replaced by the value ``write_tweets`` returns after each
                successful page.
            hashtags: Iterable of hashtag strings without the leading '#'.

        Returns:
            The ``max_id`` value in effect when the loop stopped.
        """
        endpoint = 'search'
        self.__update_current_app(endpoint, 300)

        columns = "user_id,tweet_id,created_at,text,coordinates,lang,retweet_count,favorite_count,in_reply_to_tweet_id,in_reply_to_user_id,place"
        writer = wt.WriteTwitter('data/hashtags/' + str(max_id) + '_tweets.csv', columns)

        # "%23" is '#', "%20" is a space: "#a OR #b OR ...".
        query = "%23" + "%20OR%20%23".join(hashtags)
        params = {'q': query,
                  'count': 100,
                  'max_id': max_id,
                  'include_entities': 'false'}
        earliest_date = time.strptime('2000-01-01', '%Y-%m-%d')

        while self.status[self.current_app_index][endpoint] > 1:
            params['max_id'] = max_id

            response = self.app[self.current_app_index].request('search/tweets', params)
            remaining = response.headers._store['x-rate-limit-remaining'][1]
            self.status[self.current_app_index][endpoint] = int(remaining)

            if response.status_code == 200:
                # The second return value is not used here.
                max_id, _ = writer.write_tweets(response, earliest_date)
            else:
                print('WARNING: status not 200!!!')
                self.__update_current_app(endpoint, 300)

        writer.close()
        return max_id
コード例 #3
0
    def paginate_anon_users(self, anon_map):
        """Hydrate the users listed in an id-mapping file, 100 ids per call.

        Each non-blank line of ``anon_map`` is read as
        ``<numeric_user_id>,<anon_id>``. The numeric ids are sent in batches
        of up to 100 to ``self.ta.get_hydrated_anon_users`` together with the
        writer and the id mapping; every batch is retried until that call
        returns a truthy value. Output goes to
        ``data/anon_users_hashtag_tweets.csv``.

        Args:
            anon_map: Path of the mapping file.
        """
        header = 'anon_id,created_at,description,lang,location,time_zone,utc_offset,statuses_count,favourites_count,followers_count,friends_count,listed_count,contributors_enabled,protected,verified'

        with open(anon_map, 'r') as id_file:
            all_ids = id_file.read().splitlines()

        # numeric user id -> anon id
        id_to_anon = {}
        for row in all_ids:
            if not row.strip():
                # Tolerate blank lines instead of failing on long('').
                continue
            tokens = row.split(',')
            id_to_anon[long(tokens[0])] = int(tokens[1])

        int_all_ids = list(id_to_anon)
        # Count the de-duplicated ids that are actually sliced below; using
        # the raw line count produced empty batches whenever the file held
        # duplicate ids.
        n_ids = len(int_all_ids)

        file_name = 'data/anon_users_hashtag_tweets.csv'
        w = wt.WriteTwitter(file_name, header, datetime.date.today().strftime('%Y-%m-%d'))
        try:
            for start in range(0, n_ids, 100):
                user_ids = ','.join(str(x) for x in int_all_ids[start:start + 100])

                # NOTE(review): retries without limit until the call succeeds.
                success = False
                while not success:
                    success = self.ta.get_hydrated_anon_users(user_ids, w, id_to_anon)
        finally:
            # Close the writer even when a batch raises.
            w.close()
コード例 #4
0
    def get_followers_of_followers(self):
        """Crawl followers for every entry of each pending crawl list.

        Processes the files ``crawlLists/in_list_NNNN`` under the current
        working directory in sorted order. Each line of a file is passed to
        ``self.ta.paginate_followers``; successes and failures are counted
        and logged through the writer, and the input file is renamed with a
        ``_done`` suffix once it has been processed.
        """
        pattern = os.path.join(os.getcwd(), 'crawlLists', 'in_list_[0-9][0-9][0-9][0-9]')
        input_files = sorted(glob.glob(pattern))
        date_string = datetime.date.today().strftime('%Y-%m-%d')
        for this_file in input_files:
            n_successful = 0
            n_failed = 0
            # NOTE(review): the pattern matches four digits but only the last
            # three are used for the output name -- confirm this is intended.
            file_number = this_file[-3:]

            w = wt.WriteTwitter('data/followersOfFollowers/out_list_' + file_number + '.csv', '', date_string)
            try:
                with open(this_file, 'r') as file_reader:
                    all_lines = file_reader.read().splitlines()

                for line in all_lines:
                    try:
                        success = self.ta.paginate_followers(line, True, True, w, file_number)
                    except Exception:
                        # Best effort per entry: record the failure and go on.
                        # KeyboardInterrupt/SystemExit are no longer swallowed.
                        success = False
                        w.write_to_errlog(file_number, line, 'UNKNOWN')

                    if success:
                        n_successful += 1
                    else:
                        n_failed += 1
                w.write_to_log(file_number, n_successful, n_failed)
            finally:
                # Close the writer even when reading or logging raises.
                w.close()

            # Rename without a shell so paths containing spaces or shell
            # metacharacters are handled; a failed rename now raises OSError.
            os.rename(this_file, this_file + '_done')
コード例 #5
0
    def __paginate_hydrated_users(self, in_list, has_header):
        """Hydrate the user ids listed in a file, 100 ids per call.

        Reads one id per line from ``in_list``, sends them in comma-joined
        batches of up to 100 to ``self.ta.get_hydrated_users`` (retrying each
        batch until the call returns a truthy value), writes to
        ``data/users/hydrated_out_list_<NN>.csv`` where ``NN`` is the last
        two characters of ``in_list``, and finally renames the input file
        with a ``_done`` suffix.

        Args:
            in_list: Path of the input id list.
            has_header: When true, the first line of the file is skipped.
        """
        file_number = in_list[-2:]

        header = 'user_id,name,screen_name,created_at,description,lang,location,url,time_zone,utc_offset,statuses_count,favourites_count,followers_count,friends_count,listed_count,contributors_enabled,protected,verified'
        with open(in_list, 'r') as id_file:
            if has_header:
                id_file.readline()
            # Drop blank lines so no empty id ends up in the joined request.
            all_ids = [line for line in id_file.read().splitlines() if line.strip()]

        file_name = 'data/users/hydrated_out_list_' + file_number + '.csv'
        w = wt.WriteTwitter(file_name, header, datetime.date.today().strftime('%Y-%m-%d'))
        try:
            for start in range(0, len(all_ids), 100):
                user_ids = ",".join(all_ids[start:start + 100])

                # NOTE(review): retries without limit until the call succeeds.
                success = False
                while not success:
                    success = self.ta.get_hydrated_users(user_ids, w)
        finally:
            # Close the writer even when a batch raises.
            w.close()

        # Rename without a shell so paths containing spaces or shell
        # metacharacters are handled; a failed rename now raises OSError.
        os.rename(in_list, in_list + '_done')
コード例 #6
0
    def paginate_retweeters(self, tweet_id, retweet_count):
        """Page through the retweeters of a tweet and write them to a file.

        Follows the cursor returned by ``WriteTwitter.write_retweeters`` until
        it becomes 0 or the current app has no 'retweeters' calls left.
        Output goes to ``data/retweets/<tweet_id>_tweets.csv``.

        The previous version referenced the undefined names ``max_id``,
        ``header`` and ``r`` (the request line was commented out) and could
        not run.

        Args:
            tweet_id: Id of the tweet whose retweeters are fetched.
            retweet_count: Number of retweets; used to estimate how many
                calls of 100 results each are needed.
        """
        endpoint = 'retweeters'
        # Divide by 100.0: under Python 2 integer division floors before ceil.
        self.__update_current_app(endpoint, int(math.ceil(retweet_count / 100.0)))

        # NOTE(review): empty header chosen to match the follower-id writers
        # in this file -- confirm what write_retweeters expects.
        header = ''
        w = wt.WriteTwitter('data/retweets/' + str(tweet_id) + '_tweets.csv', header)

        next_cursor = -1
        args = {'id': tweet_id,
                'cursor': next_cursor,
                'stringify_ids': 'true'}

        try:
            while self.status[self.current_app_index][endpoint] > 0 and next_cursor != 0:
                args['cursor'] = next_cursor

                # NOTE(review): endpoint inferred from the id/cursor/
                # stringify_ids parameters -- confirm.
                r = self.app[self.current_app_index].request('statuses/retweeters/ids', args)
                self.status[self.current_app_index][endpoint] -= 1

                if r.status_code != 200:
                    print('WARNING: status not 200!!!')
                else:
                    next_cursor = w.write_retweeters(r)
        finally:
            # Close the writer even when a request or write raises.
            w.close()
コード例 #7
0
 def get_chapter_followers(self):
     """Write the followers of every chapter handle to its own CSV file.

     For each handle returned by ``get_all_chapter_handles()`` a writer for
     ``data/followers/<handle>_followers.csv`` is opened with an empty
     header, handed to ``self.ta.paginate_followers`` and then closed.
     """
     for handle in get_all_chapter_handles():
         out_path = 'data/followers/' + handle + '_followers.csv'
         writer = wt.WriteTwitter(out_path, '')
         self.ta.paginate_followers(handle, False, False, writer, '')
         writer.close()