コード例 #1
0
 def add_users_by_screen_names(self, screen_names):
     """Resolve screen names to Twitter user ids and record them in self['users'].

     Requires self['keys'] (Twitter API credentials). Leading '@' is stripped
     from each name and matching is case-insensitive.

     Returns the original-case screen names that could NOT be resolved.
     Raises CollectionConfigException when no API keys are configured.
     """
     if 'keys' not in self:
         raise CollectionConfigException(
             'Keys are required to add users by screen name.')
     keys = self['keys']
     twarc = Twarc(keys['consumer_key'], keys['consumer_secret'],
                   keys['access_token'], keys['access_token_secret'])
     # Lower case to original case
     screen_name_case_map = {}
     for screen_name in screen_names:
         clean_screen_name = screen_name.lstrip('@')
         if clean_screen_name:
             screen_name_case_map[
                 clean_screen_name.lower()] = clean_screen_name
     if 'users' not in self:
         self['users'] = {}
     # Collect resolved names and remove them after the loop: user_lookup may
     # still be iterating over the dict's keys while yielding results, so the
     # dict must not be mutated inside this loop.
     delete_users = []
     for user in twarc.user_lookup(screen_name_case_map.keys(),
                                   id_type='screen_name'):
         if user['id_str'] not in self['users']:
             self['users'][user['id_str']] = {
                 'screen_name': user['screen_name']
             }
         delete_users.append(user['screen_name'].lower())
     for screen_name in delete_users:
         # pop() with a default instead of del: Twitter may report a screen
         # name that differs from any requested one (e.g. a renamed account),
         # in which case del would raise KeyError.
         screen_name_case_map.pop(screen_name, None)
     return screen_name_case_map.values()
コード例 #2
0
def overhear_conversation(graph=Graph("bolt://*****:*****@ssw0rd"),
                          USERS_PATH=Path('data/users/')):
    """Collect Twitter stats for users listed in CSVs under USERS_PATH and
    attach them to Rep nodes in the Neo4j graph.

    NOTE(review): the ``graph`` default is evaluated once at import time and
    opens a database connection then; a ``None`` sentinel with lazy creation
    would be safer — confirm before changing the signature.
    """
    t = Twarc(credentials.CONSUMER_KEY, credentials.CONSUMER_SECRET,
              credentials.ACCESS_TOKEN, credentials.ACCESS_TOKEN_SECRET)
    user_ids = []
    # loop through csvs of usernames and grab them (user id is column 1)
    for doc in (USERS_PATH).glob('*.*'):
        with open(doc) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader)  # skip the header row
            for row in csv_reader:
                user_ids.append(row[1])

    # use twarc to call those usernames and grab their stats; open the output
    # file once instead of re-opening it in append mode for every user
    with open(USERS_PATH / 'user_stats.csv', 'a+') as csv_file:
        writer = csv.writer(csv_file, delimiter='|')
        for user in t.user_lookup(user_ids):
            writer.writerow([
                user['name'], user['screen_name'], user['followers_count'],
                user['statuses_count']
            ])

    # loop through the stats and store them in the graph
    with open(USERS_PATH / 'user_stats.csv', 'r') as file:
        csv_reader = csv.reader(file, delimiter='|')
        for row in csv_reader:
            u_param = {
                'surname': last_name(row[0]),
                'user_name': row[1],
                'follower_count': row[2],
                'statuses_count': row[3]
            }
            print(u_param)
            u_query = '''
	        MATCH (r:Rep {surname: $surname})
	        SET r.user_name=$user_name, r.followers=$follower_count, r.statuses=$statuses_count
	        '''
            graph.run(u_query, u_param)
コード例 #3
0
class TwitterHarvester(BaseHarvester):
    """Social Feed Manager harvester for Twitter.

    Dispatches on the harvest message "type" to the Twitter search, filter,
    sample, or user-timeline APIs (via twarc), then post-processes the
    resulting WARC files to count tweets and extract URLs (media, web
    resources, profile images) according to the harvest options.
    """

    def __init__(self,
                 working_path,
                 stream_restart_interval_secs=30 * 60,
                 mq_config=None,
                 debug=False,
                 connection_errors=5,
                 http_errors=5,
                 debug_warcprox=False,
                 tries=3):
        BaseHarvester.__init__(
            self,
            working_path,
            mq_config=mq_config,
            stream_restart_interval_secs=stream_restart_interval_secs,
            debug=debug,
            debug_warcprox=debug_warcprox,
            tries=tries)
        # Created per-harvest in _create_twarc() from message credentials.
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extract options; overwritten per-message in harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Read options from the harvest message and dispatch on its type."""
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options",
                                              {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get(
            "web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get(
            "user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            # Include the offending type so the failure is diagnosable.
            raise KeyError(harvest_type)

    def _create_twarc(self):
        """Create the twarc client from the credentials in the message."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest tweets matching the single seed's search query."""
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        # Incremental harvests resume from the last recorded since_id.
        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest tweets from the streaming filter API for the single seed."""
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")

        self._harvest_tweets(
            self.twarc.filter(track=track,
                              follow=follow,
                              locations=locations,
                              event=self.stop_harvest_seeds_event))

    def sample(self):
        """Harvest tweets from the streaming sample API."""
        self._harvest_tweets(self.twarc.sample(self.stop_harvest_seeds_event))

    def user_timeline(self):
        """Harvest each seed user's timeline, resolving ids <-> screen names
        and recording warnings for missing/suspended accounts."""
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug(
                "Processing seed (%s) with screen name %s and user id %s",
                seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {} because account is not found or suspended".format(
                        screen_name)
                    # NOTE(review): log.exception outside an except block logs
                    # a spurious "NoneType: None" traceback; log.warning may
                    # be the intent — confirm before changing.
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # if can't find the screen_name, ignore get timeline
                if not new_screen_name:
                    msg = "Screen name not found for user id {} because account is not found or suspended".format(
                        user_id)
                    log.exception(msg)
                    self.result.warnings.append(
                        Msg(CODE_TOKEN_NOT_FOUND, msg, seed_id=seed_id))
                    # reset the user_id, ignore the get timeline
                    user_id = None
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(
                        __name__, "timeline.{}.since_id".format(
                            user_id)) if incremental else None

                    self._harvest_tweets(
                        self.twarc.timeline(user_id=user_id,
                                            since_id=since_id))

                except HTTPError as e:
                    if e.response.status_code == 401:
                        account = "user {} (User ID: {})".format(
                            screen_name, user_id
                        ) if screen_name else "user ID: {}".format(user_id)
                        msg = "Unauthorized for {} because account is suspended or protected".format(
                            account)
                        log.exception(msg)
                        self.result.warnings.append(
                            Msg(CODE_TOKEN_UNAUTHORIZED, msg, seed_id=seed_id))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        Returns None when the account does not exist (HTTP 404).
        """
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        Returns None when the account does not exist (HTTP 404).
        """
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name, )))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _harvest_tweets(self, tweets):
        """Consume a tweet iterator, counting tweets until told to stop."""
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Extract web-resource and media URLs from a tweet entities dict."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(
                        url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Dispatch WARC post-processing on the harvest message type."""
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            # Include the offending type so the failure is diagnosable.
            raise KeyError(harvest_type)

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the incremental since_id."""
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(
            __name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store. max_tweet_id is None when the WARC held no
        # tweets and since_id is None on a first run; on Python 3 None does
        # not order against an int, so guard both explicitly.
        if incremental and max_tweet_id is not None and (
                since_id is None or max_tweet_id > since_id):
            self.state_store.set_state(__name__, u"{}.since_id".format(query),
                                       max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a user-timeline WARC, updating per-user since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state. get_state() may return None on the first
                    # run and None does not order against an int on Python 3,
                    # so compute the maximum with explicit None handling.
                    key = "timeline.{}.since_id".format(user_id)
                    current = self.state_store.get_state(__name__, key)
                    tweet_id = tweet.get("id")
                    if current is None:
                        new_since_id = tweet_id
                    elif tweet_id is None:
                        new_since_id = current
                    else:
                        new_since_id = max(current, tweet_id)
                    self.state_store.set_state(__name__, key, new_since_id)
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process each tweet in a WARC; return the largest tweet id seen
        (None when the WARC contained no tweets)."""
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                tweet_id = tweet.get("id")
                # Track the running maximum explicitly: max(None, id) raises
                # TypeError on Python 3.
                if tweet_id is not None and (max_tweet_id is None
                                             or tweet_id > max_tweet_id):
                    max_tweet_id = tweet_id
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count one tweet and extract URLs from it (and any retweeted or
        quoted status it carries)."""
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(
                tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])
コード例 #4
0
                                            'user_created_at', 'verified', 'protected'])
        writer.writerows(user_map.values())


def clean_cell(cell):
    """Normalize a spreadsheet cell: blanks and '-' become None, everything
    else is returned stripped of surrounding whitespace."""
    if not cell or cell == '-':
        return None
    return cell.strip()


if __name__ == '__main__':
    # Index gallery rows by screen name.
    user_map = {}
    for user in csv_iter(['Senate_Press_Galleries.csv', 'Senate_Periodical_Galleries.csv', 'Radio_and_Television.csv']):
        user_map[user['screen_name']] = user

    # Merge the live Twitter profile fields into each matched row.
    field_pairs = [
        ('user_id', 'id_str'),
        ('followers_count', 'followers_count'),
        ('following_count', 'friends_count'),
        ('tweet_count', 'statuses_count'),
        ('user_created_at', 'created_at'),
        ('verified', 'verified'),
        ('protected', 'protected'),
    ]
    for user_json in t.user_lookup(screen_names=user_map.keys()):
        matched = user_map.get(user_json['screen_name'].lower())
        if matched:
            for dest_key, src_key in field_pairs:
                matched[dest_key] = user_json[src_key]

    # Remove entries that never got a Twitter match.
    for unmatched in [name for name, row in user_map.items() if 'user_id' not in row]:
        del user_map[unmatched]
コード例 #5
0
        print(result)
    return result

# Collect follower/status stats for the users listed in the CSVs under
# data/users/ and print the parameters that would be stored in the graph.
t = Twarc(credentials.CONSUMER_KEY,credentials.CONSUMER_SECRET, credentials.ACCESS_TOKEN,credentials.ACCESS_TOKEN_SECRET)
USERS_PATH = Path('data/users/')
user_ids=[]
# loop through csvs of usernames and grab them (user id is column 1)
for doc in (USERS_PATH).glob('*.*'):
    with open(doc) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader)  # skip the header row
        for row in csv_reader:
            user_ids.append(row[1])

# use twarc to call those usernames and grab their stats; open the output
# file once instead of re-opening it in append mode for every user
with open(USERS_PATH/'user_stats.csv','a+') as csv_file:
    writer = csv.writer(csv_file,delimiter='|')
    for user in t.user_lookup(user_ids):
        writer.writerow([user['name'],user['screen_name'],user['followers_count'],user['statuses_count']])

# loop through the stats and store them in the graph
with open(USERS_PATH/'user_stats.csv','r') as file:
    csv_reader = csv.reader(file, delimiter='|')
    for row in csv_reader:
        u_param = {
            'surname': last_name(row[0]),
            'user_name': row[1],
            'follower_count': row[2],
            'statuses_count': row[3]
        }
        print(u_param)
コード例 #6
0
def user_info_crawler(screen_name, user_dir, user_profile_f, user_profileimg_f, user_tweets_f, user_clean_tweets_f):
    """Crawl and cache a user's profile, profile image, raw tweets, and
    cleaned tweets under user_dir; each step is skipped when its output
    file already exists.

    Returns the profile dict on success, or None on any failure (best-effort:
    all exceptions are caught and reported).
    """
    try:
        # Lazily-created twarc client, shared by the profile lookup and the
        # timeline crawl. Previously the client was created only inside the
        # profile branch, causing a NameError whenever the profile was cached
        # but the tweets file was not.
        t = None

        # crawl user profile
        if not os.path.exists(os.path.join(user_dir, user_profile_f)):

            t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)

            user_profile_data = t.user_lookup(ids=[screen_name], id_type="screen_name")

            for user_profile in user_profile_data:
                with open(os.path.join(user_dir, user_profile_f), 'w') as outfile:
                    json.dump(user_profile, outfile)

        # crawl user profile image
        with open(os.path.join(user_dir, user_profile_f), 'r') as rf:

            user_profile_json = json.load(rf)

            if not os.path.exists(os.path.join(user_dir, user_profileimg_f)):

                # extract user profile image url
                user_profileimg_url = user_profile_json['profile_image_url']

                def image_converter(user_profileimg_url):
                    # Download to a temp file, convert to RGB, save, clean up.
                    tmp_file = '../data/user/tmp' + user_profileimg_url[-4:]
                    if sys.version_info[0] == 2:
                        urllib.urlretrieve(user_profileimg_url, tmp_file)
                    elif sys.version_info[0] == 3:
                        urlretrieve(user_profileimg_url, tmp_file)
                    from PIL import Image
                    im = Image.open(tmp_file)
                    rgb_im = im.convert('RGB')
                    rgb_im.save(os.path.join(user_dir, user_profileimg_f))
                    os.remove(tmp_file)

                if user_profileimg_url:
                    # Request the larger avatar variant.
                    user_profileimg_url = user_profileimg_url.replace('_normal', '_bigger')

                # NOTE(review): still called when the url is empty; the
                # resulting error is swallowed by the outer except, aborting
                # the remaining steps — confirm whether that is intended.
                image_converter(user_profileimg_url)

        # crawl user tweets
        if not os.path.exists(os.path.join(user_dir, user_tweets_f)):
            if t is None:
                t = Twarc(consumer_key, consumer_secret, access_token, access_token_secret)
            user_timeline_data = t.timeline(screen_name=screen_name)
            with open(os.path.join(user_dir, user_tweets_f), 'a') as outfile:
                for user_timeline in user_timeline_data:
                    json.dump(user_timeline, outfile)
                    outfile.write('\n')

        # clean user tweets
        if not os.path.exists(os.path.join(user_dir, user_clean_tweets_f)):

            tweet_raw_lines = []
            with open(os.path.join(user_dir, user_tweets_f), 'r') as rf:
                for line in rf:
                    tweet_raw_lines.append(json.loads(line)['full_text'])

            clean_tweets = process_raw_tweets(tweet_raw_lines)

            # The with-block closes the file; the original's extra
            # wf.close() after the block was redundant and removed.
            with open(os.path.join(user_dir, user_clean_tweets_f), 'w') as wf:
                for tweet in clean_tweets:
                    if len(tweet) > 0:
                        wf.write(tweet + '\n')

        return user_profile_json

    except Exception as e:
        # Best-effort: report and return None rather than propagate.
        print("Could not predict user's role. Check account info, few tweets, incorrect image format...")
コード例 #7
0
class TwitterHarvester(BaseHarvester):
    """Social Feed Manager harvester for Twitter.

    Dispatches on the harvest message "type" to the Twitter search, filter,
    sample, or user-timeline APIs (via twarc), then post-processes the
    resulting WARC files to count tweets and extract URLs (media, web
    resources, profile images) according to the harvest options.
    """

    def __init__(self, working_path, stream_restart_interval_secs=30 * 60, mq_config=None, debug=False,
                 connection_errors=5, http_errors=5, debug_warcprox=False, tries=3):
        BaseHarvester.__init__(self, working_path, mq_config=mq_config,
                               stream_restart_interval_secs=stream_restart_interval_secs,
                               debug=debug, debug_warcprox=debug_warcprox, tries=tries)
        # Created per-harvest in _create_twarc() from message credentials.
        self.twarc = None
        self.connection_errors = connection_errors
        self.http_errors = http_errors
        # Extract options; overwritten per-message in harvest_seeds().
        self.extract_media = False
        self.extract_web_resources = False
        self.extract_user_profile_images = False

    def harvest_seeds(self):
        """Read options from the harvest message and dispatch on its type."""
        # Create a twarc
        self._create_twarc()

        # Get harvest extract options.
        self.extract_media = self.message.get("options", {}).get("media", False)
        self.extract_web_resources = self.message.get("options", {}).get("web_resources", False)
        self.extract_user_profile_images = self.message.get("options", {}).get("user_images", False)

        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.search()
        elif harvest_type == "twitter_filter":
            self.filter()
        elif harvest_type == "twitter_sample":
            self.sample()
        elif harvest_type == "twitter_user_timeline":
            self.user_timeline()
        else:
            # Include the offending type so the failure is diagnosable.
            raise KeyError(harvest_type)

    def _create_twarc(self):
        """Create the twarc client from the credentials in the message."""
        self.twarc = Twarc(self.message["credentials"]["consumer_key"],
                           self.message["credentials"]["consumer_secret"],
                           self.message["credentials"]["access_token"],
                           self.message["credentials"]["access_token_secret"],
                           http_errors=self.http_errors,
                           connection_errors=self.connection_errors)

    def search(self):
        """Harvest tweets matching the single seed's search query."""
        assert len(self.message.get("seeds", [])) == 1

        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        # Incremental harvests resume from the last recorded since_id.
        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        self._harvest_tweets(self.twarc.search(query, since_id=since_id))

    def filter(self):
        """Harvest tweets from the streaming filter API for the single seed."""
        assert len(self.message.get("seeds", [])) == 1

        track = self.message["seeds"][0]["token"].get("track")
        follow = self.message["seeds"][0]["token"].get("follow")
        locations = self.message["seeds"][0]["token"].get("locations")

        self._harvest_tweets(self.twarc.filter(track=track, follow=follow, locations=locations))

    def sample(self):
        """Harvest tweets from the streaming sample API."""
        self._harvest_tweets(self.twarc.sample())

    def user_timeline(self):
        """Harvest each seed user's timeline, resolving ids <-> screen names."""
        incremental = self.message.get("options", {}).get("incremental", False)

        for seed in self.message.get("seeds", []):
            seed_id = seed["id"]
            screen_name = seed.get("token")
            user_id = seed.get("uid")
            log.debug("Processing seed (%s) with screen name %s and user id %s", seed_id, screen_name, user_id)
            assert screen_name or user_id

            # If there is not a user_id, look it up.
            if screen_name and not user_id:
                user_id = self._lookup_user_id(screen_name)
                if user_id:
                    # Report back if nsid found
                    self.result.uids[seed_id] = user_id
                else:
                    msg = "User id not found for user {}".format(screen_name)
                    # NOTE(review): log.exception outside an except block logs
                    # a spurious "NoneType: None" traceback; log.warning may
                    # be the intent — confirm before changing.
                    log.exception(msg)
                    self.result.warnings.append(Msg(CODE_TOKEN_NOT_FOUND, msg))
            # Otherwise, get the current screen_name
            else:
                new_screen_name = self._lookup_screen_name(user_id)
                # Only record a token update when the lookup succeeded;
                # previously a vanished account stored None as the token.
                if new_screen_name and new_screen_name != screen_name:
                    self.result.token_updates[seed_id] = new_screen_name
                    screen_name = new_screen_name

            if user_id:
                try:
                    # Get since_id from state_store
                    since_id = self.state_store.get_state(__name__,
                                                          "timeline.{}.since_id".format(
                                                              user_id)) if incremental else None

                    self._harvest_tweets(self.twarc.timeline(user_id=user_id, since_id=since_id))

                except HTTPError as e:
                    if e.response.status_code == 401:
                        msg = "Unauthorized for user {} (User ID: {}) because account is suspended or private".format(screen_name, user_id)
                        log.exception(msg)
                        self.result.warnings.append(Msg(CODE_TOKEN_UNAUTHORIZED, msg))
                    else:
                        raise e

    def _lookup_screen_name(self, user_id):
        """
        Lookup a screen name given a user id.

        Returns None when the account does not exist (HTTP 404); previously
        the 404 propagated and aborted the whole harvest.
        """
        try:
            users = list(self.twarc.user_lookup(user_ids=(user_id,)))
            assert len(users) in (0, 1)
            if users:
                return users[0]["screen_name"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _lookup_user_id(self, screen_name):
        """
        Lookup a user id given a screen name.

        Returns None when the account does not exist (HTTP 404); previously
        the 404 propagated and aborted the whole harvest.
        """
        try:
            users = list(self.twarc.user_lookup(screen_names=(screen_name,)))
            assert len(users) in (0, 1)
            if users:
                return users[0]["id_str"]
        except HTTPError as e:
            if e.response.status_code != 404:
                raise e
        return None

    def _harvest_tweets(self, tweets):
        """Consume a tweet iterator, counting tweets until told to stop."""
        for count, tweet in enumerate(tweets):
            if not count % 100:
                log.debug("Harvested %s tweets", count)
            self.result.harvest_counter["tweets"] += 1
            if self.stop_harvest_seeds_event.is_set():
                log.debug("Stopping since stop event set.")
                break

    def _process_entities(self, entities):
        """Extract web-resource and media URLs from a tweet entities dict."""
        if self.extract_web_resources:
            for url in entities.get("urls", []):
                # Exclude links for tweets
                if url["expanded_url"] and not status_re.match(url["expanded_url"]):
                    self.result.urls.append(url["expanded_url"])
        if self.extract_media:
            for media in entities.get("media", []):
                if media["media_url"]:
                    self.result.urls.append(media["media_url"])

    def process_warc(self, warc_filepath):
        """Dispatch WARC post-processing on the harvest message type."""
        # Dispatch message based on type.
        harvest_type = self.message.get("type")
        log.debug("Harvest type is %s", harvest_type)
        if harvest_type == "twitter_search":
            self.process_search_warc(warc_filepath)
        elif harvest_type == "twitter_filter":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_sample":
            self._process_tweets(TwitterStreamWarcIter(warc_filepath))
        elif harvest_type == "twitter_user_timeline":
            self.process_user_timeline_warc(warc_filepath)
        else:
            # Include the offending type so the failure is diagnosable.
            raise KeyError(harvest_type)

    def process_search_warc(self, warc_filepath):
        """Process a search WARC and advance the incremental since_id."""
        incremental = self.message.get("options", {}).get("incremental", False)
        query = self.message["seeds"][0]["token"]

        since_id = self.state_store.get_state(__name__, u"{}.since_id".format(query)) if incremental else None

        max_tweet_id = self._process_tweets(TwitterRestWarcIter(warc_filepath))

        # Update state store. max_tweet_id is None when the WARC held no
        # tweets and since_id is None on a first run; on Python 3 None does
        # not order against an int, so guard both explicitly.
        if incremental and max_tweet_id is not None and (since_id is None or max_tweet_id > since_id):
            self.state_store.set_state(__name__, u"{}.since_id".format(query), max_tweet_id)

    def process_user_timeline_warc(self, warc_filepath):
        """Process a user-timeline WARC, updating per-user since_id state."""
        incremental = self.message.get("options", {}).get("incremental", False)

        for count, status in enumerate(TwitterRestWarcIter(warc_filepath)):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                user_id = tweet["user"]["id_str"]
                if incremental:
                    # Update state. get_state() may return None on the first
                    # run and None does not order against an int on Python 3,
                    # so compute the maximum with explicit None handling.
                    key = "timeline.{}.since_id".format(user_id)
                    current = self.state_store.get_state(__name__, key)
                    tweet_id = tweet.get("id")
                    if current is None:
                        new_since_id = tweet_id
                    elif tweet_id is None:
                        new_since_id = current
                    else:
                        new_since_id = max(current, tweet_id)
                    self.state_store.set_state(__name__, key, new_since_id)
                self._process_tweet(tweet)

    def _process_tweets(self, warc_iter):
        """Process each tweet in a WARC; return the largest tweet id seen
        (None when the WARC contained no tweets)."""
        max_tweet_id = None
        for count, status in enumerate(warc_iter):
            tweet = status.item
            if not count % 100:
                log.debug("Processing %s tweets", count)
            if "text" in tweet:
                tweet_id = tweet.get("id")
                # Track the running maximum explicitly: max(None, id) raises
                # TypeError on Python 3.
                if tweet_id is not None and (max_tweet_id is None or tweet_id > max_tweet_id):
                    max_tweet_id = tweet_id
                self._process_tweet(tweet)
        return max_tweet_id

    def _process_tweet(self, tweet):
        """Count one tweet and extract URLs from it (and any retweeted or
        quoted status it carries)."""
        self.result.increment_stats("tweets")
        # For more info, see https://dev.twitter.com/overview/api/entities-in-twitter-objects
        statuses = [tweet]
        if "retweeted_status" in tweet:
            statuses.append(tweet["retweeted_status"])
        elif "quoted_status" in tweet:
            statuses.append(tweet["quoted_status"])
        for status in statuses:
            self._process_entities(status.get("entities", {}))
            self._process_entities(status.get("extended_entities", {}))
        if self.extract_user_profile_images:
            self.result.urls.append(tweet["user"]["profile_image_url"])
            self.result.urls.append(tweet["user"]["profile_background_image_url"])
            if "profile_banner_url" in tweet["user"]:
                self.result.urls.append(tweet["user"]["profile_banner_url"])