def update_tweets(tweet_batch):
    """Refresh a batch of stored tweets through the ``statuses_lookup`` call.

    Every response whose ``_json['id']`` matches a tweet of the batch is
    queued on ``global_task_queue`` for ``update_tweet_from_response``.
    Tweets of the batch that get no matching response have ``deleted_at``
    set to ``today()`` and are saved.

    Args:
        tweet_batch: collection of tweet objects exposing ``count()`` with no
            arguments and iteration (presumably a queryset -- confirm with
            callers). Each item needs ``_ident``, ``deleted_at`` and
            ``save()``.

    Raises:
        tweepy.error.TweepError: re-raised after being logged when the
            lookup call fails.
    """
    if not tweet_batch.count():
        return
    pending = list(tweet_batch)
    tweet_ids = [tweet._ident for tweet in pending]

    client = get_client('statuses_lookup')
    try:
        responses = client.call(
            'statuses_lookup', id_=tweet_ids, trim_user=True)
    except tweepy.error.TweepError:
        log('got tweepy.error.TweepError!')
        log('tweet ids = %s' % tweet_ids)
        raise
    finally:
        # Hand the client back whatever happens: previously only TweepError
        # and the success path returned it, so any other exception leaked
        # the client taken from get_client().
        return_client(client)

    for response in responses:
        monitor_stop_flag()
        response_id = response._json['id']
        tweet = next(
            (candidate for candidate in pending
             if candidate._ident == response_id),
            None)
        if tweet:
            global_task_queue.add(update_tweet_from_response, [response])
            pending.remove(tweet)

    # Whatever is left in `pending` got no response from the lookup.
    for tweet in pending:
        tweet.deleted_at = today()
        tweet.save()
    if pending:
        log("{} tweets have been deleted".format(len(pending)))
def _fetch_tweets_from_html(term, since, until):
    """Scrape tweet ids from the twitter.com HTML search page.

    The search query is ``term since:<date> until:<date>``. A socket timeout
    is logged, followed by ``safe_sleep(1)`` and a new attempt; attempts
    repeat until a page is parsed or something else raises.

    Args:
        term: search term placed verbatim in the query string.
            NOTE(review): it is not percent-encoded here, so it presumably
            must already be URL-safe -- confirm with callers.
        since: object with ``strftime`` giving the lower date bound.
        until: object with ``strftime`` giving the upper date bound.

    Returns:
        list[int]: the ``data-item-id`` of every ``<li>`` element whose
        ``data-item-type`` is ``"tweet"`` and that carries that attribute.
    """
    # The search operators are separated with %20 instead of literal spaces:
    # a raw space is not valid in a request target and is rejected by
    # http.client on current Python versions.
    url = 'https://twitter.com/search?q={}%20since%3A{}%20until%3A{}'.format(
        term, since.strftime("%Y-%m-%d"), until.strftime("%Y-%m-%d"))

    # Retry in a loop rather than by recursion so that a long run of
    # timeouts cannot exhaust the interpreter's recursion limit.
    while True:
        monitor_stop_flag()
        log(url)
        # Headers are rebuilt on every attempt so that each retry picks a
        # fresh random user agent.
        # NOTE(review): urllib does not decompress response bodies by
        # itself; confirm the server does not answer this Accept-Encoding
        # with a compressed page.
        request = Request(url, headers={
            'User-Agent': random.choice(BROWSER_USER_AGENTS),
            'Host': 'twitter.com',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,fr-CA;q=0.8,en;q=0.5,fr;q=0.3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://twitter.com/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0, no-cache',
            'TE': 'Trailers',
            'Pragma': 'no-cache',
        })
        try:
            # NOTE(review): TLS certificate verification is disabled by this
            # context -- confirm this is intentional.
            data = urlopen(request, timeout=5,
                           context=ssl._create_unverified_context())
            try:
                page = bs(data, "html.parser")
            finally:
                # Release the connection; the response was never closed
                # before.
                data.close()
        except socket.timeout:
            log('Socket timeout while fetching tweets from hashtag: #{}'.format(term))
            safe_sleep(1)
            continue
        break

    tweets = page.find_all('li', {"data-item-type": "tweet"})
    return [int(tweet['data-item-id'])
            for tweet in tweets if tweet.has_attr('data-item-id')]
def update_twitter_users(twitter_user_batch):
    """Refresh a batch of stored Twitter users through ``lookup_users``.

    Every response whose ``_json['id']`` matches a user of the batch is
    queued on ``global_task_queue`` for ``update_twitter_user_from_response``
    with the user and the response JSON. Users that get no matching response
    are logged, get ``_last_updated`` set to ``today()`` and
    ``_update_frequency`` set to 5, and are saved.

    Args:
        twitter_user_batch: collection of user objects exposing ``count()``
            with no arguments and iteration (presumably a queryset --
            confirm with callers). Each item needs ``_ident`` and ``save()``.

    Raises:
        tweepy.error.TweepError: re-raised after being logged when the
            lookup call fails.
    """
    if not twitter_user_batch.count():
        return
    pending = list(twitter_user_batch)
    user_ids = [user._ident for user in pending]

    client = get_client('lookup_users')
    try:
        responses = client.call('lookup_users', user_ids=user_ids)
    except tweepy.error.TweepError:
        log('got tweepy.error.TweepError!')
        log('user_ids = %s' % user_ids)
        raise
    finally:
        # Hand the client back whatever happens: previously only TweepError
        # and the success path returned it, so any other exception leaked
        # the client taken from get_client().
        return_client(client)

    for response in responses:
        monitor_stop_flag()
        response_id = response._json['id']
        tw_user = next(
            (user for user in pending if user._ident == response_id),
            None)
        if tw_user:
            global_task_queue.add(update_twitter_user_from_response,
                                  args=[tw_user, response._json])
            pending.remove(tw_user)

    # Whatever is left in `pending` got no response from the lookup.
    for tw_user in pending:
        log('Twitter user (%s) has returned no result.' % tw_user)
        # twUser._error_on_update = True
        tw_user._last_updated = today()
        tw_user._update_frequency = 5
        tw_user.save()
def execute(self):
    """Take one task from ``global_task_queue`` and run it.

    Polls ``monitor_stop_flag()`` for as long as the queue reports empty,
    then fetches a ``(task, args, kwargs)`` triple and stores it on
    ``current_task`` / ``current_args`` / ``current_kwargs``. A falsy task
    ends the call right away, leaving the stored values untouched;
    otherwise the task is called and the three attributes are reset to
    ``None`` once it returns.
    """
    # Wait for work, checking the stop flag on every pass.
    while global_task_queue.empty():
        monitor_stop_flag()

    task, args, kwargs = global_task_queue.get()
    self.current_task = task
    self.current_args = args
    self.current_kwargs = kwargs
    if not task:
        return

    # log('Consuming task: %s' % self.current_task.__name__)
    task(*args, **kwargs)

    # Only reached when the task returned without raising.
    self.current_task = None
    self.current_args = None
    self.current_kwargs = None
def next(self):
    """Return the next item, loading further result sets when needed.

    ``monitor_stop_flag()`` is called before every attempt. An ``index`` of
    ``-1`` yields ``None``. While ``index`` is below ``nbItems`` the item at
    ``results[index]`` is returned and ``index`` advances by one; otherwise
    ``_get_next_set()`` is called and the lookup is tried again.
    """
    while True:
        monitor_stop_flag()
        position = self.index
        if position == -1:
            return None
        if position < self.nbItems:
            current = self.results[position]
            self.index += 1
            return current
        # Current set is used up: fetch the following one and try again.
        self._get_next_set()
def harvest_twitter_hashtag(twitter_hashtag_harvester):
    """Repeatedly fetch and print the tweet ids found for a harvester.

    Logs the harvester's ``twitter_hashtag`` once, then loops without an
    exit condition: each pass calls ``monitor_stop_flag()``, fetches the ids
    for the hashtag's ``term`` between ``harvest_since`` and
    ``harvest_until``, and passes them to ``pretty``. The loop only ends if
    one of those calls raises.
    """
    hashtag = twitter_hashtag_harvester.twitter_hashtag
    log(hashtag)
    while True:
        monitor_stop_flag()
        pretty(_fetch_tweets_from_html(
            term=hashtag.term,
            since=twitter_hashtag_harvester.harvest_since,
            until=twitter_hashtag_harvester.harvest_until,
        ))
def get_client(call_name):
    """Take from ``clients_queue`` a client that has calls left.

    A client is taken whenever the queue is not empty. If none was
    available, or the one taken reports no remaining calls for
    ``call_name``, ``monitor_stop_flag()`` is called, the client (if any)
    is put back on the queue, and the search starts over.

    Returns:
        The first client found with ``get_remaining_calls(call_name)``
        above zero. It is no longer on ``clients_queue``.
    """
    while True:
        candidate = None
        if not clients_queue.empty():
            candidate = clients_queue.get()

        unusable = (not candidate
                    or candidate.get_remaining_calls(call_name) <= 0)
        if not unusable:
            # candidate.pretty_limit_status()
            return candidate

        monitor_stop_flag()
        if candidate:
            # No calls left: put it back before looking again.
            clients_queue.put(candidate)