def test_rate_limit(self, api, wait=True, buffer=.1): """ Tests whether the rate limit of the last request has been reached. :param api: The `tweepy` api instance. :param wait: A flag indicating whether to wait for the rate limit reset if the rate limit has been reached. :param buffer: A buffer time in seconds that is added on to the waiting time as an extra safety margin. :return: True if it is ok to proceed with the next request. False otherwise. """ # Get the number of remaining requests remaining = int(api.last_response.headers['x-rate-limit-remaining']) # Check if we have reached the limit if remaining == 0: limit = int(api.last_response.headers['x-rate-limit-limit']) reset = int(api.last_response.headers['x-rate-limit-reset']) # Parse the UTC time reset = datetime.fromtimestamp(reset) # Let the user know we have reached the rate limit log.info("0 of %d requests remaining until %d.", limit, reset) if wait: # Determine the delay and sleep delay = (reset - datetime.now()).total_seconds() + buffer log.info("Sleeping for %d", delay) sleep(delay) # We have waited for the rate limit reset. OK to proceed. return True else: # We have reached the rate limit. The user needs to handle the # rate limit manually. return False # We have not reached the rate limit return True
def save_tweet(self, twitter) -> None:
    """Save tweet data into database.

    Upserts on ``id_str``: if a matching document exists its ``seq``
    counter is incremented, otherwise a new document is created.

    Args:
        twitter: tweepy status object (already-parsed dict or raw JSON
            string).
    """
    # Accept either an already-parsed dict or a raw JSON string.
    if isinstance(twitter, dict):
        json_data = twitter
    else:
        json_data = json.loads(twitter)

    try:
        # BUG FIX: removed a stray debugging breakpoint() call that
        # would halt the process in production.
        self.db.tweets.find_one_and_update(
            {'id_str': json_data['id_str']},
            {'$inc': {'seq': 1}},
            projection={'seq': True, '_id': False},
            upsert=True,
        )
    except Exception as e:
        log.error(e)
def main(verbose: bool = False, config: str = None) -> None:
    """An entry point for twitter consumer."""
    if verbose:
        LOG.setLevel('DEBUG')
    else:
        LOG.setLevel('INFO')

    # Only parse a configuration file when a path was supplied.
    config_data = read_config(config) if config else None

    TweetConsumer(config_data).execute()
def main(keywords: List[str], access_token: str, access_token_secret: str,
         config: str = None, api_type: str = None, token: int = 0,
         verbose: bool = False):
    """An entry point to twitter crawler application.

    Builds a crawler configuration either from a config file or from the
    explicit keyword/token options, then runs the crawler.
    """
    loglevel = 'DEBUG' if verbose else 'INFO'
    LOG.setLevel(loglevel)
    LOG.info(msg=f"Argument {config} {api_type}")

    crawler_config = None
    if config:
        crawler_config = read_config(config)
    # Explicit CLI options take precedence over the config file.
    if keywords and access_token and access_token_secret:
        crawler_config = construct_config(keywords, access_token,
                                          access_token_secret)

    if crawler_config:
        LOG.debug(crawler_config)
        LOG.debug(f"Api Type - {api_type}")
        crawler = Crawler.create_crawler_instance(api_type, crawler_config,
                                                  int(token))
        crawler.execute()
    else:
        # BUG FIX: this message previously printed unconditionally, even
        # after a successful run; it is only meaningful when no
        # configuration could be built.
        click.echo("Option is required")
def on_data(self, data):
    """Route raw stream payloads to the appropriate handler.

    :param data: Raw JSON string received from the Twitter stream.
    :return: False to stop the stream, None to keep it running.
    """
    if 'in_reply_to_status' in data:
        # Regular tweet payload.
        self.on_status(data)
    elif 'delete' in data:
        delete = json.loads(data)['delete']['status']
        if self.on_delete(delete['id'], delete['user_id']) is False:
            return False
    elif 'limit' in data:
        if self.on_limit(json.loads(data)['limit']['track']) is False:
            return False
    elif 'warning' in data:
        # BUG FIX: stall-warning payloads use the singular 'warning'
        # key (as the membership test above already assumes); the
        # original indexed 'warnings' and raised a KeyError.
        warning = json.loads(data)['warning']
        log.warning(warning['message'])
        return False
def execute(self):
    """Run the crawler forever, cycling through every configured keyword."""
    pause_seconds = 600  # 10 minutes between individual crawls
    while True:
        log.info("Start crawling back....")
        for entry in self.keyword_list:
            log.info('Crawl data for %s', entry["keyword"])
            try:
                self.crawl(entry)
            except Exception:
                # Keep the loop alive; record the full traceback.
                log.error('Error in Crawling process', exc_info=True)
            log.info("Sleeping for %ds", pause_seconds)
            sleep(pause_seconds)
        # One more pause before starting the whole keyword list again.
        log.info("Sleeping for %ds...", pause_seconds)
        sleep(pause_seconds)
def save_tweet(self, twitter):
    """Save tweet data into database.

    Parses the incoming status, then writes it only when no document
    with the same id is already stored.

    Args:
        twitter: tweepy status object (dict or raw JSON string).
    """
    json_data = twitter if isinstance(twitter, dict) else json.loads(twitter)
    parsed_tweet = self.parse_tweet(json_data)

    # Skip tweets that are already in the database.
    if self.dbase.get(parsed_tweet["id"]) is not None:
        return

    try:
        self.dbase.save(parsed_tweet)
    except Exception as e:
        log.error(e)
def get(self, size: int) -> List:
    """Consume up to ``size`` messages from the queue.

    Each message is JSON-decoded and acknowledged; any messages
    pre-fetched beyond ``size`` are requeued via ``cancel()``.

    :param size: Number of messages to pull before stopping.
    :return: List of decoded tweet payloads.
    """
    tweets = []
    for frame, _properties, payload in self.channel.consume(
            self.queue_name):
        tweets.append(json.loads(payload))
        # Acknowledge receipt so the broker can drop the message.
        self.channel.basic_ack(frame.delivery_tag)
        if len(tweets) == size:
            break

    requeued_messages = self.channel.cancel()
    log.info(f'Requeued {requeued_messages} messages')
    return tweets
def execute(self):
    """Execute the twitter crawler, loop into the keyword_list """
    # NOTE(review): the log line below reports self.bounding_box, but the
    # stream is filtered with a hard-coded locations list — confirm these
    # are meant to describe the same area.
    stream = tweepy.Stream(self.auth, self.listener)
    loop = True
    while loop:
        try:
            log.info("Start stream tweets data")
            log.info(f"Area for stream -> {self.bounding_box}", )
            # Blocks while streaming; control returns here only when the
            # stream ends (then the loop exits) or filter() raises.
            stream.filter(track=self.keywords,
                          locations=[94.9, -8.88, 140.9, 5.86])
            loop = False
            log.info("End stream tweets data")
        except Exception as e:
            # On any failure: disconnect, back off 10 minutes, retry.
            log.error("There's an error, sleep for 10 minutes")
            log.error(e)
            loop = True
            stream.disconnect()
            time.sleep(600)
            continue
def on_status(self, status):
    """ Handle logic when the data coming """
    try:
        payload = json.loads(status)
        self.tweets += 1
        self.messaging.publish(payload)
        log.info(f"Count {self.tweets}")
    except Exception as err:
        # Log the failure (and the raw payload) and back off.
        log.error(err)
        log.error(status)
        self.on_timeout()
def execute(self):
    """Drain tweet batches from the message queue into the store."""
    try:
        while True:
            batch = self.messaging.get(size=50)
            # Empty batch: back off two minutes before polling again.
            if not batch:
                log.info("Waiting for 2mins on next tweet batch")
                time.sleep(120)
            for tweet in batch:
                self.store.save_tweet(tweet)
                log.info(f"Successfully store tweet: {tweet['id_str']}")
    except Exception as e:
        log.error(e)
def on_timeout(self):
    """ Handle time out when API reach its limit """
    # Back off for 10 minutes when the API rate limit is hit.
    log.info("API Reach its limit, sleep for 10 minutes")
    time.sleep(600)