def test_rate_limit(self, api, wait=True, buffer=.1): """ Tests whether the rate limit of the last request has been reached. :param api: The `tweepy` api instance. :param wait: A flag indicating whether to wait for the rate limit reset if the rate limit has been reached. :param buffer: A buffer time in seconds that is added on to the waiting time as an extra safety margin. :return: True if it is ok to proceed with the next request. False otherwise. """ # Get the number of remaining requests remaining = int(api.last_response.headers['x-rate-limit-remaining']) # Check if we have reached the limit if remaining == 0: limit = int(api.last_response.headers['x-rate-limit-limit']) reset = int(api.last_response.headers['x-rate-limit-reset']) # Parse the UTC time reset = datetime.fromtimestamp(reset) # Let the user know we have reached the rate limit log.info("0 of %d requests remaining until %d.", limit, reset) if wait: # Determine the delay and sleep delay = (reset - datetime.now()).total_seconds() + buffer log.info("Sleeping for %d", delay) sleep(delay) # We have waited for the rate limit reset. OK to proceed. return True else: # We have reached the rate limit. The user needs to handle the # rate limit manually. return False # We have not reached the rate limit return True
def update_timestamp_data(self, timestamp):
    """Mark the row whose ``data_timestamp`` equals *timestamp* as processed.

    Sets ``status=True`` on the matching row. Any database error is
    logged and swallowed (best-effort update).
    """
    try:
        stmt = (
            self.data.update()
            .where(self.data.c.data_timestamp == timestamp)
            .values(status=True)
        )
        self.con.execute(stmt)
    except Exception as err:
        log.error(err)
def on_status(self, status):
    """Parse a raw tweet payload, attach a sentiment score and persist it.

    Errors are logged and swallowed so the stream keeps running.
    """
    try:
        parsed = json.loads(status)
        # Enrich the tweet with its sentiment score before saving.
        parsed["sentiment"] = SentimentAnalysis.get_sentiment(
            tweet_text=parsed["text"])
        self.tw_store.save_tweet(parsed)
    except Exception as err:
        log.error(err)
def on_data(self, data):
    """Dispatch raw stream payloads to the appropriate handler.

    :param data: The raw JSON string received from the stream.
    :return: False to signal the stream to stop, otherwise None.
    """
    if 'in_reply_to_status' in data:
        # Normal tweet payload.
        self.on_status(data)
    elif 'delete' in data:
        delete = json.loads(data)['delete']['status']
        if self.on_delete(delete['id'], delete['user_id']) is False:
            return False
    elif 'limit' in data:
        if self.on_limit(json.loads(data)['limit']['track']) is False:
            return False
    elif 'warning' in data:
        # BUG FIX: the payload key is 'warning' (singular), matching the
        # substring test above; indexing 'warnings' raised a KeyError.
        warning = json.loads(data)['warning']
        log.warning(warning['message'])
        return False
def execute(self):
    """Execute the twitter crawler, looping over ``keyword_list`` forever.

    Sleeps 10 minutes after each keyword and again after each full pass
    to stay within API limits. Per-keyword errors are logged and the
    loop continues with the next keyword.
    """
    while True:
        # FIX: corrected the "Star crawling" typo in the log message.
        log.info("Start crawling back....")
        delay = 600  # seconds (10 minutes)
        for keyword in self.keyword_list:
            log.info('Crawl data for %s', keyword["keyword"])
            try:
                self.crawl(keyword)
            except Exception:
                log.error('Error in Crawling process', exc_info=True)
            log.info("Sleeping for %ds", delay)
            sleep(delay)
        # Sleep for 10 minutes after finishing crawling all of the
        # keywords, then start over again.
        log.info("Sleeping for %ds...", delay)
        sleep(delay)
def connect(self):
    """Create the SQLAlchemy engine and reflected metadata.

    On success the engine and metadata are stored on ``self.con`` and
    ``self.meta`` and True is returned; on failure the error is logged
    and False is returned.
    """
    # Build a PostgreSQL URL of the form
    # postgresql://federer:grandestslam@localhost:5432/tennis
    url = 'postgresql://{}:{}@{}:{}/{}'.format(
        self.user, self.password, self.host, self.port, self.database_name)
    try:
        # create_engine() returns our connection object.
        engine = sqlalchemy.create_engine(url, client_encoding='utf8')
        # Bind the connection to MetaData() and reflect the schema.
        metadata = sqlalchemy.MetaData(bind=engine, reflect=True)
    except Exception as err:
        log.error(err)
        return False
    self.con = engine
    self.meta = metadata
    return True
def execute(self):
    """Stream tweets for the configured geo box, reconnecting on errors.

    Retries indefinitely: after any streaming failure the stream is
    disconnected, we back off for 10 minutes, and reconnect.
    """
    listener = TwitterStream(self.tw_store)
    stream = tweepy.Stream(self.auth, listener)
    while True:
        try:
            log.info("Start stream tweets data")
            stream.filter(locations=AUS_GEO_CODE)
        except Exception as err:
            # Tear down, back off, then loop around to reconnect.
            log.error("There's an error, sleep for 10 minutes")
            log.error(err)
            stream.disconnect()
            time.sleep(600)
        else:
            log.info("End stream tweets data")
            break
def on_timeout(self):
    """Back off for 10 minutes when the API hits its rate limit."""
    log.info("API Reach its limit, sleep for 10 minutes")
    time.sleep(600)
except: hashtags = server["twitter-hashtags"] # get twitter-words couchdb instance try: words = server.create["twitter-words"] except: words = server["twitter-words"] # get twitter-users couchdb instance try: user = server.create["twitter-users"] except: user = server["twitter-users"] log.info("START - Processing analytics data") analytic_db = AnalyticsLog(database.con, database.meta) date_list = analytic_db.fetch_unprocessed_data() # fetch data for individual date for date_for_analysis in date_list: log.info("START - Process data for %s", date_for_analysis) view_data = [] for row in db.view('_design/analytics/_view/tweets-victoria',\ startkey=date_for_analysis, endkey=date_for_analysis): view_data.append(row.value) log.info("Processing %d row of data", len(view_data))
import math import couchdb from app.sentiment_analysis import SentimentAnalysis from app.logger import LOGGER as log import settings ALL_DOCS_VIEW = '_all_docs' try: log.info("START db updater script") log.info("-----------------------") server = couchdb.Server(url=settings.COUCHDB_SERVER) db = server[settings.COUCHDB_DB] info = db.info() doc_count = info["doc_count"] num_per_request = 10000 iteration = math.ceil(doc_count / num_per_request) for i in range(iteration): log.info('Run %d iteration' % i) for row in db.view(ALL_DOCS_VIEW, limit=num_per_request, skip=i * num_per_request): data = db.get(row.id) data["sentiment"] = SentimentAnalysis.get_sentiment(data["text"]) db.save(data) log.info('%d iteration success')
def get_sentiment(tweet_text):
    """Return the VADER polarity scores for *tweet_text*.

    :param tweet_text: The tweet body to analyse.
    :return: The polarity-scores dict, or None when analysis fails
        (the error is logged).
    """
    try:
        return SentimentIntensityAnalyzer().polarity_scores(tweet_text)
    except Exception as err:
        log.error(err)