def doDefaultScrape(self, latest=False, credentials=False):
    """Retrieve the tweets, friends or followers of the next users in the default scrape.

    Checks a per-root-task cache flag to decide whether to keep running, then
    dispatches one friends job, one followers job and one tweets job (each only
    if its slot is not already 'running'), and finally re-schedules itself.

    Keyword arguments:
    latest -- passed through to whoNext to pick the next candidate users
    credentials -- Twitter API credentials forwarded to the worker tasks
    """
    keep_going = cache.get('default_scrape_' + self.request.root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED DEFAULT SCRAPE ***')
        return False

    logger.info('*** SCRAPING... ***')

    this_friend = cache.get('scrape_friends_' + self.request.root_id)
    if (not this_friend) or this_friend.decode('utf-8') == 'done':
        cache.set('scrape_friends_' + self.request.root_id, 'running')
        getTwitterConnections.delay(whoNext('friends', latest=latest),
                                    credentials=credentials,
                                    cacheKey='scrape_friends_' + self.request.root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + self.request.root_id)
    if (not this_follower) or this_follower.decode('utf-8') == 'done':
        cache.set('scrape_followers_' + self.request.root_id, 'running')
        # BUG FIX: this branch previously asked whoNext for 'friends'; the
        # follower queue should be driven by the 'followers' job (matching
        # the friends=False dispatch below and nextNearest's job names).
        getTwitterConnections.delay(whoNext('followers', latest=latest),
                                    credentials=credentials, friends=False,
                                    cacheKey='scrape_followers_' + self.request.root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + self.request.root_id)
    if (not this_tweet) or this_tweet.decode('utf-8') == 'done':
        cache.set('scrape_tweets_' + self.request.root_id, 'running')
        getTweets.delay(whoNext('tweets', latest=latest), maxTweets=1000,
                        credentials=credentials,
                        cacheKey='scrape_tweets_' + self.request.root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    # BUG FIX: 'credentials' must travel inside the task kwargs. Passed as a
    # bare apply_async option it is treated as a message option and silently
    # dropped, so every re-scheduled iteration ran without credentials.
    doDefaultScrape.apply_async(kwargs={'latest': latest, 'credentials': credentials},
                                countdown=30)
def can_we_do_that(self, method_name):
    """Check whether a given API call is rate-limited, return the estimated time to wait in seconds.

    Positional arguments:
    method_name -- the name of the API call to test

    Returns 0 when the call may proceed, otherwise the number of seconds to
    wait (the window reset delta plus 30 seconds of slack).
    """
    # Have we recorded how many calls remain in the current window?
    # BUG FIX: the original wrapped this in a bare `except:` purely to absorb
    # the AttributeError from decoding a None cache miss; test for the miss
    # explicitly instead of swallowing every possible exception.
    cached = cache.get(self.handle + method_name)
    if cached is None:
        # We've not made the call for these credentials. Assume all's well.
        return 0

    history = json.loads(cached.decode('utf-8'))
    if history['limit'] > 0:
        # Still good to go.
        return 0

    # The stored reset time is ISO-ish; drop any fractional seconds.
    reset = datetime.strptime(history['reset'].split('.')[0], "%Y-%m-%dT%H:%M:%S")
    rightNow = datetime.now()
    # No calls left and the window reset is in the future...
    if reset > rightNow:
        # ...return the time to wait, padded with 30s of slack.
        return (reset - rightNow).seconds + 30
    return 0
def doUserScrape(self, credentials=False):
    """Retrieve the next timelines, friends and followers for the next accounts in the user scrape.

    Runs one iteration of the user-centred crawl: for each of the three job
    slots (friends, followers, tweets) that is not busy, asks nextNearest for
    the next accounts and dispatches a worker task. Re-schedules itself while
    any slot is still 'running'; otherwise marks the scrape finished.

    Keyword arguments:
    credentials -- Twitter API credentials forwarded to the tweet worker
    """
    root_id = self.request.root_id
    keep_going = cache.get('user_scrape_' + root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED USER SCRAPE ***')
        # mark crawl as stopped on crawl node
        db = get_neo_driver()
        update_crawl(db, crawl_task=root_id, status='done')
        db.close()
        return False

    # NOTE(review): assumes the seed user key is always set before this task
    # runs; a missing key would raise AttributeError here — confirm callers.
    user = cache.get('scrape_user_' + root_id).decode('utf-8')
    logger.info('*** SCRAPING USER: %s... ***' % (user,))

    # BUG FIX (all three slots below): cache.get returns None for a missing
    # key, so decoding unconditionally raised AttributeError and made the
    # `(not this_friend)` style guards unreachable. Decode only when a value
    # exists — consistent with doDefaultScrape.
    this_friend = cache.get('scrape_friends_' + root_id)
    if (not this_friend) or this_friend.decode('utf-8') == 'done':
        db = get_neo_driver()
        next_friends = nextNearest(db, user, 'friends', root_id)
        db.close()
        if next_friends:
            cache.set('scrape_friends_' + root_id, 'running')
            getTwitterConnections.delay(next_friends,
                                        cacheKey='scrape_friends_' + root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + root_id)
    if (not this_follower) or this_follower.decode('utf-8') == 'done':
        db = get_neo_driver()
        next_followers = nextNearest(db, user, 'followers', root_id)
        db.close()
        if next_followers:
            cache.set('scrape_followers_' + root_id, 'running')
            getTwitterConnections.delay(next_followers, friends=False,
                                        cacheKey='scrape_followers_' + root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + root_id)
    if (not this_tweet) or this_tweet.decode('utf-8') == 'done':
        db = get_neo_driver()
        next_tweets = nextNearest(db, user, 'tweets', root_id)
        db.close()
        if next_tweets:
            cache.set('scrape_tweets_' + root_id, 'running')
            getTweets.delay(next_tweets, maxTweets=1000, credentials=credentials,
                            cacheKey='scrape_tweets_' + root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    statuses = []
    for prefix in ('scrape_friends_', 'scrape_followers_', 'scrape_tweets_'):
        value = cache.get(prefix + root_id)
        statuses.append(value.decode('utf-8') if value else '')
    if 'running' in statuses:
        # BUG FIX: credentials must be passed inside the task kwargs; as a
        # bare apply_async option they were silently dropped on re-schedule.
        doUserScrape.apply_async(kwargs={'credentials': credentials}, countdown=30)
    else:
        cache.set('user_scrape_' + root_id, 'false')
        cache.set('scrape_mode_' + root_id, '')
        logger.info('*** FINISHED SCRAPING USER: %s ***' % (user,))
def nextNearest(db, user, job, root_task, max_friends=2000, max_followers=2000,
                limit=20, max_tweets=2000, test=False):
    """Find the next user to retrieve friends, followers or tweets, closest to a given user.

    Pops the next screen name from a cached queue when one exists; otherwise
    runs a Cypher query for up to `limit` nearby candidates, caches the
    remainder, and returns the first.

    Positional arguments:
    db -- Neo4j driver
    user -- seed screen name to search around
    job -- one of 'friends', 'followers' or 'tweets'
    root_task -- id of the root crawl task (namespaces the cache key)

    Keyword arguments:
    max_friends/max_followers -- skip accounts above these counts
    limit -- maximum candidates to fetch per query
    max_tweets -- skip accounts with more tweets already stored than this
    test -- when True, return the query string instead of running it

    Returns the next screen name, or False when none is available.
    """
    cacheKey = '_'.join(['nextnearest', job, user, root_task])
    # BUG FIX: cache.get returns None on a miss; decoding unconditionally
    # raised AttributeError before the queue could ever be (re)built.
    cached = cache.get(cacheKey)
    nextUserDump = cached.decode('utf-8') if cached else False
    next_users = False
    if nextUserDump:
        try:
            next_users = json.loads(nextUserDump)
        except ValueError:
            # Corrupt cache entry — fall through and rebuild the queue.
            next_users = []
    if next_users:
        logging.info('*** NEXT ' + job + ': ' + ', '.join(next_users) + ' from ' + user + ' ***')
        next_user = next_users.pop(0)
        cache.set(cacheKey, json.dumps(next_users))
        return next_user

    # NOTE(review): user/job are interpolated straight into the Cypher text;
    # safe only while they originate from trusted internal callers — consider
    # query parameters if that ever changes.
    query_str = "MATCH (a:twitter_user {{screen_name: '{}'}})-[:FOLLOWS]-(d:twitter_user)".format(
        user)
    query_str += ' MATCH (b:twitter_user)-[:FOLLOWS]-(d) WITH DISTINCT b '
    if job == 'friends':
        query_str += 'MATCH (b)-[:FOLLOWS]->(c:twitter_user) '
    if job == 'followers':
        query_str += 'MATCH (b)<-[:FOLLOWS]-(c:twitter_user) '
    if job == 'tweets':
        query_str += 'MATCH (b)-[:TWEETED]->(c:tweet) '
    query_str += 'WITH b, COUNT(c) AS n '
    query_str += 'WHERE b.friends_count < {} AND b.followers_count < {} ' \
        'AND NOT EXISTS (b.protected) AND NOT EXISTS (b.defunct) '.format(max_friends, max_followers)
    # Only pick accounts whose stored connections/tweets look incomplete
    # (fewer than half of what Twitter reports).
    if job == 'friends':
        query_str += 'AND n < b.friends_count/2 '
    if job == 'followers':
        query_str += 'AND n < b.followers_count/2 '
    if job == 'tweets':
        query_str += 'AND b.statuses_count > 0 AND n < b.statuses_count/2 AND n<{} '.format(
            max_tweets)
    query_str += 'RETURN b.screen_name ORDER BY b.{}_last_scraped LIMIT {}'.format(
        job, limit)

    logging.info('*** Looking for ' + job + ' for ' + user + ' ***')
    if test:
        return query_str

    try:
        with db.session() as session:
            with session.begin_transaction() as tx:
                result = tx.run(query_str)
                next_users = [record.values()[0] for record in result]
    except Exception:
        # Driver/transaction failure: treat as "no candidates found" rather
        # than crashing the scrape loop (was a bare except).
        next_users = []

    if next_users:
        logging.info('*** NEXT ' + job + ': ' + ', '.join(next_users) + ' from ' + user + ' ***')
        next_user = next_users.pop(0)
        cache.set(cacheKey, json.dumps(next_users))
        return next_user
    else:
        logging.info('No more ' + job + ' for ' + user)
        return False
def stop_stream(self, task_id):
    """Stop the stream started by the given task id by revoking its worker task.

    Positional arguments:
    task_id -- id of the task that started the streamer
    """
    # stop the stream running in the stream started by the given task id
    logger.info('***Stopping twitter filter streamer ***')
    # BUG FIX: guard the cache miss — cache.get returns None when no stream
    # id was recorded, and decoding None raised AttributeError.
    stream_id = cache.get("stream_id_" + task_id)
    if not stream_id:
        logger.info('*** No stream recorded for task %s ***' % (task_id,))
        return
    revoke(stream_id.decode('utf-8'), terminate=True)
def stop_scrape(self, task_id):
    """Stop an executing scrape on its next loop iteration.

    Looks up which scrape mode (e.g. 'user' or 'default') is active for the
    task and flips that mode's keep-going flag to 'false'.

    Positional arguments:
    task_id -- id of the root task whose scrape should stop
    """
    mode_bytes = cache.get('scrape_mode_' + task_id)
    if not mode_bytes:
        # No scrape mode recorded — nothing to stop.
        return
    mode = mode_bytes.decode('utf-8')
    cache.set(mode + '_scrape_' + task_id, 'false')
def connected(self):
    """Return the cached connection flag for this stream (None when unset)."""
    return cache.get('stream_' + self.stream_id + '_connected')