Esempio n. 1
0
def doDefaultScrape(self, latest=False, credentials=False):
    """Retrieve the tweets, friends or followers of the next users in the default scrape.

    Keyword arguments:
    latest -- if True, only fetch items newer than those already stored
    credentials -- Twitter API credentials forwarded to the worker tasks

    Returns False when the scrape has been stopped; otherwise kicks off any
    idle sub-job and reschedules itself in 30 seconds.
    """
    root_id = self.request.root_id
    # The controlling flag must be exactly 'true'; clearing it (or setting
    # anything else) asks a running scrape to stop on its next loop.
    keep_going = cache.get('default_scrape_' + root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED DEFAULT SCRAPE ***')
        return False

    logger.info('*** SCRAPING... ***')

    # Each sub-job has a cache flag: missing or 'done' means idle,
    # 'running' means a worker is still busy with the previous batch.
    this_friend = cache.get('scrape_friends_' + root_id)
    if (not this_friend) or this_friend.decode('utf-8') == 'done':
        cache.set('scrape_friends_' + root_id, 'running')
        getTwitterConnections.delay(whoNext('friends', latest=latest), credentials=credentials,
                                    cacheKey='scrape_friends_' + root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + root_id)
    if (not this_follower) or this_follower.decode('utf-8') == 'done':
        cache.set('scrape_followers_' + root_id, 'running')
        # NOTE(review): the target comes from whoNext('friends', ...) even though
        # this branch fetches followers -- confirm whether this should be
        # whoNext('followers', ...); left unchanged to preserve behavior.
        getTwitterConnections.delay(whoNext('friends', latest=latest), credentials=credentials,
                                    friends=False, cacheKey='scrape_followers_' + root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + root_id)
    if (not this_tweet) or this_tweet.decode('utf-8') == 'done':
        cache.set('scrape_tweets_' + root_id, 'running')
        getTweets.delay(whoNext('tweets', latest=latest), maxTweets=1000, credentials=credentials,
                        cacheKey='scrape_tweets_' + root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    # BUG FIX: 'credentials' was passed as a Celery option to apply_async,
    # where it is not delivered to the task, so the rescheduled run fell back
    # to credentials=False. Task arguments must go in 'kwargs'.
    doDefaultScrape.apply_async(kwargs={'latest': latest, 'credentials': credentials},
                                countdown=30)
Esempio n. 2
0
    def can_we_do_that(self, method_name):
        """Check whether a given API call is rate-limited, return the estimated time to wait in seconds.

        Positional arguments:
        method_name -- the name of the API call to test

        Returns 0 when the call can be made now, otherwise the number of
        seconds until the rate-limit window resets plus a 30 second margin.
        """
        # Have we recorded how many calls remain in the current window?
        # BUG FIX: the original wrapped this in a bare 'except:' purely to
        # mask cache.get() returning None for a missing key, which also
        # swallowed every other error. Check for None explicitly instead.
        raw = cache.get(self.handle + method_name)
        keyval = raw.decode('utf-8') if raw else False
        # We've not made the call for these credentials. Assume all's well.
        if not keyval:
            return 0
        history = json.loads(keyval)
        if history['limit'] > 0:
            # Still good to go.
            return 0
        # 'reset' is an ISO-8601 timestamp; drop any fractional seconds
        # before parsing.
        reset = datetime.strptime(history['reset'].split('.')[0],
                                  "%Y-%m-%dT%H:%M:%S")
        rightNow = datetime.now()
        # No calls left and the window reset is in the future...
        if reset > rightNow:
            # ...return the time to wait.
            return (reset - rightNow).seconds + 30
        return 0
Esempio n. 3
0
def doUserScrape(self, credentials=False):
    """Retrieve the next timelines, friends and followers for the next accounts in the user scrape.

    Keyword arguments:
    credentials -- Twitter API credentials forwarded to the worker tasks

    Returns False once the scrape has been stopped; otherwise reschedules
    itself every 30 seconds while any sub-job is still running.
    """
    root_id = self.request.root_id

    def _flag(key):
        # BUG FIX: cache.get returns None for a missing key; the original
        # called .decode('utf-8') unconditionally and crashed with
        # AttributeError. Decode defensively, treating a missing key as ''.
        value = cache.get(key)
        return value.decode('utf-8') if value is not None else ''

    keep_going = cache.get('user_scrape_' + root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED USER SCRAPE ***')
        # mark crawl as stopped on crawl node
        db = get_neo_driver()
        update_crawl(db, crawl_task=root_id, status='done')
        db.close()
        return False

    user = _flag('scrape_user_' + root_id)
    logger.info('*** SCRAPING USER: %s... ***' % (user,))

    # Each sub-job flag: ''/missing or 'done' means idle, 'running' means busy.
    this_friend = _flag('scrape_friends_' + root_id)
    if (not this_friend) or this_friend == 'done':
        db = get_neo_driver()
        next_friends = nextNearest(db, user, 'friends', root_id)
        db.close()
        if next_friends:
            cache.set('scrape_friends_' + root_id, 'running')
            getTwitterConnections.delay(next_friends, cacheKey='scrape_friends_' + root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = _flag('scrape_followers_' + root_id)
    if (not this_follower) or this_follower == 'done':
        db = get_neo_driver()
        next_followers = nextNearest(db, user, 'followers', root_id)
        db.close()
        if next_followers:
            cache.set('scrape_followers_' + root_id, 'running')
            getTwitterConnections.delay(next_followers, friends=False,
                                        cacheKey='scrape_followers_' + root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = _flag('scrape_tweets_' + root_id)
    if (not this_tweet) or this_tweet == 'done':
        db = get_neo_driver()
        next_tweets = nextNearest(db, user, 'tweets', root_id)
        db.close()
        if next_tweets:
            cache.set('scrape_tweets_' + root_id, 'running')
            getTweets.delay(next_tweets, maxTweets=1000, credentials=credentials,
                            cacheKey='scrape_tweets_' + root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    sub_jobs = ['scrape_friends_' + root_id, 'scrape_followers_' + root_id,
                'scrape_tweets_' + root_id]
    if 'running' in [_flag(k) for k in sub_jobs]:
        # BUG FIX: 'credentials' was passed as a Celery option to apply_async,
        # where it is not delivered to the task; it belongs in 'kwargs'.
        doUserScrape.apply_async(kwargs={'credentials': credentials}, countdown=30)
    else:
        cache.set('user_scrape_' + root_id, 'false')
        cache.set('scrape_mode_' + root_id, '')
        logger.info('*** FINISHED SCRAPING USER: %s ***' % (user,))
Esempio n. 4
0
def nextNearest(db,
                user,
                job,
                root_task,
                max_friends=2000,
                max_followers=2000,
                limit=20,
                max_tweets=2000,
                test=False):
    """Find the next user to retrieve friends, followers or tweets, closest to a given user.

    Positional arguments:
    db -- an open neo4j driver
    user -- screen name to search around
    job -- one of 'friends', 'followers' or 'tweets'
    root_task -- root task id, used to namespace the cache key

    Keyword arguments:
    max_friends / max_followers -- skip accounts above these counts
    limit -- number of candidate names to queue per query
    max_tweets -- skip accounts missing more than this many tweets
    test -- if True, return the generated Cypher query string instead of running it

    Returns the next screen name, the query string when test=True, or False
    when no candidate exists.
    """
    cacheKey = '_'.join(['nextnearest', job, user, root_task])
    # BUG FIX: cache.get returns None for a missing key; the original decoded
    # unconditionally and crashed with AttributeError on a cold cache.
    cached = cache.get(cacheKey)
    nextUserDump = cached.decode('utf-8') if cached is not None else ''
    next_users = False
    if nextUserDump:
        try:
            next_users = json.loads(nextUserDump)
        except ValueError:
            # Corrupt cache entry -- fall through to a fresh query.
            next_users = []
    if next_users:
        logging.info('*** NEXT ' + job + ': ' + ', '.join(next_users) +
                     ' from ' + user + ' ***')
        # Pop the head of the queue and put the remainder back in the cache.
        next_user = next_users.pop(0)
        cache.set(cacheKey, json.dumps(next_users))
        return next_user

    # Build the Cypher query: candidates are accounts two FOLLOWS hops from
    # 'user', below the size caps, not protected/defunct, and with less than
    # half of their friends/followers/tweets already stored.
    # NOTE(review): 'user' is interpolated directly into the query; fine for
    # trusted screen names, but worth switching to a $user parameter if it
    # can ever contain quote characters.
    query_str = "MATCH (a:twitter_user {{screen_name: '{}'}})-[:FOLLOWS]-(d:twitter_user)".format(
        user)
    query_str += ' MATCH (b:twitter_user)-[:FOLLOWS]-(d) WITH DISTINCT b '
    if job == 'friends':
        query_str += 'MATCH (b)-[:FOLLOWS]->(c:twitter_user) '
    if job == 'followers':
        query_str += 'MATCH (b)<-[:FOLLOWS]-(c:twitter_user) '
    if job == 'tweets':
        query_str += 'MATCH (b)-[:TWEETED]->(c:tweet) '
    query_str += 'WITH b, COUNT(c) AS n '
    query_str += 'WHERE b.friends_count < {} AND b.followers_count < {} ' \
                 'AND NOT EXISTS (b.protected) AND NOT EXISTS (b.defunct) '.format(max_friends, max_followers)
    if job == 'friends':
        query_str += 'AND n < b.friends_count/2 '
    if job == 'followers':
        query_str += 'AND n < b.followers_count/2 '
    if job == 'tweets':
        query_str += 'AND b.statuses_count > 0 AND n < b.statuses_count/2 AND n<{} '.format(
            max_tweets)
    query_str += 'RETURN b.screen_name ORDER BY b.{}_last_scraped LIMIT {}'.format(
        job, limit)

    logging.info('*** Looking for ' + job + ' for ' + user + ' ***')

    if test:
        return query_str

    try:
        with db.session() as session:
            with session.begin_transaction() as tx:
                result = tx.run(query_str)
                next_users = [record.values()[0] for record in result]
    except Exception:
        # Best-effort: a failed query yields no candidates, but log why
        # instead of the original silent bare 'except:'.
        logging.exception('neo4j query failed for ' + job + ' of ' + user)
        next_users = []

    if next_users:
        logging.info('*** NEXT ' + job + ': ' + ', '.join(next_users) +
                     ' from ' + user + ' ***')
        next_user = next_users.pop(0)
        cache.set(cacheKey, json.dumps(next_users))
        return next_user
    else:
        logging.info('No more ' + job + ' for ' + user)

    return False
Esempio n. 5
0
def stop_stream(self, task_id):
    """Stop the twitter filter stream started by the task with the given id.

    Positional arguments:
    task_id -- id of the task that started the stream
    """
    # stop the stream running in the stream started by the given task id
    logger.info('***Stopping twitter filter streamer ***')
    # BUG FIX: cache.get returns None when no stream was recorded for this
    # task; the original decoded unconditionally and raised AttributeError.
    stream_id = cache.get("stream_id_" + task_id)
    if stream_id is None:
        logger.info('*** NO STREAM FOUND FOR TASK %s ***' % (task_id,))
        return
    revoke(stream_id.decode('utf-8'), terminate=True)
Esempio n. 6
0
def stop_scrape(self, task_id):
    """Stop an executing scrape on its next loop.

    Looks up which scrape mode (if any) is active for the task and clears
    that mode's keep-going flag in the cache.
    """
    mode = cache.get('scrape_mode_' + task_id)
    if not mode:
        return
    cache.set(mode.decode('utf-8') + '_scrape_' + task_id, 'false')
Esempio n. 7
0
 def connected(self):
     """Return the cached connected-flag for this stream (raw cache value, falsy when absent)."""
     return cache.get('stream_' + self.stream_id + '_connected')