Example 1
def doUserScrape(self, credentials=False):
    """Retrieve the next timelines, friends and followers for the next accounts in the user scrape. """
    keep_going = cache.get('user_scrape_' + self.request.root_id)
    if (not keep_going) or keep_going.decode('utf-8') != 'true':
        logger.info('*** STOPPED USER SCRAPE ***')
        # mark the crawl as finished on its crawl node
        db = get_neo_driver()
        update_crawl(db, crawl_task=self.request.root_id, status='done')
        db.close()
        return False

    user = cache.get('scrape_user_' + self.request.root_id).decode('utf-8')
    logger.info('*** SCRAPING USER: %s... ***' % (user,))

    this_friend = cache.get('scrape_friends_' + self.request.root_id).decode('utf-8')
    if (not this_friend) or this_friend == 'done':
        db = get_neo_driver()
        next_friends = nextNearest(db, user, 'friends', self.request.root_id)
        db.close()
        if next_friends:
            cache.set('scrape_friends_' + self.request.root_id, 'running')
            getTwitterConnections.delay(next_friends, cacheKey='scrape_friends_' + self.request.root_id)
    else:
        logger.info('*** FRIENDS BUSY ***')

    this_follower = cache.get('scrape_followers_' + self.request.root_id).decode('utf-8')
    if (not this_follower) or this_follower == 'done':
        db = get_neo_driver()
        next_followers = nextNearest(db, user, 'followers', self.request.root_id)
        db.close()
        if next_followers:
            cache.set('scrape_followers_' + self.request.root_id, 'running')
            getTwitterConnections.delay(next_followers, friends=False, cacheKey='scrape_followers_' + self.request.root_id)
    else:
        logger.info('*** FOLLOWERS BUSY ***')

    this_tweet = cache.get('scrape_tweets_' + self.request.root_id).decode('utf-8')
    if (not this_tweet) or this_tweet == 'done':
        db = get_neo_driver()
        next_tweets = nextNearest(db, user, 'tweets', self.request.root_id)
        db.close()
        if next_tweets:
            cache.set('scrape_tweets_' + self.request.root_id, 'running')
            getTweets.delay(next_tweets, maxTweets=1000, credentials=credentials, cacheKey='scrape_tweets_' + self.request.root_id)
    else:
        logger.info('*** TWEETS BUSY ***')

    if 'running' in [cache.get(k).decode('utf-8') for k in
                     ['scrape_friends_' + self.request.root_id, 'scrape_followers_' + self.request.root_id,
                      'scrape_tweets_' + self.request.root_id]]:
        doUserScrape.apply_async(kwargs={'credentials': credentials}, countdown=30)
    else:
        cache.set('user_scrape_' + self.request.root_id, 'false')
        cache.set('scrape_mode_' + self.request.root_id, '')
        logger.info('*** FINISHED SCRAPING USER: %s ***' % (user,))
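Celery note: apply_async passes task arguments through args/kwargs, while scheduling options such as countdown are given directly, so the re-poll above forwards credentials inside kwargs. A minimal sketch contrasting apply_async with the delay() shorthand:

# Re-poll in 30 seconds, forwarding the task's own credentials kwarg.
doUserScrape.apply_async(kwargs={'credentials': credentials}, countdown=30)
# delay() is the shorthand form when no scheduling options are needed.
doUserScrape.delay(credentials=credentials)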
Example 2
def cluster(seed, seed_type, query_name):
    logger.info('*** START CLUSTERING: seed %s, seed_type %s, query_name %s ***' % (seed, seed_type, query_name))
    if seed_type == 'twitter_user':
        seed_id_name = 'screen_name'
        if query_name == "TransFoF":
            query = twitterTransFofQuery(seed)
        elif query_name == 'FoF':
            # NOTE: currently builds the same query as the 'TransFoF' branch.
            query = twitterTransFofQuery(seed)
        else:
            logger.warning('*** CLUSTERING: query name %s not yet implemented for seed type %s ***' % (query_name, seed_type))
            return
    else:
        logger.warning('*** CLUSTERING: not yet implemented for seed type %s ***' % seed_type)
        return

    db = get_neo_driver()

    logger.info('*** CLUSTERING: get matrix for seed %s ***' % seed)
    matrix_labels_and_results = twitterMatrix(db, query)
    logger.info('*** CLUSTERING: find clusters for seed %s ***' % seed)
    cluster_results = clusterize(matrix_labels_and_results[1])
    logger.info('*** CLUSTERING: label clusters for seed %s ***' % seed)
    labelled_clusters = labelClusters(cluster_results[0], matrix_labels_and_results[0])

    if seed_type == 'twitter_user':
        logger.info('*** CLUSTERING: push seed %s ***' % seed)
        user_clusters_to_neo(db, labelled_clusters, [seed], query)
    else:
        logger.warning('*** CLUSTERING: not yet implemented for seed type %s ***' % seed_type)

    db.close()
    logger.info('*** CLUSTERING FINISHED: seed %s, seed_type %s, query_name %s ***' % (seed, seed_type, query_name))
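A hedged invocation sketch (the seed screen name below is a placeholder): clustering a Twitter user seed with the transitive friend-of-friend query:

# Hypothetical call: build the TransFoF matrix for a seed account, cluster it,
# and write the labelled clusters back to Neo4j.
cluster('example_user', 'twitter_user', 'TransFoF')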
Example 3
def getTweets(self, user, maxTweets=3000, count=0, tweetId=0, cacheKey=False, credentials=False):
    """Get tweets from the timeline of the given user, push them to Neo4J.

    Positional arguments:
    user -- The screen_name of the user

    Keyword arguments:
    maxTweets -- The maximum number of tweets to retrieve
    count -- The number of tweets already retrieved, set when the task calls itself
    tweetId -- The maximum tweet ID to retrieve, set when the task calls itself
    cacheKey -- a Redis key that identifies an on-going task to grab a user's timeline
    credentials -- Twitter API credentials passed through to RatedTwitter

    """
    logger.info('Executing getTweets task id {0.id}, args: {0.args!r} kwargs: {0.kwargs!r}'.format(self.request))
    logger.info('task parent id {0.parent_id}, root id {0.root_id}'.format(self.request))
    api = RatedTwitter(credentials=credentials)
    limit = api.get_user_timeline_wait()
    if limit:
        logger.info('*** TWITTER RATE-LIMITED: statuses.user_timeline: %s:%s ***' % (user, str(count)))
        raise getTweets.retry(countdown=limit)
    else:
        args = {'screen_name': user, 'exclude_replies': False, 'include_rts': True, 'trim_user': False, 'count': 200}
        if tweetId:
            args['max_id'] = tweetId

        okay, result = api.get_user_timeline(**args)

        if okay:
            logger.info('*** TWITTER USER_TIMELINE: %s:%s ***' % (user, str(tweetId)))
            if result:
                newCount = count + len(result)
                if maxTweets and newCount > maxTweets:  # No need for the task to call itself again.
                    pushTweets.delay(result, user, cacheKey=cacheKey)  # Give pushTweets the cache-key to end the job.
                    return
                pushTweets.delay(result, user)

                newTweetId = min([t['id'] for t in result]) - 1
                # Not done yet, the task calls itself with an updated count and tweetId.
                getTweets.delay(user, maxTweets=maxTweets, count=newCount, tweetId=newTweetId, cacheKey=cacheKey, credentials=credentials)
            else:
                pushTweets.delay([], user, cacheKey=cacheKey)  # Nothing more found, so tell pushTweets the job is done.
        else:
            if result == 'limited':
                # Rate-limited: retry later without marking the tweet job as done.
                raise getTweets.retry(countdown=api.get_user_timeline_wait())
            if result == '404':
                db = get_neo_driver()
                setUserDefunct(db, user)
                db.close()
            cache.set('scrape_tweets_' + self.request.root_id, 'done')
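A hedged usage sketch (the screen name, credentials object and root id below are placeholders): the task is queued once and then re-queues itself until the timeline is exhausted or maxTweets is reached:

# Hypothetical kick-off: fetch up to 1000 tweets for one account and let pushTweets
# mark the per-crawl Redis key (the same cacheKey convention used by doUserScrape).
getTweets.delay('example_user', maxTweets=1000, credentials=creds,
                cacheKey='scrape_tweets_' + root_id)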
Example 4
def getTwitterConnections(self, user, friends=True, cursor=-1, credentials=False, cacheKey=False):
    """Get the connections of the given user, push them to Neo4J.

    Positional arguments:
    user -- The screen_name of the user

    Keyword arguments:
    friends -- "twits" are the user's friends if True (the default); otherwise they are followers
    cacheKey -- a Redis key that identifies an on-going task to grab a user's friends or followers
    cursor -- Id of the next block of connections to retrieve, set when the task calls itself
    """
    api = RatedTwitter(credentials=credentials)
    if friends:
        method = api.get_friends_list
        limit = api.get_friends_list_wait()
        method_name = 'get_friends_list'
    else:
        method = api.get_followers_list
        limit = api.get_followers_list_wait()
        method_name = 'get_followers_list'
    
    if limit:
        logger.info('*** TWITTER RATE-LIMITED: %s:%s ***' % (method_name, str(cursor)))
        raise getTwitterConnections.retry(countdown=limit)
    else:
        okay, result = method(screen_name=user, cursor=cursor, count=200)  # We can get a maximum of 200 connections at once.
        if okay:
            logger.info('*** TWITTER CURSOR: %s:%s:%s ***' % (method_name, user, str(cursor)))
            twits = result['users']
            next_cursor = result.get('next_cursor', False)
            if next_cursor: # Unless the next cursor is 0, we're not done yet.
                getTwitterConnections.delay(user, friends=friends, cursor=next_cursor, cacheKey=cacheKey, credentials=credentials)
                pushTwitterConnections.delay(twits, user, friends=friends)
            else:
                pushTwitterConnections.delay(twits, user, friends=friends, cacheKey=cacheKey) # All done, send the cacheKey.
                    
        else:
            if result == 'limited':
                raise getTwitterConnections.retry(exc=Exception('Twitter rate-limited', method_name), countdown=API_TIMEOUT)
            if result == '404':
                db = get_neo_driver()
                setUserDefunct(db, user)
                db.close()
                if friends:
                    cache.set('scrape_friends_' + self.request.root_id, 'done')
                else:
                    cache.set('scrape_followers_' + self.request.root_id, 'done')
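A hedged usage sketch (placeholder screen name): one task serves both directions of the follow graph, selected by the friends flag:

# Hypothetical calls: fetch the accounts the user follows, then the accounts following them.
getTwitterConnections.delay('example_user')                 # friends (the default)
getTwitterConnections.delay('example_user', friends=False)  # followers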
Example 5
def startUserScrape(self, user, credentials=False):
    """Start scraping around the given user."""
    logger.info('*** STARTED SCRAPING: USER: %s ***' % (user,))
    cache.set('user_scrape_' + self.request.root_id, 'true')
    cache.set('scrape_mode_' + self.request.root_id, 'user')
    cache.set('scrape_user_' + self.request.root_id, user)

    # add crawl node for this user as centre of scrape
    db = get_neo_driver()
    start_user_crawl(db, user, crawl_task=self.request.root_id, status='initiated')
    db.close()

    for key in ['scrape_friends', 'scrape_followers', 'scrape_tweets']:
        cache.set(key + '_' + self.request.root_id, '')
        
    for job in ['friends', 'followers', 'tweets']:
        cache_key = '_'.join(['nextnearest', job, user, self.request.root_id])
        cache.set(cache_key, '')
        
    doUserScrape.delay(credentials=credentials)
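A hedged sketch of kicking off a crawl (placeholder values): when this entry task is queued directly rather than from inside another task, its task id becomes the workflow's root_id, which every downstream task uses to namespace its Redis keys and crawl node:

# Hypothetical kick-off; `creds` stands in for whatever RatedTwitter accepts as credentials.
result = startUserScrape.delay('example_user', credentials=creds)
print(result.id)  # equals self.request.root_id inside the tasks it spawns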
Example 6
def pushRenderedTwits2Neo(self, twits):
    """Write a batch of rendered Twitter user objects to Neo4j via users2Neo."""
    db = get_neo_driver()
    users2Neo(db, twits)
    db.close()
Example 7
def pushRenderedConnections2Neo(self, user, renderedTwits, friends=True):
    """Write a user's rendered friend/follower connections to Neo4j via connections2Neo."""
    db = get_neo_driver()
    connections2Neo(db, user, renderedTwits, friends=friends)
    db.close()
Example 8
def pushRenderedTweets2Neo(self, user, tweetDump):
    """Write a user's rendered tweets to Neo4j via tweetDump2Neo."""
    db = get_neo_driver()
    tweetDump2Neo(db, user, tweetDump)
    db.close()
Example 9
def pushRenderedMultiUserTweets2Neo(self, all_tweets_dump):
    """Write rendered tweets from multiple users to Neo4j via multiUserTweetDump2Neo."""
    db = get_neo_driver()
    multiUserTweetDump2Neo(db, all_tweets_dump)
    db.close()
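These four push tasks are thin Celery wrappers around the Neo4j writer functions (users2Neo, connections2Neo, tweetDump2Neo, multiUserTweetDump2Neo). A hedged sketch of queuing one directly, assuming twits is a list of already-rendered Twitter user dicts:

# Hypothetical direct call: persist a batch of rendered Twitter user objects to Neo4j.
pushRenderedTwits2Neo.delay(twits)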