Example #1
def crawl(
    screen_names=[],
    friends_limit=10000,
    followers_limit=10000,
    depth=1,
    friends_sample=0.2, #XXX
    followers_sample=0.0,
    ):
    
    def crawlmapper(screen_name):
        if r.get(getRedisIdByScreenName(screen_name, 'crawled_in_60min')) is None:
            friends_info = getFriendsBatch(screen_name, friends_limit)
            map(lambda x:
                    r.sadd(getRedisIdByScreenName(screen_name, 'friend_ids'),
                           x['id']),
                friends_info)
            scard = r.scard(getRedisIdByScreenName(screen_name, 'friend_ids'))
            print >> sys.stderr, 'Fetched %s friend ids for %s' % (scard, screen_name)

            
            followers_info = getFollowersBatch(screen_name, followers_limit)
            map(lambda x:
                    r.sadd(getRedisIdByScreenName(screen_name, 'follower_ids'),
                           x['id']),
                followers_info)
            scard = r.scard(getRedisIdByScreenName(screen_name, 'follower_ids'))
            print >> sys.stderr, 'Fetched %s follower ids for %s' % (scard, screen_name)
            # If a fetch came back empty (likely rate-limited), flag the user as
            # crawled for an hour so the next pass reads from Redis instead
            if not friends_info or not followers_info:
                r.set(getRedisIdByScreenName(screen_name, 'crawled_in_60min'), '1')
                r.expire(getRedisIdByScreenName(screen_name, 'crawled_in_60min'), 3600)
        else:
            friends_info = map(RedisUserId2UserInfoWraper,
                               list(r.smembers(getRedisIdByScreenName(screen_name, 'friend_ids'))))
            followers_info = map(RedisUserId2UserInfoWraper,
                                 list(r.smembers(getRedisIdByScreenName(screen_name, 'follower_ids'))))
        
        return map(lambda u1: u1['screen_name'],
                   filter(lambda info:
                              (info is not None and
                               info['followers_count'] < 1000 and
                               info['friends_count'] < 1000),  # skip 'public intellectual' and zombie accounts
                          flat(map(samplemapper,
                                   [friends_info, followers_info],
                                   [friends_sample, followers_sample]))))
    
    getUserInfo(t, r, screen_names=screen_names)
    d = 0
    while d < depth:
        d += 1
        screen_names = flat(map(crawlmapper, screen_names))
        print 'crawled', len(screen_names), 'ids'
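Example #1 calls two helpers, flat and samplemapper, that none of these listings define. A minimal sketch consistent with how they are invoked above (the names come from the example; both bodies are our assumption, not the original implementations):

import itertools
import random

def flat(list_of_lists):
    # Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]
    return list(itertools.chain.from_iterable(list_of_lists))

def samplemapper(info_list, sample):
    # Keep each user-info dict with probability `sample` (0.0 to 1.0)
    return [info for info in info_list if random.random() < sample]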
Example #2
def crawl(
    screen_names,
    friends_limit=10000,
    followers_limit=10000,
    depth=1,
    friends_sample=0.3, #XXX
    followers_sample=0.3,
    ):

    response = getUserInfo(t, r, screen_names=screen_names)
    ## ymir 
    r.sadd('justin-ids', response[0]['id'])
    print "Appended to justin-ids; it now holds " + unicode(r.scard('justin-ids')) + " ids"

    for screen_name in screen_names:
        friend_ids = getFriends(screen_name, limit=friends_limit)
        follower_ids = getFollowers(screen_name, limit=followers_limit)

        friends_info = getUserInfo(t, r, user_ids=friend_ids, 
                                   sample=friends_sample)

        followers_info = getUserInfo(t, r, user_ids=follower_ids,
                                     sample=followers_sample)

        next_queue = [u['screen_name'] for u in friends_info + followers_info]

        d = 1
        while d < depth:
            d += 1
            (queue, next_queue) = (next_queue, [])
            for _screen_name in queue:
                friend_ids = getFriends(_screen_name, limit=friends_limit)
                follower_ids = getFollowers(_screen_name, limit=followers_limit)

                next_queue.extend(friend_ids + follower_ids)

                # Note that getUserInfo takes a keyword argument between
                # 0.0 and 1.0 called sample that lets you crawl only a random
                # sample of nodes at any given level of the graph; the call
                # below omits it, so everything in next_queue gets fetched

                getUserInfo(t, r, user_ids=next_queue)
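The sampling the comment describes can also be approximated outside getUserInfo; a minimal sketch of the idea (sample_ids is our hypothetical helper, not part of twitter__util):

import random

def sample_ids(ids, sample=1.0):
    # Keep a random fraction `sample` of the ids, mirroring what the
    # comment above describes getUserInfo's sample kwarg as doing
    if sample >= 1.0:
        return ids
    return [i for i in ids if random.random() < sample]

# e.g.: getUserInfo(t, r, user_ids=sample_ids(next_queue, 0.2))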
Example #3
def crawl(
    screen_names,
    friends_limit=10000,
    followers_limit=10000,
    depth=1,
    friends_sample=0.2,  # XXX
    followers_sample=0.0,
):

    getUserInfo(t, r, screen_names=screen_names)
    for screen_name in screen_names:
        friend_ids = getFriends(screen_name, limit=friends_limit)
        follower_ids = getFollowers(screen_name, limit=followers_limit)

        friends_info = getUserInfo(t, r, user_ids=friend_ids, sample=friends_sample)

        followers_info = getUserInfo(t, r, user_ids=follower_ids, sample=followers_sample)

        next_queue = [u["screen_name"] for u in friends_info + followers_info]

        d = 1
        while d < depth:
            d += 1
            (queue, next_queue) = (next_queue, [])
            for _screen_name in queue:
                friend_ids = getFriends(_screen_name, limit=friends_limit)
                follower_ids = getFollowers(_screen_name, limit=followers_limit)

                next_queue.extend(friend_ids + follower_ids)

                # Note that getUserInfo takes a keyword argument between
                # 0.0 and 1.0 called sample that lets you crawl only a random
                # sample of nodes at any given level of the graph

                getUserInfo(t, r, user_ids=next_queue)
Example #4
def crawl(
    screen_names,
    friends_limit=10000,
    followers_limit=100000,
    depth=1,
    friends_sample=0.2, #XXX
    followers_sample=0.2,
    ):

    logging.info("Getting user info")
    getUserInfo(t, r, screen_names=screen_names)
    for screen_name in screen_names:
        
        friend_ids = getFriends(screen_name, limit=friends_limit)
        logging.info("Retrieved %d friends ids", len(friend_ids))
        
        follower_ids = getFollowers(screen_name, limit=followers_limit)
        logging.info("Retrieved %d follower ids", len(follower_ids))

        friends_info = getUserInfo(t, r, user_ids=friend_ids,
                                   sample=friends_sample)
        logging.info("Retrieved user info for %d friends", len(friends_info))

        logging.info("Getting follower info")
        followers_info = getUserInfo(t, r, user_ids=follower_ids,
                                     sample=followers_sample)
        logging.info("Retrieved user info for %d followers", len(followers_info))

        next_queue = [u['screen_name'] for u in friends_info + followers_info]

        d = 1
        while d < depth:
            logging.info("while loop: depth = %d, d = %d", depth, d)
            d += 1
            (queue, next_queue) = (next_queue, [])
            for _screen_name in queue:
                logging.info("while loop: screen_name: %s:", _screen_name)
                friend_ids = getFriends(_screen_name, limit=friends_limit)
                follower_ids = getFollowers(_screen_name, limit=followers_limit)

                next_queue.extend(friend_ids + follower_ids)

                # Note that getUserInfo takes a keyword argument between
                # 0.0 and 1.0 called sample that lets you crawl only a random
                # sample of nodes at any given level of the graph
                logging.info("while loop: getting user info")
                getUserInfo(t, r, user_ids=next_queue)
Example #5
def crawl(
    screen_names,
    friends_limit=10000,
    followers_limit=10000,
    depth=1,
    friends_sample=0.2,  #XXX
    followers_sample=0.0,
):

    getUserInfo(t, r, screen_names=screen_names)
    for screen_name in screen_names:
        friend_ids = getFriends(screen_name, limit=friends_limit)
        follower_ids = getFollowers(screen_name, limit=followers_limit)

        friends_info = getUserInfo(t,
                                   r,
                                   user_ids=friend_ids,
                                   sample=friends_sample)

        followers_info = getUserInfo(t,
                                     r,
                                     user_ids=follower_ids,
                                     sample=followers_sample)

        next_queue = [u['screen_name'] for u in friends_info + followers_info]

        d = 1
        while d < depth:
            d += 1
            (queue, next_queue) = (next_queue, [])
            for _screen_name in queue:
                friend_ids = getFriends(_screen_name, limit=friends_limit)
                follower_ids = getFollowers(_screen_name,
                                            limit=followers_limit)

                next_queue.extend(friend_ids + follower_ids)

                # Note that getUserInfo takes a keyword argument between
                # 0.0 and 1.0 called sample that lets you crawl only a random
                # sample of nodes at any given level of the graph

                getUserInfo(t, r, user_ids=next_queue)
Example #6
# -*- coding: utf-8 -*-

import sys
import json
import redis
from twitter__login import login

# A makeTwitterRequest call through to the /users/lookup
# resource, which accepts a comma-separated list of up
# to 100 screen names. Details are fairly uninteresting.
# See also http://dev.twitter.com/doc/get/users/lookup
from twitter__util import getUserInfo

if __name__ == "__main__":
    screen_names = sys.argv[1:]

    t = login()
    r = redis.Redis()

    print json.dumps(getUserInfo(t, r, screen_names=screen_names), indent=4)
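Since /users/lookup accepts at most 100 screen names per request, longer inputs have to be chunked; a minimal sketch of that batching (the helper and the sample input are ours, not twitter__util's):

screen_names = ['user%d' % i for i in range(250)]  # hypothetical input

def batches(seq, size=100):
    # Yield successive slices of at most `size` items; ','.join(batch)
    # then forms the comma-separated request parameter
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

for batch in batches(screen_names):
    print ','.join(batch)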
Example #7
# -*- coding: utf-8 -*-

import sys
import json
import redis
from twitter__login import login

# A makeTwitterRequest call through to the /users/lookup 
# resource, which accepts a comma-separated list of up
# to 100 screen names. Details are fairly uninteresting. 
# See also http://dev.twitter.com/doc/get/users/lookup
from twitter__util import getUserInfo

if __name__ == "__main__":
    screen_names = sys.argv[1:]

    t = login()
    r = redis.Redis()

    print json.dumps(
            getUserInfo(t, r, screen_names=screen_names),
            indent=4
          )
Example #8
# -*- coding: utf-8 -*-

import sys
import json
import redis
from twitter__login import login

# A makeTwitterRequest call through to the /users/lookup 
# resource, which accepts a comma-separated list of up
# to 100 screen names. Details are fairly uninteresting. 
# See also http://dev.twitter.com/doc/get/users/lookup
#
# JW: adapted original code to retrieve user_ids from redis
# and pass them to the getUserInfo function.
#
from twitter__util import getUserInfo

t = login()
r = redis.Redis()

friend_ids = list(r.smembers("screen_name$timoreilly$friend_ids"))

user_info = getUserInfo(t, r, user_ids=friend_ids)

print json.dumps(user_info, indent=4)
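The hard-coded key above hints at the scheme getRedisIdByScreenName builds; a sketch of that assumption (inferred from the literal key, not copied from twitter__util):

def getRedisIdByScreenName(screen_name, key_name):
    # Assumed '$'-delimited key scheme, inferred from
    # "screen_name$timoreilly$friend_ids" above
    return 'screen_name$%s$%s' % (screen_name, key_name)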
Example #9
# -*- coding: utf-8 -*-

import sys
import json
import redis
from twitter__login import login
import functools

# A makeTwitterRequest call through to the /users/lookup 
# resource, which accepts a comma-separated list of up
# to 100 screen names. Details are fairly uninteresting. 
# See also http://dev.twitter.com/doc/get/users/lookup
from twitter__util import getUserInfo, _getFriendsOrFollowersUsingFunc

if __name__ == "__main__":
    user_ids = sys.argv[1:]

    t = login()
    r = redis.Redis()

    info = getUserInfo(t, r, user_ids=user_ids)
    print json.dumps(info, indent=4)
Example #10
import json
import redis
from twitter__login import login

# A makeTwitterRequest call through to the /users/lookup 
# resource, which accepts a comma-separated list of up
# to 100 screen names. Details are fairly uninteresting. 
# See also http://dev.twitter.com/doc/get/users/lookup
from twitter__util import getUserInfo

if __name__ == "__main__":
    # XXX: iPython Notebook cannot prompt for input
    screen_names = ['timoreilly', 'socialwebmining', 'ptwobrussell']

    t = login()
    r = redis.Redis()

    print json.dumps(
            getUserInfo(t, r, screen_names=screen_names),
            indent=4
          )

# <markdowncell>

# Example 4-7. Finding common friends/followers for multiple Twitterers, with output that's easier on the eyes (friends_followers__friends_followers_in_common.py)

# <codecell>

import sys
import redis

from twitter__util import getRedisIdByScreenName

# A pretty-print function for numbers (the cell is truncated here; this
# comma-grouping stand-in is our completion, assuming Python 2.7)
def pp(n):
    return '{:,}'.format(int(n))
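Example 4-7's caption above concerns common friends/followers; with the key scheme used throughout these listings, that reduces to a Redis set intersection. A sketch of the core operation, relying on the imports in the cell above (the input names are hypothetical):

r = redis.Redis()
screen_names = ['timoreilly', 'ptwobrussell']

# Intersect the cached friend-id sets of the given users
common_friend_ids = r.sinter([getRedisIdByScreenName(sn, 'friend_ids')
                              for sn in screen_names])
print len(common_friend_ids), 'friend ids in common'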
Example #11
def crawl(
    screen_names=[],
    friends_limit=10000,
    followers_limit=10000,
    depth=1,
    friends_sample=0.2,  #XXX
    followers_sample=0.0,
):
    def crawlmapper(screen_name):
        if r.get(getRedisIdByScreenName(screen_name,
                                        'crawled_in_60min')) is None:
            friends_info = getFriendsBatch(screen_name, friends_limit)
            map(lambda x: r.sadd(getRedisIdByScreenName(screen_name, 'friend_ids'),
                                 x['id']),
                friends_info)
            scard = r.scard(getRedisIdByScreenName(screen_name, 'friend_ids'))
            print >> sys.stderr, 'Fetched %s friend ids for %s' % (scard, screen_name)

            followers_info = getFollowersBatch(screen_name, followers_limit)
            map(lambda x: r.sadd(getRedisIdByScreenName(screen_name, 'follower_ids'),
                                 x['id']),
                followers_info)
            scard = r.scard(getRedisIdByScreenName(screen_name, 'follower_ids'))
            print >> sys.stderr, 'Fetched %s follower ids for %s' % (scard, screen_name)
            if not friends_info or not followers_info:
                r.set(getRedisIdByScreenName(screen_name, 'crawled_in_60min'), '1')
                r.expire(getRedisIdByScreenName(screen_name, 'crawled_in_60min'),
                         3600)
        else:
            friends_info = map(
                RedisUserId2UserInfoWraper,
                list(r.smembers(getRedisIdByScreenName(screen_name, 'friend_ids'))))
            followers_info = map(
                RedisUserId2UserInfoWraper,
                list(r.smembers(getRedisIdByScreenName(screen_name, 'follower_ids'))))

        return map(lambda u1: u1['screen_name'],
                   filter(lambda info: (info is not None and
                                        info['followers_count'] < 1000 and
                                        info['friends_count'] < 1000),  # skip 'public intellectual' and zombie accounts
                          flat(map(samplemapper,
                                   [friends_info, followers_info],
                                   [friends_sample, followers_sample]))))

    getUserInfo(t, r, screen_names=screen_names)
    d = 0
    while d < depth:
        d += 1
        screen_names = flat(map(crawlmapper, screen_names))
        print 'crawled', len(screen_names), 'ids'
Example #12
import sys
import functools
import redis
import json
from twitter__login import login
from twitter__util import getUserInfo
from twitter__util import _getFriendsOrFollowersUsingFunc
from twitter__util import getRedisIdByScreenName
from twitter__util import getRedisIdByUserId

SCREEN_NAME = sys.argv[1]
MAXINT = sys.maxint

t = login()
r = redis.Redis()

# get info and friends for central user
getUserInfo(t, r, [SCREEN_NAME])
getFriends = functools.partial(_getFriendsOrFollowersUsingFunc,
                               t.friends.ids, 'friend_ids', t, r)
getFollowers = functools.partial(_getFriendsOrFollowersUsingFunc,
                                 t.followers.ids, 'follower_ids', t, r)

# get friends and followers of central user
friend_ids = getFriends(SCREEN_NAME)
follower_ids = getFollowers(SCREEN_NAME)

# do union of friends and followers
ids = list(r.sunion(getRedisIdByScreenName(SCREEN_NAME, 'friend_ids'),
                    getRedisIdByScreenName(SCREEN_NAME, 'follower_ids')))

# get user info for friends and followers
getUserInfo(t, r, user_ids=ids)

# get friends of friends and followers (the original snippet is truncated
# here; the loop body below is our sketch: resolve each id to a screen name,
# then fetch that user's friends with the partials defined above)
for user_id in ids:
    info = getUserInfo(t, r, user_ids=[user_id])
    if info:
        getFriends(info[0]['screen_name'])
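The friends/followers union above can also be materialized inside Redis rather than pulled client-side; a sketch using redis-py's sunionstore, building on the names defined in this example (the network_ids key name is our invention):

# Store the union under its own key, then read it back
network_key = 'screen_name$%s$network_ids' % SCREEN_NAME
r.sunionstore(network_key,
              [getRedisIdByScreenName(SCREEN_NAME, 'friend_ids'),
               getRedisIdByScreenName(SCREEN_NAME, 'follower_ids')])
ids = list(r.smembers(network_key))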
Example #13
for screen_name in screen_names:
    if screen_name is not None:

        print >> sys.stderr, 'Getting friends for %s...' % (screen_name, )
        friend_ids = getFriends(screen_name, limit=200)
        print >> sys.stderr, 'Getting followers for %s...' % (screen_name, )
        follower_ids = getFollowers(screen_name, limit=200)
        # take the union of the friend and follower sets stored by the calls above
        union = r.sunion([getRedisIdByScreenName(screen_name, 'friend_ids'),
                          getRedisIdByScreenName(screen_name, 'follower_ids')])
        # convert from set to list; we need just 200 of them
        union = list(union)[:200]

        # get info for all 200; needed for the location
        friends_info = getUserInfo(t, r, user_ids=union, sample=1.0)
		

		print "Now harvesting ", screen_name,"'s friends subgraphs"
		for current_friend in friends_info:
			if current_friend != None:

				print "+",current_friend['screen_name']," From ",
				if current_friend['location'] != None and current_friend['location']!= "" :
					print current_friend['location'].encode('utf-8')
				else:
					print " "
				
				friend_ids = getFriends(current_friend['screen_name'], limit=200)

Example #14
# -*- coding: utf-8 -*-

import sys
import json
import redis
from twitter__login import login
from twitter__util import getUserInfo

if __name__ == "__main__":
    screen_names = sys.argv[1:]
    
    t = login()
    r = redis.Redis()
    print json.dumps(getUserInfo(t, r, screen_names=screen_names), indent=4)