Exemple #1
0
def snowball_following(poi_db, net_db, level, check='N'):
    #Processing max 200 users each time.
    start_level = level
    while True:
        count = poi_db.count({'level': start_level, 
                              'protected': False, 
                              'following_scrape_flag': {'$exists': False}})
        if count == 0:
            return False
        else:
            # print 'have user', count
            for user in poi_db.find({'level': start_level,
                                     'protected': False,
                                     'following_scrape_flag':
                                         {'$exists': False}},
                                    ['id_str']).limit(min(200, count)):
                # print 'a new user'
                next_cursor = -1
                params = {'user_id': user['id_str'], 'count': 5000, 'stringify_ids':True}
                # followee getting
                while next_cursor != 0:
                    params['cursor'] = next_cursor
                    followees = get_followings(params)
                    if followees:
                        followee_ids = followees['ids']
                        list_size = len(followee_ids)
                        length = int(math.ceil(list_size/100.0))
                        # print length
                        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'Process followings', list_size, 'for user', user['id_str']
                        for index in xrange(length):
                            index_begin = index*100
                            index_end = min(list_size, index_begin+100)
                            profiles = lookup.get_users_info(followee_ids[index_begin:index_end])
                            print 'user prof:', index_begin, index_end, len(profiles)
                            # if profiles:
                            for profile in profiles:
                                check_flag = profiles_check.check_user(profile, check)
                                if check_flag:
                                    profile['following_prelevel_node'] = user['id_str']
                                    profile['level'] = start_level+1
                                    try:
                                        poi_db.insert(profile)
                                    except pymongo.errors.DuplicateKeyError:
                                        pass
                                    try:
                                        net_db.insert({'user': int(profile['id_str']), 'follower': int(user['id_str']),
                                                   'scraped_at': datetime.datetime.now()})
                                    except pymongo.errors.DuplicateKeyError:
                                        pass
                        # prepare for next iterator
                        next_cursor = followees['next_cursor']
                    else:
                        break
                poi_db.update_one({'id': int(user['id_str'])}, {'$set':{"following_scrape_flag": True
                                                    }}, upsert=False)
            return True
Exemple #2
0
def get_tweet_retweeters(tweet_id, poi_db, check='N'):
    next_cursor = -1
    params = {'id': tweet_id, 'stringify_ids': True}
    # followee getting
    while next_cursor != 0:
        params['cursor'] = next_cursor
        retweeters = get_retweeters(params)
        if retweeters:
            retweeter_ids = retweeters['ids']
            print 'Retweeters size', len(retweeter_ids)
            profiles = lookup.get_users_info(retweeter_ids)
            # if profiles:
            for profile in profiles:
                check_flag = profiles_check.check_user(profile, check)
                if check_flag:
                    try:
                        poi_db.insert(profile)
                    except pymongo.errors.DuplicateKeyError:
                        pass
        # prepare for next iterator
        next_cursor = retweeters['next_cursor']
Exemple #3
0
def get_tweet_retweeters(tweet_id, poi_db, check='N'):
    next_cursor = -1
    params = {'id': tweet_id, 'stringify_ids': True}
    # followee getting
    while next_cursor != 0:
        params['cursor'] = next_cursor
        retweeters = get_retweeters(params)
        if retweeters:
            retweeter_ids = retweeters['ids']
            print 'Retweeters size', len(retweeter_ids)
            profiles = lookup.get_users_info(retweeter_ids)
            # if profiles:
            for profile in profiles:
                check_flag = profiles_check.check_user(profile, check)
                if check_flag:
                    try:
                        poi_db.insert(profile)
                    except pymongo.errors.DuplicateKeyError:
                        pass
        # prepare for next iterator
        next_cursor = retweeters['next_cursor']
Exemple #4
0
def snowball_follower(poi_db, net_db, level, check='N'):
    #Processing max 200 users each time.
    start_level = level
    while True:
        count = poi_db.find_one({
            'level': start_level,
            'protected': False,
            'follower_scrape_flag': {
                '$exists': False
            }
        })
        if count is None:
            return False
        else:
            for user in poi_db.find(
                {
                    'level': start_level,
                    'protected': False,
                    'follower_scrape_flag': {
                        '$exists': False
                    }
                }, ['id_str']).limit(200):
                next_cursor = -1
                params = {
                    'user_id': user['id_str'],
                    'count': 5000,
                    'stringify_ids': True
                }
                # follower getting
                while next_cursor != 0:
                    params['cursor'] = next_cursor
                    followers = get_followers(params)
                    if followers:
                        follower_ids = followers['ids']
                        list_size = len(follower_ids)
                        length = int(math.ceil(list_size / 100.0))
                        # print length
                        print datetime.datetime.now().strftime(
                            "%Y-%m-%d-%H-%M-%S"
                        ), 'Process followers', list_size, 'for user', user[
                            'id_str']
                        for index in xrange(length):
                            index_begin = index * 100
                            index_end = min(list_size, index_begin + 100)
                            profiles = lookup.get_users_info(
                                follower_ids[index_begin:index_end])

                            if profiles:
                                print 'user prof:', index_begin, index_end, len(
                                    profiles)
                                for profile in profiles:
                                    check_flag = profiles_check.check_user(
                                        profile, check)
                                    if check_flag:
                                        profile[
                                            'follower_prelevel_node'] = user[
                                                'id_str']
                                        profile['level'] = start_level + 1
                                        try:
                                            poi_db.insert(profile)
                                        except pymongo.errors.DuplicateKeyError:
                                            pass
                                        try:
                                            net_db.insert({
                                                'user':
                                                int(user['id_str']),
                                                'follower':
                                                int(profile['id_str']),
                                                'scraped_at':
                                                datetime.datetime.now().
                                                strftime(
                                                    '%a %b %d %H:%M:%S +0000 %Y'
                                                )
                                            })
                                        except pymongo.errors.DuplicateKeyError:
                                            pass
                    # prepare for next iterator
                        next_cursor = followers['next_cursor']
                    else:
                        break
                poi_db.update_one({'id': int(user['id_str'])},
                                  {'$set': {
                                      "follower_scrape_flag": True
                                  }},
                                  upsert=False)
            continue
Exemple #5
0
def snowball_following_proportion(poi_db,
                                  net_db,
                                  level,
                                  check='N',
                                  proportation=0.1):
    #Processing max 200 users each time., only retrieve 10% followings
    start_level = level
    while True:
        count = poi_db.find_one({
            'level': start_level,
            'protected': False,
            'following_scrape_flag': {
                '$exists': False
            }
        })
        if count is None:
            return False
        else:
            # print 'have user', count
            for user in poi_db.find(
                {
                    'level': start_level,
                    'protected': False,
                    'following_scrape_flag': {
                        '$exists': False
                    }
                }, ['id_str', 'friends_count']).limit(200):
                # print 'a new user'
                following_limit = int(user['friends_count'] * proportation)
                next_cursor = -1
                params = {'user_id': user['id_str'], 'stringify_ids': True}
                # followee getting
                while next_cursor != 0 and following_limit > 0:
                    params['cursor'] = next_cursor
                    print user['id_str'], ' following limit ', following_limit
                    params['count'] = min(following_limit, 5000)
                    followees = get_followings(params)
                    if followees:
                        followee_ids = followees['ids']
                        list_size = len(followee_ids)
                        following_limit -= list_size
                        length = int(math.ceil(list_size / 100.0))
                        # print length
                        print datetime.datetime.now().strftime(
                            "%Y-%m-%d-%H-%M-%S"
                        ), 'Process followings', list_size, 'for user', user[
                            'id_str']
                        for index in xrange(length):
                            index_begin = index * 100
                            index_end = min(list_size, index_begin + 100)
                            profiles = lookup.get_users_info(
                                followee_ids[index_begin:index_end])
                            if profiles:
                                print 'user prof:', index_begin, index_end, len(
                                    profiles)
                                for profile in profiles:
                                    check_flag = profiles_check.check_user(
                                        profile, check)
                                    if check_flag:
                                        profile[
                                            'following_prelevel_node'] = user[
                                                'id_str']
                                        profile['level'] = start_level + 1
                                        probablity = float(
                                            user['friends_count'] + 1) / (
                                                profile['followers_count'] + 1)
                                        # probablity = 1.0/profile['followers_count']
                                        randomv = random.uniform(0, 1)
                                        if randomv <= probablity:
                                            try:
                                                poi_db.insert(profile)
                                            except pymongo.errors.DuplicateKeyError:
                                                pass
                                            try:
                                                net_db.insert({
                                                    'user':
                                                    int(profile['id_str']),
                                                    'follower':
                                                    int(user['id_str']),
                                                    'scraped_at':
                                                    datetime.datetime.now()
                                                })
                                            except pymongo.errors.DuplicateKeyError:
                                                pass
                        # prepare for next iterator
                        next_cursor = followees['next_cursor']
                    else:
                        break
                poi_db.update_one({'id': int(user['id_str'])},
                                  {'$set': {
                                      "following_scrape_flag": True
                                  }},
                                  upsert=False)
            return True