Example #1
0
def _bfs_ensure_directory(path):
    """Create the directory `path`, including missing parent directories, if it does not exist yet."""
    if not os.path.isdir(path):
        print("\tThe directory {} does not exist...creating it now".format(path))
        os.makedirs(path)


def _bfs_apply_defaults(limits, dict_name, defaults):
    """
    Fill every key of `defaults` that is missing from `limits`, reporting each value that is filled in.

    @param limits    - Dictionary to complete (modified in place)
    @param dict_name - Name of the dictionary, only used in the printed message
    @param defaults  - Sequence of (key, default value, description) tuples
    """
    for key, value, description in defaults:
        if key not in limits:
            limits[key] = value
            print("\tNo value was specified for {}['{}'], {}, so it will be set to {}.".format(dict_name, key, description, value))


def _bfs_data_limit_reached(save_dir, max_data):
    """Return True, after printing a notice, when measure_data reports more than `max_data`; None disables the check."""
    if max_data is None:
        return False
    data_vol = measure_data(user_dir=save_dir['twitter_profiles'], timeline_dir=save_dir['twitter_timelines'])
    if data_vol > max_data:
        print("The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(data_vol, max_data))
        return True
    return False


def _bfs_load_json(filename):
    """Return the parsed content of the JSON file `filename`, or None when it is missing, empty or unparsable."""
    try:
        if os.path.getsize(filename) == 0:
            return None
        with open(filename) as jfid:
            return ujson.load(jfid)
    except (IOError, OSError, ValueError):
        return None


def _bfs_save_profile(profile, hop, filename):
    """Stamp `profile` with its hop ('khop') and collection time ('DOC'), save it to `filename` and return it."""
    profile['khop'] = hop
    profile['DOC'] = datetime.datetime.utcnow()
    fast_save(filename=filename, obj=profile)
    return profile


def _bfs_collect_timeline(user_id, timeline_start_date, proxies, auth, filename):
    """Collect the timeline of `user_id`, stamp every entry with 'DOC', save it to `filename` and return it."""
    print("\tCollect the timeline for user {}.".format(user_id))
    tldata = pyTweet.collect_user_timeline(USER=user_id, USER_type='user_id', start_date=timeline_start_date, proxies=proxies, auth=auth)
    for tweet in tldata:
        tweet['DOC'] = datetime.datetime.utcnow()
    fast_save(filename=filename, obj=tldata)
    return tldata


def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir=None, hop_out_limits=None, collection_limits=None):
    """
    This function creates a network based on Twitter friends, followers, replies and mentions.

    Starting from user_seed, profiles and timelines are collected hop by hop. Progress is loaded with
    load_place_savers and stored with save_place_savers, so the loop starts at the saved current hop.
    Dictionaries passed by the caller are completed in place with the default values described below.

    @param user_seed           - List of user names, used as the user list when the saved current hop is below 1
    @param timeline_start_date - Beginning of date (datetime.date object) of timelines in collection
    @param host                - Proxy host, used to build the proxy URLs 'http://<host>:<port>'
    @param port                - Proxy port, used to build the proxy URLs 'http://<host>:<port>'
    @param save_dir     - Set locations for the profile and timeline directory to save .JSONs. Missing or blank entries
                          default to the sub-directories 'profiles' and 'timelines' of your current working directory.
                          Directories that do not exist are created.
                          EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons',
                                          'twitter_timelines': '/dir/to/save/timeline/jsons'}
    @param hop_out_limits - Specify your graph constraints with the variable hop_out_limits. First determine the maximum
                          number of hops to make the graph with 'max_hops' (default 6), then decide the maximum amount
                          of data to collect in 'max_data' (default 2). This will be the combined profile and timeline
                          .JSON files. Set it to 'None' if you don't want to limit the amount of data collected. Next,
                          set limits (per individual) on how many friends, followers, replied to users, and mentioned
                          users to include in the next hop (default 0 each). You can specify values [0, Inf) or None.
                          Specifying 'None' implies that you do not wish to limit the collection, and will expand the
                          graph on as many of these edges as possible. Occasionally, you may get back fewer edges for a
                          user than the limit you set. Note that friends and followers will be saved in the fields
                          'friends_list' and 'followers_list' automatically. The reply and mention users are saved in
                          timelines.
                          EX. hop_out_limits = {'max_hops': 2,              # Maximum number of hops in graph
                                                'max_data': None,           # Maximum amount of data (in GB)
                                                'friends': 0,               # Maximum friends per user to include in next hop
                                                'followers': None,          # Maximum followers per user to include in next hop
                                                'in_reply_to_user_id': 17,  # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop
                                                'user_mention_id': 21}      # Maximum 'user_mention_id' per user's timeline to include in next hop

    @param collection_limits - Suppose that you want to store friends or followers, but do not want to expand the graph
                    based on them. Specify limitations on collecting friends and followers below (default 0 each). Notice
                    that reply and mention users are saved in the timelines. The largest possible length of 'friends_list'
                    will be the greater of hops out limit and collection limit, or
                    MAX(hop_out_limits['friends'], collection_limits['friends']); it is unlimited if either one is None.
                    The same description follows for 'followers_list'.
                    EX. collection_limits = {'friends': 0,      # Maximum number of friends per user to save within the profile .JSON
                                         'followers': None}     # Maximum number of followers per user to save within the profile .JSON
    @return None. The function returns early as soon as the measured data volume exceeds hop_out_limits['max_data'].
    """
    # Use fresh dictionaries per call: a mutable default would be shared (and keep its filled-in values) across calls.
    if save_dir is None:
        save_dir = {}
    if hop_out_limits is None:
        hop_out_limits = {}
    if collection_limits is None:
        collection_limits = {}

    # NOTE: every print below takes a single string argument, which produces the same output with the
    # Python 2 print statement and the Python 3 print function.

    # CHECK PARAMETERS
    # Check save_dir dictionary fields, create directories if they do not already exist
    for key, default_subdir in (('twitter_profiles', 'profiles'), ('twitter_timelines', 'timelines')):
        if (key not in save_dir) or (save_dir[key].strip() == ''):
            save_dir[key] = os.path.join(os.getcwd(), default_subdir)
            print("\tNo directory was specified for save_dir['{}'] so it will be set to {}.".format(key, save_dir[key]))
        _bfs_ensure_directory(save_dir[key])
    # Check hop_out_limits dictionary
    _bfs_apply_defaults(hop_out_limits, 'hop_out_limits', (
        ('max_hops', 6, "the maximin number of hops in graph"),
        ('max_data', 2, "the maximin amount of data collected (in GB)"),
        ('friends', 0, "max friends per user to include in next hop"),
        ('followers', 0, "max followers per user to include in next hop"),
        ('in_reply_to_user_id', 0, "max 'in_reply_to_user_id' per user's timeline to include in next hop"),
        ('user_mention_id', 0, "max 'user_mention_id' per user's timeline to include in next hop"),
    ))
    # Check collection_limits dictionary
    _bfs_apply_defaults(collection_limits, 'collection_limits', (
        ('friends', 0, "max number of friends per user to save with the profile .JSON"),
        ('followers', 0, "max number of followers per user to save with the profile .JSON"),
    ))
    # Check data amount and quit if graph has reached limit. This runs after the defaults are applied so that
    # the default 'max_data' is enforced here as well.
    if _bfs_data_limit_reached(save_dir, hop_out_limits['max_data']):
        return

    # DETERMINE COLLECTION PARAMETERS
    # Load place_savers dictionary
    print("\nGetting information of current hop and finished users...")
    place_savers = load_place_savers(save_dir['twitter_profiles'])
    print("\tAs of now {} user profiles have been collected and saved to {}".format(len(place_savers['finished_users']), save_dir['twitter_profiles']))
    print("\tThe current hop is {}".format(place_savers['cur_hop']))
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    print("\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop']))
    print("\tSo far we plan to collect {} users in hop {}".format(len(place_savers['next_user_list']), place_savers['cur_hop'] + 1))
    # Determine limits for friends/followers collection: unlimited (None) if either limit is None,
    # otherwise the larger of the hop-out limit and the collection limit
    if None in [hop_out_limits['friends'], collection_limits['friends']]:
        max_friends = None
    else:
        max_friends = max(hop_out_limits['friends'], collection_limits['friends'])
    if None in [hop_out_limits['followers'], collection_limits['followers']]:
        max_followers = None
    else:
        max_followers = max(hop_out_limits['followers'], collection_limits['followers'])
    # Create proxies dictionary
    proxy_url = 'http://%s:%s' % (host, port)
    proxies = {'http': proxy_url, 'https': proxy_url}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # API AUTHORIZATION
    print("\nAPI Authorization")
    auth = pyTweet.get_authorization(twitter_keys)

    # BUILD THE GRAPH
    print("\nStart building the graph!")
    for hop in range(place_savers['cur_hop'], hop_out_limits['max_hops']):
        print("\nGet information for the {}th-hop users. There are {} total users in this hop.".format(hop, len(place_savers['cur_user_list'])))
        print("Create the user list of the " + str(hop + 1) + "th-hop users.")
        # Remove finished_users from cur_user_list (finished_users keys are user ids stored as strings)
        if place_savers['cur_hop'] > 0:
            place_savers['cur_user_list'].difference_update(set(map(int, place_savers['finished_users'].keys())))
        # Split the users into batches of 100 for the lookup calls
        cur_users = list(place_savers['cur_user_list'])
        batches = [cur_users[z:z + 100] for z in range(0, len(cur_users), 100)]
        # Free space; 'cur_user_list' is set again at the end of the hop.
        # NOTE(review): place_savers is saved below without this key - confirm save_place_savers tolerates that.
        del place_savers['cur_user_list']
        del cur_users
        for batch in batches:
            # Look up information of users, 100 at a time
            print("\tLook up user information")
            if hop < 1:
                # The initial list contain user names or @handles
                user_info = pyTweet.user_lookup_usernames(user_list=batch, proxies=proxies, auth=auth)
            else:
                # All other lists will contain user ids
                user_info = pyTweet.user_lookup_userids(user_list=batch, proxies=proxies, auth=auth)
            if isinstance(user_info, dict) and ('errors' in user_info):
                # An error dictionary holds no profiles; skip the batch instead of failing on user_info[0]
                print("\tTwitter returned an error for this batch of users, so it will be skipped: {}".format(user_info))
                continue
            # Get friends, followers, and timelines of each user in user_info
            for profile in user_info:
                user_id = str(profile['id'])
                # Check to see that the user's profile hasn't already been collected
                if user_id in place_savers['finished_users']:
                    # Load previously saved user data
                    pro_filename = os.path.join(save_dir['twitter_profiles'], 'userInfo_' + str(place_savers['finished_users'][user_id]) + '.json')
                    user_data = _bfs_load_json(pro_filename)
                    if user_data is None:
                        # The file is missing, empty or unreadable: resave the freshly looked-up profile
                        user_data = _bfs_save_profile(profile, hop, pro_filename)
                    else:
                        user_data['DOC'] = datetime.datetime.utcnow()
                else:
                    # The user's profile has not been collected...start now
                    place_savers['finished_users'][user_id] = str(uuid.uuid4())
                    pro_filename = os.path.join(save_dir['twitter_profiles'], 'userInfo_{}.json'.format(str(place_savers['finished_users'][user_id])))
                    # Add user information: hop, DOC
                    user_data = _bfs_save_profile(profile, hop, pro_filename)
                print("\tSaved user {} information in {}.".format(user_id, pro_filename))
                # Collect user friends
                if 'friends_list' not in user_data:
                    friends_list = []
                    if (user_data['friends_count'] > 0) and ((max_friends is None) or (max_friends > 0)):
                        print("\tCollect friends for user {}.".format(user_id))
                        friends_list = pyTweet.get_user_friends(user_id=user_id, limit=max_friends, proxies=proxies, auth=auth)
                    user_data['friends_list'] = friends_list
                    fast_save(filename=pro_filename, obj=user_data)
                # Add friends to next_user_list; a limit of None slices the whole list, 0 adds nobody
                place_savers['next_user_list'].update(user_data['friends_list'][0:hop_out_limits['friends']])
                save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
                # Collect user followers
                if 'followers_list' not in user_data:
                    followers_list = []
                    if (user_data['followers_count'] > 0) and ((max_followers is None) or (max_followers > 0)):
                        print("\tCollect followers for user {}.".format(user_id))
                        followers_list = pyTweet.get_user_followers(user_id=user_id, limit=max_followers, proxies=proxies, auth=auth)
                    user_data['followers_list'] = followers_list
                    fast_save(filename=pro_filename, obj=user_data)
                # Add followers to next_user_list
                place_savers['next_user_list'].update(user_data['followers_list'][0:hop_out_limits['followers']])
                save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
                # Collect timeline for user beginning from start_date
                tl_filename = os.path.join(save_dir['twitter_timelines'], 'timeline_{}.json'.format(place_savers['finished_users'][user_id]))
                if os.path.isfile(tl_filename):
                    print("\tThe timeline for user {} has already been collected.".format(user_id))
                    # Load timeline file
                    if os.path.getsize(tl_filename) == 0:
                        continue      # Skip empty time lines
                    try:
                        with open(tl_filename) as jfid:
                            tldata = ujson.load(jfid)
                    except (IOError, ValueError):
                        # Fail at opening file, recollect time line
                        tldata = _bfs_collect_timeline(user_id, timeline_start_date, proxies, auth, tl_filename)
                else:
                    tldata = _bfs_collect_timeline(user_id, timeline_start_date, proxies, auth, tl_filename)

                # Pull out user mentions, if applicable (None is tested first because it cannot be ordered against 0)
                mention_limit = hop_out_limits['user_mention_id']
                if (mention_limit is None) or (mention_limit > 0):
                    print("\tAdd user mentions to the next hop")
                    tl_mentions = pyTweet.pull_timeline_entitites(timeline=tldata, type='user_mention_id', limit=mention_limit)
                    place_savers['next_user_list'].update(tl_mentions)
                    save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
                # Pull out user replies, if applicable
                reply_limit = hop_out_limits['in_reply_to_user_id']
                if (reply_limit is None) or (reply_limit > 0):
                    print("\tAdd replies to the next hop")
                    tl_replies = pyTweet.pull_timeline_entitites(timeline=tldata, type='in_reply_to_user_id', limit=reply_limit)
                    place_savers['next_user_list'].update(tl_replies)
                    save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
                # Check data amount and quit if graph has reached limit
                if _bfs_data_limit_reached(save_dir, hop_out_limits['max_data']):
                    return
        # Remove finished_users from place_savers['next_user_list']
        place_savers['next_user_list'].difference_update(set(map(int, place_savers['finished_users'].keys())))
        # Prepare for next iteration of hop
        place_savers['cur_user_list'] = place_savers['next_user_list']
        place_savers['next_user_list'] = set([])
        place_savers['cur_hop'] += 1
        save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
        print("There are  {}  users in the next iteration of users.".format(len(place_savers['cur_user_list'])))
    print("\nDone building graph!")
Example #2
0
def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir={}, hop_limits={}):
    """
    This function creates a network based on Twitter friends

    :param user_seed: List of user names
    :param host: Your host IP
    :param port: Your port
    :param timeline_start_date: Beginning of date (datetime.date object) of timelines in collection
    :param save_dir: Set locations for the profile and timeline directory to save .JSONs. The default will be your current working directory.
                          EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons',
                                          'twitter_timelines': '/dir/to/save/timeline/jsons'}

    :param hop_limits: Specify your graph constrains with the variable hop_limits. First determine the maximum
                          number of hops to make the graph with 'max_hops', then decide the maximum amount of data to
                          collect in 'max_data'. This will be the combined profile and timeline .JSON files. Set it to
                          'None' if you don't want to limit the amount of data collected. Next, set limits (per
                          individual) on how many friends, followers, replied to users, and mentioned users to include
                          in the next hop. You can specify values [0, Inf) or None. Specifying 'None' implies that you
                          do not wish to limit the collection, and will expand the graph on as many as these edges as
                          possible. Occasionlly, you may get back fewer edges for a user than the limit you set. Note
                          that friends and followers will be saved in the fields 'friends_list' and 'followers_list'
                          automatically. The reply and mention users are saved in timelines.
                          EX.hop_limits = {'max_hops': 2,              # Maximin number of hops in graph
                                                'max_data': None,           # Maximum amount of data (in GB)
                                                'friends': 0,               # Maximum friends per user to include in next hop
                                                'followers': None,          # Maximum followers per user to include in next hop
                                                'in_reply_to_user_id': 17,  # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop
                                                'user_mention_id': 21}      # Maximum 'user_mention_id' per user's timeline to include in next hop
    """
    # CHECK PARAMETERS
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ("twitter_profiles" not in save_dir.keys()) or (save_dir["twitter_profiles"].strip() == ""):
        save_dir["twitter_profiles"] = os.path.join(os.getcwd(), "profiles")
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(
            save_dir["twitter_profiles"]
        )
    if not os.path.isdir(save_dir["twitter_profiles"]):
        print "\tThe directory {} does not exist...creating it now".format(save_dir["twitter_profiles"])
        os.mkdir(save_dir["twitter_profiles"])
    if ("twitter_timelines" not in save_dir.keys()) or (save_dir["twitter_timelines"].strip() == ""):
        save_dir["twitter_timelines"] = os.path.join(os.getcwd(), "timelines")
        print "\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format(
            save_dir["twitter_timelines"]
        )
    if not os.path.isdir(save_dir["twitter_timelines"]):
        print "\tThe directory {} does not exist...creating it now".format(save_dir["twitter_timelines"])
        os.mkdir(save_dir["twitter_timelines"])
    # Checkhop_limits dictionary
    hop_limits_defaults = {
        "max_hops": [6, "the maximin number of hops in graph"],
        "max_data": [2, "the maximin amount of data collected (in GB)"],
        "friends": [0, "max friends per user to include in next hop"],
        "followers": [0, "max followers per user to include in next hop"],
        "in_reply_to_user_id": [0, "max 'in_reply_to_user_id' per user's timeline to include in next hop"],
        "user_mention_id": [0, "max 'user_mention_id' per user's timeline to include in next ho"],
    }
    for kk in hop_limits_defaults.keys():
        if kk not in hop_limits:
            hop_limits[kk] = hop_limits_defaults[kk][0]
            print "\tNo Value was specified for hop_limits['{}'], {}, so it will be set to {}.".format(
                kk, hop_limits_defaults[kk][1], hop_limits_defaults[kk][0]
            )
    # Check data amount and quit if graph has reached limit
    if ("max_data" in hop_limits) and (hop_limits["max_data"] is not None):
        data_vol = measure_data(user_dir=save_dir["twitter_profiles"], timeline_dir=save_dir["twitter_timelines"])
        if data_vol > hop_limits["max_data"]:
            print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
                data_vol, hop_limits["max_data"]
            )
            return

    # DETERMINE COLLECTION PARAMETERS
    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = load_place_savers(save_dir["twitter_profiles"])
    print "\tAs of now {} user profiles have been collected and saved to {}".format(
        len(place_savers["finished_users"]), save_dir["twitter_profiles"]
    )
    print "\tThe current hop is {}".format(place_savers["cur_hop"])
    if place_savers["cur_hop"] < 1:
        place_savers["cur_user_list"] = set(user_seed)
    save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
    print "\tWe will collect {} users in hop {}".format(len(place_savers["cur_user_list"]), place_savers["cur_hop"])
    # Load growth parametes
    growth_params = load_growth_params(save_dir["twitter_profiles"])
    # Create proxies dictionary
    proxies = {"http": "http://%s:%s" % (host, port), "https": "http://%s:%s" % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # API authorization
    auth = pyTweet.get_authorization(twitter_keys)

    # BUILD THE GRAPH
    print "\nStart building the graph!"
    for khop in range(place_savers["cur_hop"], hop_limits["max_hops"]):
        print "\nGet information for the {}th-hop users. There are {} total users in this hop.".format(
            khop, len(place_savers["cur_user_list"])
        )
        print "Create the user list of the {}th-hop users as well.".format(khop + 1)

        # Get profile information of users in cur_user_list
        print "\nCOLLECT PROFILE INFORMATION FOR THE CURRENT SET OF USERS"
        if khop < 1:
            # Find profiles to collect
            profiles_to_collect = set(place_savers["cur_user_list"])
            for json_filename in place_savers["finished_users"].values():
                data = ujson.load(open(os.path.join(save_dir["twitter_profiles"], json_filename), "r"))
                if data["screen_name"] in place_savers["cur_user_list"]:
                    profiles_to_collect.discard(data["screen_name"])
            # Collect and save profiles
            user_info = []
            if len(profiles_to_collect) > 0:
                print "\nstart collecting profiles: {} profiles".format(len(profiles_to_collect))
                user_info = pyTweet.user_lookup_usernames(
                    user_list=list(profiles_to_collect), proxies=proxies, auth=auth
                )
                if isinstance(user_info, dict) and ("errors" in user_info.keys()):
                    print "\nThe initial seed cannot be collected..."
                    print "Twitter error message: ", user_info
                # Save profile information
                # print "user_info: ", user_info
                # print type(user_info)
                for udata in user_info:
                    # print "udata: ", udata
                    # print type(udata)
                    json_filename = _save_profile_json(
                        profile_struct=udata, save_dir=save_dir["twitter_profiles"], khop=khop
                    )
                    place_savers["finished_users"][udata["id"]] = json_filename
            # Convert screen names to user IDs in cur_user_list, identify unavailable accounts as well
            all_screennames = {}  # Keys are screen names and values are file name
            jsons = filter(lambda k: re.match("userInfo_*", k), os.listdir(save_dir["twitter_profiles"]))
            for jj in jsons:
                try:
                    full_filename = os.path.join(save_dir["twitter_profiles"], jj)
                    if os.path.getsize(full_filename) != 0:
                        jfid = open(full_filename)
                        profile = ujson.load(jfid)
                        jfid.close()
                        all_screennames[profile["screen_name"]] = jj
                except ValueError:
                    continue
            # Get corresponding user IDs for each screen name in cur_user_list
            cur_user_list_ids = set([])
            for scn_name in profiles_to_collect.union(place_savers["cur_user_list"]):
                if scn_name in all_screennames.keys():
                    jfid = open(os.path.join(save_dir["twitter_profiles"], all_screennames[scn_name]))
                    profile = ujson.load(jfid)
                    jfid.close()
                    if "id" in profile:
                        cur_user_list_ids.add(int(profile["id"]))
                    else:
                        place_savers["unavailable_accounts"].add(scn_name)
                else:
                    place_savers["unavailable_accounts"].add(scn_name)
            print cur_user_list_ids
            del profiles_to_collect
            place_savers["cur_user_list"] = set(cur_user_list_ids)
        else:
            # Collect and save profiles
            profiles_to_collect = set(place_savers["cur_user_list"]).difference(
                set(map(int, place_savers["finished_users"].keys()))
            )
            user_info = pyTweet.user_lookup_userids(user_list=list(profiles_to_collect), proxies=proxies, auth=auth)
            for udata in user_info:
                json_filename = _save_profile_json(
                    profile_struct=udata, save_dir=save_dir["twitter_profiles"], khop=khop
                )
                place_savers["finished_users"][udata["id"]] = json_filename
            # Update current user list, and identify unavailable accounts
            new_cur_user_list = set([])
            for uid in profiles_to_collect.union(set(place_savers["cur_user_list"])):
                if uid in place_savers["unavailable_accounts"]:
                    continue
                if uid in place_savers["finished_users"].keys():
                    new_cur_user_list.add(uid)
                else:
                    place_savers["unavailable_accounts"].add(uid)
            place_savers["cur_user_list"] = set(new_cur_user_list)
            del new_cur_user_list

        # Save place saving variables
        growth_params["h{}_users.json".format(khop)] = set(place_savers["cur_user_list"])
        save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)
        save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)

        # Get timelines for each user in user_info
        print "\nCOLLECT TIME LINES FOR CURRENT SET OF USERS"
        for uid in place_savers["cur_user_list"]:
            if uid in place_savers["finished_users"].keys():
                profile_filename = place_savers["finished_users"][uid]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(save_dir["twitter_timelines"], "timeline_{}.json".format(uuid_profile))
                try:
                    tldata = ujson.load(open(profile_filename, "r"))
                    tldata["id"]
                except (IOError, KeyError):
                    # Collect user data
                    user_info = pyTweet.user_lookup_userids(user_list=[uid], proxies=proxies, auth=auth)
                    if (user_info is not dict) or ("id" not in user_info.keys()):
                        continue
                    json_filename = _save_profile_json(
                        profile_struct=user_info[0], save_dir=save_dir["twitter_profiles"], khop=khop
                    )
                    place_savers["finished_users"][uid] = json_filename
                if ("has_timeline" in tldata.keys()) and (tldata["has_timeline"] is True):
                    continue
                if not os.path.isfile(timeline_filename):
                    print "Collect the timeline for user {}.".format(uid)
                    tldata = pyTweet.collect_user_timeline(
                        USER=uid, USER_type="user_id", start_date=timeline_start_date, proxies=proxies, auth=auth
                    )
                    for tl in range(len(tldata)):
                        tldata[tl]["DOC"] = datetime.datetime.utcnow().strftime("%m-%d-%Y %H:%M:%S %z")
                        tldata[tl]["has_timeline"] = True
                    fast_save(filename=profile_filename, obj=tldata)

        print "\nGet friends of each user in cur_user_list"
        if hop_limits["friends"] != 0:
            growth_params["h{}_friends.json".format(khop)] = set([])
            print "\nCOLLECT FRIENDS OF CURRENT USER SET"
            # print "place_savers['cur_user_list']: ", place_savers['cur_user_list']
            for jj in place_savers["cur_user_list"]:
                profile_filename = os.path.join(save_dir["twitter_profiles"], place_savers["finished_users"][jj])
                try:
                    data = ujson.load(open(profile_filename, "r"))
                    # print data['id']
                except (IOError, KeyError, TypeError):
                    user_info = pyTweet.user_lookup_userids(user_list=[uid], proxies=proxies, auth=auth)
                    if (user_info is not dict) or ("id" not in user_info.keys()):
                        continue
                    _save_profile_json(profile_struct=user_info[0], save_dir=save_dir["twitter_profiles"], khop=khop)
                    json_filename = _save_profile_json(
                        profile_struct=user_info[0], save_dir=save_dir["twitter_profiles"], khop=khop
                    )
                    place_savers["finished_users"][uid] = json_filename
                if data["friends_count"] < 1:
                    data["friends_list"] = []
                    fast_save(filename=profile_filename, obj=data)
                    continue
                if "friends_list" not in data.keys():
                    print "Collect friends for user {}".format(jj)
                    friends_list = pyTweet.get_user_friends(
                        user_id=jj, limit=hop_limits["friends"], proxies=proxies, auth=auth
                    )
                    data["friends_list"] = friends_list
                    fast_save(filename=profile_filename, obj=data)
                if hop_limits["friends"] < len(data["friends_list"]):
                    place_savers["next_user_list"].update(set(data["friends_list"][0 : len(hop_limits["friends"]) - 1]))
                else:
                    place_savers["next_user_list"].update(set(data["friends_list"]))
                growth_params["h{}_friends.json".format(khop)].update(set(data["friends_list"]))
                save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)

        print "\nGet followers of each user in the cur_user_list"
        if hop_limits["followers"] != 0:
            growth_params["h{}_followers.json".format(khop)] = set([])
            print "\nCOLLECT FOLLOWERS OF CURRENT USER SET"
            for jj in place_savers["cur_user_list"]:
                # profile_filename = place_savers['finished_users'][jj]
                profile_filename = os.path.join(save_dir["twitter_profiles"], place_savers["finished_users"][jj])
                try:
                    data = ujson.load(open(profile_filename, "r"))
                    data["id"]
                except (IOError, KeyError):
                    user_info = pyTweet.user_lookup_userids(user_list=[uid], proxies=proxies, auth=auth)
                    if (user_info is not dict) or ("id" not in user_info.keys()):
                        continue
                    _save_profile_json(profile_struct=user_info[0], save_dir=save_dir["twitter_profiles"], khop=khop)
                if data["followers_count"] < 1:
                    data["followers_list"] = []
                    fast_save(filename=profile_filename, obj=data)
                    continue
                if "followers_list" not in data.keys():
                    print "Collect followers for user {}".format(jj)
                    friends_list = pyTweet.get_user_friends(
                        user_id=jj, limit=hop_limits["followers"], proxies=proxies, auth=auth
                    )
                    data["followers_list"] = friends_list
                    fast_save(filename=profile_filename, obj=data)
                if hop_limits["followers"] < len(data["followers_list"]):
                    place_savers["next_user_list"].update(
                        set(data["followers_list"][0 : len(hop_limits["followers"]) - 1])
                    )
                else:
                    place_savers["next_user_list"].update(set(data["followers_list"]))
                growth_params["h{}_followers.json".format(khop)].update(set(data["followers_list"]))
                save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)

        # Pull out user mentions
        if ("user_mention_id" in hop_limits) and (hop_limits["user_mention_id"] != 0):
            print "\nCOLLECT USER MENTIONS OF CURRENT SET"
            growth_params["h{}_user_mentions.json".format(khop)] = set([])
            for jj in place_savers["cur_user_list"]:
                profile_filename = place_savers["finished_users"][jj]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(save_dir["twitter_timelines"], "timeline_{}.json".format(uuid_profile))
                # Load or create the timeline JSON file
                if os.path.isfile(timeline_filename):
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    # Load the timeline data
                    try:
                        tldata = ujson.load(open(timeline_filename, "r"))
                        if len(tldata) < 1:
                            continue
                        tldata[0]["text"]
                    except (IOError, KeyError):
                        # Fix timeline file
                        _save_timeline_json(
                            user_id=jj,
                            filename=timeline_filename,
                            start_date=timeline_start_date,
                            proxies=proxies,
                            auth=auth,
                        )
                        tldata = ujson.load(open(timeline_filename, "r"))
                else:
                    # Get the timeline data
                    _save_timeline_json(
                        user_id=jj,
                        filename=timeline_filename,
                        start_date=timeline_start_date,
                        proxies=proxies,
                        auth=auth,
                    )
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    tldata = ujson.load(open(timeline_filename, "r"))
                    if len(tldata) < 1:
                        continue
                # Pull out user mentions
                tl_mentions = pyTweet.pull_timeline_entitites(
                    timeline=tldata, type="user_mention_id", limit=hop_limits["user_mention_id"]
                )
                growth_params["h{}_user_mentions.json".format(khop)].update(tl_mentions)
                place_savers["next_user_list"].update(tl_mentions)
                save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)

        # Pull out replies
        if ("in_reply_to_user_id" in hop_limits) and (hop_limits["in_reply_to_user_id"] != 0):
            print "\nCOLLECT USERS CURRENT SET REPLIES TO"
            growth_params["h{}_replies.json".format(khop)] = set([])
            for jj in place_savers["cur_user_list"]:
                profile_filename = place_savers["finished_users"][jj]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(save_dir["twitter_timelines"], "timeline_{}.json".format(uuid_profile))
                # Load or create the timeline JSON file
                if os.path.isfile(timeline_filename):
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    # Load the timeline data
                    try:
                        tldata = ujson.load(open(timeline_filename, "r"))
                        tldata[0]["text"]
                    except (IOError, KeyError):
                        # Fix timeline file
                        _save_timeline_json(
                            user_id=jj,
                            filename=timeline_filename,
                            start_date=timeline_start_date,
                            proxies=proxies,
                            auth=auth,
                        )
                        tldata = ujson.load(open(timeline_filename, "r"))
                    if len(tldata) < 1:
                        continue
                else:
                    # Get the timeline data
                    _save_timeline_json(
                        user_id=jj,
                        filename=timeline_filename,
                        start_date=timeline_start_date,
                        proxies=proxies,
                        auth=auth,
                    )
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    tldata = ujson.load(open(timeline_filename, "r"))
                    if len(tldata) < 1:
                        continue
                # Pull out replies
                tl_replies = pyTweet.pull_timeline_entitites(
                    timeline=tldata, type="in_reply_to_user_id", limit=hop_limits["in_reply_to_user_id"]
                )
                place_savers["next_user_list"].update(tl_replies)
                growth_params["h{}_replies.json".format(khop)].update(tl_replies)
                save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)

        # Check data limit
        if ("max_data" in hop_limits) and (hop_limits["max_data"] is not None):
            data_vol = measure_data(user_dir=save_dir["twitter_profiles"], timeline_dir=save_dir["twitter_timelines"])
            if data_vol > hop_limits["max_data"]:
                print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
                    data_vol, hop_limits["max_data"]
                )
                return
        # Prepare for next iteration
        place_savers["cur_hop"] = khop + 1
        place_savers["cur_user_list"] = set(place_savers["next_user_list"])
        place_savers["next_user_list"] = set([])
        save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
Example #3
0
def _get_profiles_wrapper(cur, conn, user_list, proxies, auth, list_type, hop):
    """
    Look up the Twitter profiles in user_list that are not stored yet and add them to the database.

    Identifiers already present in the 'users' table or in the 'lost_profiles' table are skipped. The remaining
    ones are looked up in batches of at most 100. An identifier for which no profile comes back is recorded in
    'lost_profiles'. Every returned profile is stamped with 'khop' and 'DOC' (and with 'expand_user' when
    hop < 1) and handed to json_to_database.add_user. A batch whose lookup response is not a list of profiles
    is skipped; the remaining batches are still processed.

    :param cur: Cursor to database
    :param conn: Connection to database
    :param user_list: Twitter user IDs or screen names, as selected by list_type
    :param proxies: Passed through unchanged to the pyTweet lookup call
    :param auth: Passed through unchanged to the pyTweet lookup call
    :param list_type: 'user_id' if user_list holds user IDs, 'screen_name' if it holds screen names
    :param hop: Hop number stored on every collected profile under 'khop'
    :raises AssertionError: If list_type is neither 'user_id' nor 'screen_name'
    """
    # Raised explicitly instead of using an assert statement so the check survives "python -O":
    # list_type is formatted into the SQL below as a column name.
    if list_type not in ('user_id', 'screen_name'):
        raise AssertionError("The parameter must list_type must be set to either 'user_id' or 'screen_name'")
    # Identifiers that need no lookup: collected profiles plus deleted or protected ones
    db_user_ids = set([])
    for table in ('users', 'lost_profiles'):
        cur.execute("SELECT DISTINCT {} FROM {};".format(list_type, table))
        # fetchall() yields one-element rows, e.g. [('sn',), ..., ('sn2',)]
        db_user_ids.update(row[0] for row in cur.fetchall())
    get_profiles = list(set(user_list).difference(db_user_ids))
    if not get_profiles:
        return
    # The lookup call and the "lost profile" statement depend only on list_type, so choose them once
    if list_type == 'screen_name':
        lookup = pyTweet.user_lookup_usernames
        lost_cmd = "INSERT INTO lost_profiles (screen_name) VALUES ('{}');"
    else:
        lookup = pyTweet.user_lookup_userids
        lost_cmd = "INSERT INTO lost_profiles (user_id) VALUES ({});"
    # NOTE(review): identifiers are formatted straight into the SQL text, so a screen name containing a quote
    # would break the statement. make_sql_edit is only given the finished string here; parameter binding would
    # have to be added on that side.
    # Partition IDs
    USERS = [get_profiles[z:z + 100] for z in range(0, len(get_profiles), 100)]
    for batch in USERS:
        # Look up information of users, 100 at a time
        print("\tLook up profile information for up to 100 users at a time")
        user_info = lookup(user_list=list(batch), proxies=proxies, auth=auth)
        if not isinstance(user_info, list):
            errors = user_info.get('errors') if isinstance(user_info, dict) else None
            if errors and errors[0].get('code') == 17:
                # Error code 17: none of the requested users matched, so the whole batch is lost
                for u in batch:
                    json_to_database.make_sql_edit(cur, conn, lost_cmd.format(u))
            else:
                # Any other non-list response says nothing about individual users: do not mark them lost
                print("\tUnexpected response to the profile lookup, skipping this batch: {}".format(user_info))
            # Move on to the next batch instead of abandoning the remaining ones
            continue
        if len(user_info) < len(batch):
            # Fewer profiles than requested: find out which identifiers are missing from the response
            returned_values = [list(ui.values()) for ui in user_info]
            for u in batch:
                if any((u in vals) or (str(u) in vals) for vals in returned_values):
                    continue
                # Add profile to table lost_profiles
                json_to_database.make_sql_edit(cur, conn, lost_cmd.format(u))
        # Add user info to database
        for k in user_info:
            k['khop'] = hop
            k['DOC'] = datetime.datetime.utcnow()
            if hop < 1:
                k['expand_user'] = True
            json_to_database.add_user(userdata=k, cur=cur, conn=conn)