Example #1
0
def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir={}, hop_limits={}):
    """
    This function creates a network based on Twitter friends

    :param user_seed: List of user names
    :param host: Your host IP
    :param port: Your port
    :param timeline_start_date: Start date (a datetime.date object) for the timelines in the collection
    :param save_dir: Set locations for the profile and timeline directory to save .JSONs. The default will be your current working directory.
                          EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons',
                                          'twitter_timelines': '/dir/to/save/timeline/jsons'}

    :param hop_limits: Specify your graph constraints with the variable hop_limits. First determine the maximum
                          number of hops to make the graph with 'max_hops', then decide the maximum amount of data to
                          collect in 'max_data'. This is the combined size of the profile and timeline .JSON files. Set it to
                          'None' if you don't want to limit the amount of data collected. Next, set limits (per
                          individual) on how many friends, followers, replied-to users, and mentioned users to include
                          in the next hop. You can specify values in [0, Inf) or None. Specifying 'None' means that you
                          do not wish to limit the collection, and the graph will be expanded on as many of these edges as
                          possible. Occasionally, you may get back fewer edges for a user than the limit you set. Note
                          that friends and followers will be saved in the fields 'friends_list' and 'followers_list'
                          automatically. The reply and mention users are saved in timelines.
                          EX. hop_limits = {'max_hops': 2,              # Maximum number of hops in graph
                                            'max_data': None,           # Maximum amount of data (in GB)
                                            'friends': 0,               # Maximum friends per user to include in next hop
                                            'followers': None,          # Maximum followers per user to include in next hop
                                            'in_reply_to_user_id': 17,  # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop
                                            'user_mention_id': 21}      # Maximum 'user_mention_id' per user's timeline to include in next hop
    """
    # CHECK PARAMETERS
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ("twitter_profiles" not in save_dir.keys()) or (save_dir["twitter_profiles"].strip() == ""):
        save_dir["twitter_profiles"] = os.path.join(os.getcwd(), "profiles")
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(
            save_dir["twitter_profiles"]
        )
    if not os.path.isdir(save_dir["twitter_profiles"]):
        print "\tThe directory {} does not exist...creating it now".format(save_dir["twitter_profiles"])
        os.mkdir(save_dir["twitter_profiles"])
    if ("twitter_timelines" not in save_dir.keys()) or (save_dir["twitter_timelines"].strip() == ""):
        save_dir["twitter_timelines"] = os.path.join(os.getcwd(), "timelines")
        print "\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format(
            save_dir["twitter_timelines"]
        )
    if not os.path.isdir(save_dir["twitter_timelines"]):
        print "\tThe directory {} does not exist...creating it now".format(save_dir["twitter_timelines"])
        os.mkdir(save_dir["twitter_timelines"])
    # Check hop_limits dictionary
    hop_limits_defaults = {
        "max_hops": [6, "the maximum number of hops in the graph"],
        "max_data": [2, "the maximum amount of data collected (in GB)"],
        "friends": [0, "max friends per user to include in next hop"],
        "followers": [0, "max followers per user to include in next hop"],
        "in_reply_to_user_id": [0, "max 'in_reply_to_user_id' per user's timeline to include in next hop"],
        "user_mention_id": [0, "max 'user_mention_id' per user's timeline to include in next hop"],
    }
    for kk in hop_limits_defaults.keys():
        if kk not in hop_limits:
            hop_limits[kk] = hop_limits_defaults[kk][0]
            print "\tNo Value was specified for hop_limits['{}'], {}, so it will be set to {}.".format(
                kk, hop_limits_defaults[kk][1], hop_limits_defaults[kk][0]
            )
    # Check data amount and quit if graph has reached limit
    if ("max_data" in hop_limits) and (hop_limits["max_data"] is not None):
        data_vol = measure_data(user_dir=save_dir["twitter_profiles"], timeline_dir=save_dir["twitter_timelines"])
        if data_vol > hop_limits["max_data"]:
            print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
                data_vol, hop_limits["max_data"]
            )
            return

    # DETERMINE COLLECTION PARAMETERS
    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = load_place_savers(save_dir["twitter_profiles"])
    print "\tAs of now {} user profiles have been collected and saved to {}".format(
        len(place_savers["finished_users"]), save_dir["twitter_profiles"]
    )
    print "\tThe current hop is {}".format(place_savers["cur_hop"])
    if place_savers["cur_hop"] < 1:
        place_savers["cur_user_list"] = set(user_seed)
    save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
    print "\tWe will collect {} users in hop {}".format(len(place_savers["cur_user_list"]), place_savers["cur_hop"])
    # Load growth parameters
    growth_params = load_growth_params(save_dir["twitter_profiles"])
    # Create proxies dictionary
    proxies = {"http": "http://%s:%s" % (host, port), "https": "http://%s:%s" % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # API authorization
    auth = pyTweet.get_authorization(twitter_keys)

    # BUILD THE GRAPH
    print "\nStart building the graph!"
    for khop in range(place_savers["cur_hop"], hop_limits["max_hops"]):
        print "\nGet information for the {}th-hop users. There are {} total users in this hop.".format(
            khop, len(place_savers["cur_user_list"])
        )
        print "Create the user list of the {}th-hop users as well.".format(khop + 1)

        # Get profile information of users in cur_user_list
        print "\nCOLLECT PROFILE INFORMATION FOR THE CURRENT SET OF USERS"
        if khop < 1:
            # Find profiles to collect
            profiles_to_collect = set(place_savers["cur_user_list"])
            for json_filename in place_savers["finished_users"].values():
                data = ujson.load(open(os.path.join(save_dir["twitter_profiles"], json_filename), "r"))
                if data["screen_name"] in place_savers["cur_user_list"]:
                    profiles_to_collect.discard(data["screen_name"])
            # Collect and save profiles
            user_info = []
            if len(profiles_to_collect) > 0:
                print "\nstart collecting profiles: {} profiles".format(len(profiles_to_collect))
                user_info = pyTweet.user_lookup_usernames(
                    user_list=list(profiles_to_collect), proxies=proxies, auth=auth
                )
                if isinstance(user_info, dict) and ("errors" in user_info.keys()):
                    print "\nThe initial seed cannot be collected..."
                    print "Twitter error message: ", user_info
                # Save profile information
                # print "user_info: ", user_info
                # print type(user_info)
                for udata in user_info:
                    # print "udata: ", udata
                    # print type(udata)
                    json_filename = _save_profile_json(
                        profile_struct=udata, save_dir=save_dir["twitter_profiles"], khop=khop
                    )
                    place_savers["finished_users"][udata["id"]] = json_filename
            # Convert screen names to user IDs in cur_user_list, identify unavailable accounts as well
            all_screennames = {}  # Keys are screen names and values are file name
            jsons = filter(lambda k: re.match("userInfo_*", k), os.listdir(save_dir["twitter_profiles"]))
            for jj in jsons:
                try:
                    full_filename = os.path.join(save_dir["twitter_profiles"], jj)
                    if os.path.getsize(full_filename) != 0:
                        jfid = open(full_filename)
                        profile = ujson.load(jfid)
                        jfid.close()
                        all_screennames[profile["screen_name"]] = jj
                except ValueError:
                    continue
            # Get corresponding user IDs for each screen name in cur_user_list
            cur_user_list_ids = set([])
            for scn_name in profiles_to_collect.union(place_savers["cur_user_list"]):
                if scn_name in all_screennames.keys():
                    jfid = open(os.path.join(save_dir["twitter_profiles"], all_screennames[scn_name]))
                    profile = ujson.load(jfid)
                    jfid.close()
                    if "id" in profile:
                        cur_user_list_ids.add(int(profile["id"]))
                    else:
                        place_savers["unavailable_accounts"].add(scn_name)
                else:
                    place_savers["unavailable_accounts"].add(scn_name)
            print cur_user_list_ids
            del profiles_to_collect
            place_savers["cur_user_list"] = set(cur_user_list_ids)
        else:
            # Collect and save profiles
            profiles_to_collect = set(place_savers["cur_user_list"]).difference(
                set(map(int, place_savers["finished_users"].keys()))
            )
            user_info = pyTweet.user_lookup_userids(user_list=list(profiles_to_collect), proxies=proxies, auth=auth)
            for udata in user_info:
                json_filename = _save_profile_json(
                    profile_struct=udata, save_dir=save_dir["twitter_profiles"], khop=khop
                )
                place_savers["finished_users"][udata["id"]] = json_filename
            # Update current user list, and identify unavailable accounts
            new_cur_user_list = set([])
            for uid in profiles_to_collect.union(set(place_savers["cur_user_list"])):
                if uid in place_savers["unavailable_accounts"]:
                    continue
                if uid in place_savers["finished_users"].keys():
                    new_cur_user_list.add(uid)
                else:
                    place_savers["unavailable_accounts"].add(uid)
            place_savers["cur_user_list"] = set(new_cur_user_list)
            del new_cur_user_list

        # Save place saving variables
        growth_params["h{}_users.json".format(khop)] = set(place_savers["cur_user_list"])
        save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)
        save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)

        # Get timelines for each user in user_info
        print "\nCOLLECT TIME LINES FOR CURRENT SET OF USERS"
        for uid in place_savers["cur_user_list"]:
            if uid in place_savers["finished_users"].keys():
                profile_filename = os.path.join(save_dir["twitter_profiles"], place_savers["finished_users"][uid])
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(save_dir["twitter_timelines"], "timeline_{}.json".format(uuid_profile))
                try:
                    tldata = ujson.load(open(profile_filename, "r"))
                    tldata["id"]
                except (IOError, KeyError):
                    # Re-collect the profile if the saved JSON is missing or malformed
                    user_info = pyTweet.user_lookup_userids(user_list=[uid], proxies=proxies, auth=auth)
                    if (not isinstance(user_info, list)) or (len(user_info) < 1) or ("id" not in user_info[0]):
                        continue
                    json_filename = _save_profile_json(
                        profile_struct=user_info[0], save_dir=save_dir["twitter_profiles"], khop=khop
                    )
                    place_savers["finished_users"][uid] = json_filename
                    tldata = user_info[0]
                if ("has_timeline" in tldata.keys()) and (tldata["has_timeline"] is True):
                    continue
                if not os.path.isfile(timeline_filename):
                    print "Collect the timeline for user {}.".format(uid)
                    tldata = pyTweet.collect_user_timeline(
                        USER=uid, USER_type="user_id", start_date=timeline_start_date, proxies=proxies, auth=auth
                    )
                    for tl in range(len(tldata)):
                        tldata[tl]["DOC"] = datetime.datetime.utcnow().strftime("%m-%d-%Y %H:%M:%S %z")
                        tldata[tl]["has_timeline"] = True
                    # Save the collected timeline to its own file rather than overwriting the profile JSON
                    fast_save(filename=timeline_filename, obj=tldata)

        print "\nGet friends of each user in cur_user_list"
        if hop_limits["friends"] != 0:
            growth_params["h{}_friends.json".format(khop)] = set([])
            print "\nCOLLECT FRIENDS OF CURRENT USER SET"
            # print "place_savers['cur_user_list']: ", place_savers['cur_user_list']
            for jj in place_savers["cur_user_list"]:
                profile_filename = os.path.join(save_dir["twitter_profiles"], place_savers["finished_users"][jj])
                try:
                    data = ujson.load(open(profile_filename, "r"))
                    data["id"]
                except (IOError, KeyError, TypeError):
                    # Re-collect the profile if the saved JSON is missing or malformed
                    user_info = pyTweet.user_lookup_userids(user_list=[jj], proxies=proxies, auth=auth)
                    if (not isinstance(user_info, list)) or (len(user_info) < 1) or ("id" not in user_info[0]):
                        continue
                    json_filename = _save_profile_json(
                        profile_struct=user_info[0], save_dir=save_dir["twitter_profiles"], khop=khop
                    )
                    place_savers["finished_users"][jj] = json_filename
                    data = user_info[0]
                if data["friends_count"] < 1:
                    data["friends_list"] = []
                    fast_save(filename=profile_filename, obj=data)
                    continue
                if "friends_list" not in data.keys():
                    print "Collect friends for user {}".format(jj)
                    friends_list = pyTweet.get_user_friends(
                        user_id=jj, limit=hop_limits["friends"], proxies=proxies, auth=auth
                    )
                    data["friends_list"] = friends_list
                    fast_save(filename=profile_filename, obj=data)
                if hop_limits["friends"] < len(data["friends_list"]):
                    place_savers["next_user_list"].update(set(data["friends_list"][0 : len(hop_limits["friends"]) - 1]))
                else:
                    place_savers["next_user_list"].update(set(data["friends_list"]))
                growth_params["h{}_friends.json".format(khop)].update(set(data["friends_list"]))
                save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)

        print "\nGet followers of each user in the cur_user_list"
        if hop_limits["followers"] != 0:
            growth_params["h{}_followers.json".format(khop)] = set([])
            print "\nCOLLECT FOLLOWERS OF CURRENT USER SET"
            for jj in place_savers["cur_user_list"]:
                # profile_filename = place_savers['finished_users'][jj]
                profile_filename = os.path.join(save_dir["twitter_profiles"], place_savers["finished_users"][jj])
                try:
                    data = ujson.load(open(profile_filename, "r"))
                    data["id"]
                except (IOError, KeyError):
                    # Re-collect the profile if the saved JSON is missing or malformed
                    user_info = pyTweet.user_lookup_userids(user_list=[jj], proxies=proxies, auth=auth)
                    if (not isinstance(user_info, list)) or (len(user_info) < 1) or ("id" not in user_info[0]):
                        continue
                    _save_profile_json(profile_struct=user_info[0], save_dir=save_dir["twitter_profiles"], khop=khop)
                    data = user_info[0]
                if data["followers_count"] < 1:
                    data["followers_list"] = []
                    fast_save(filename=profile_filename, obj=data)
                    continue
                if "followers_list" not in data.keys():
                    print "Collect followers for user {}".format(jj)
                    followers_list = pyTweet.get_user_followers(
                        user_id=jj, limit=hop_limits["followers"], proxies=proxies, auth=auth
                    )
                    data["followers_list"] = followers_list
                    fast_save(filename=profile_filename, obj=data)
                if hop_limits["followers"] < len(data["followers_list"]):
                    place_savers["next_user_list"].update(
                        set(data["followers_list"][0 : len(hop_limits["followers"]) - 1])
                    )
                else:
                    place_savers["next_user_list"].update(set(data["followers_list"]))
                growth_params["h{}_followers.json".format(khop)].update(set(data["followers_list"]))
                save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)

        # Pull out user mentions
        if ("user_mention_id" in hop_limits) and (hop_limits["user_mention_id"] != 0):
            print "\nCOLLECT USER MENTIONS OF CURRENT SET"
            growth_params["h{}_user_mentions.json".format(khop)] = set([])
            for jj in place_savers["cur_user_list"]:
                profile_filename = place_savers["finished_users"][jj]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(save_dir["twitter_timelines"], "timeline_{}.json".format(uuid_profile))
                # Load or create the timeline JSON file
                if os.path.isfile(timeline_filename):
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    # Load the timeline data
                    try:
                        tldata = ujson.load(open(timeline_filename, "r"))
                        if len(tldata) < 1:
                            continue
                        tldata[0]["text"]
                    except (IOError, KeyError):
                        # Fix timeline file
                        _save_timeline_json(
                            user_id=jj,
                            filename=timeline_filename,
                            start_date=timeline_start_date,
                            proxies=proxies,
                            auth=auth,
                        )
                        tldata = ujson.load(open(timeline_filename, "r"))
                else:
                    # Get the timeline data
                    _save_timeline_json(
                        user_id=jj,
                        filename=timeline_filename,
                        start_date=timeline_start_date,
                        proxies=proxies,
                        auth=auth,
                    )
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    tldata = ujson.load(open(timeline_filename, "r"))
                    if len(tldata) < 1:
                        continue
                # Pull out user mentions
                tl_mentions = pyTweet.pull_timeline_entitites(
                    timeline=tldata, type="user_mention_id", limit=hop_limits["user_mention_id"]
                )
                growth_params["h{}_user_mentions.json".format(khop)].update(tl_mentions)
                place_savers["next_user_list"].update(tl_mentions)
                save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)

        # Pull out replies
        if ("in_reply_to_user_id" in hop_limits) and (hop_limits["in_reply_to_user_id"] != 0):
            print "\nCOLLECT USERS CURRENT SET REPLIES TO"
            growth_params["h{}_replies.json".format(khop)] = set([])
            for jj in place_savers["cur_user_list"]:
                profile_filename = place_savers["finished_users"][jj]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(save_dir["twitter_timelines"], "timeline_{}.json".format(uuid_profile))
                # Load or create the timeline JSON file
                if os.path.isfile(timeline_filename):
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    # Load the timeline data
                    try:
                        tldata = ujson.load(open(timeline_filename, "r"))
                        tldata[0]["text"]
                    except (IOError, KeyError):
                        # Fix timeline file
                        _save_timeline_json(
                            user_id=jj,
                            filename=timeline_filename,
                            start_date=timeline_start_date,
                            proxies=proxies,
                            auth=auth,
                        )
                        tldata = ujson.load(open(timeline_filename, "r"))
                    if len(tldata) < 1:
                        continue
                else:
                    # Get the timeline data
                    _save_timeline_json(
                        user_id=jj,
                        filename=timeline_filename,
                        start_date=timeline_start_date,
                        proxies=proxies,
                        auth=auth,
                    )
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    tldata = ujson.load(open(timeline_filename, "r"))
                    if len(tldata) < 1:
                        continue
                # Pull out replies
                tl_replies = pyTweet.pull_timeline_entitites(
                    timeline=tldata, type="in_reply_to_user_id", limit=hop_limits["in_reply_to_user_id"]
                )
                place_savers["next_user_list"].update(tl_replies)
                growth_params["h{}_replies.json".format(khop)].update(tl_replies)
                save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir["twitter_profiles"], growth_obj=growth_params, cur_hop=khop)

        # Check data limit
        if ("max_data" in hop_limits) and (hop_limits["max_data"] is not None):
            data_vol = measure_data(user_dir=save_dir["twitter_profiles"], timeline_dir=save_dir["twitter_timelines"])
            if data_vol > hop_limits["max_data"]:
                print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
                    data_vol, hop_limits["max_data"]
                )
                return
        # Prepare for next iteration
        place_savers["cur_hop"] = khop + 1
        place_savers["cur_user_list"] = set(place_savers["next_user_list"])
        place_savers["next_user_list"] = set([])
        save_place_savers(user_dir=save_dir["twitter_profiles"], place_saver_obj=place_savers)
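
# Minimal usage sketch for breadth_first_search above (not part of the original function). The seed
# handles, proxy host/port, and save directories below are placeholder values, and it assumes pyTweet
# API keys are already configured for pyTweet.load_twitter_api_key_set().
import datetime

breadth_first_search(
    user_seed=["user_a", "user_b"],                  # hypothetical seed screen names
    timeline_start_date=datetime.date(2015, 1, 1),
    host="127.0.0.1",                                # placeholder proxy host
    port="8080",                                     # placeholder proxy port
    save_dir={"twitter_profiles": "/tmp/profiles", "twitter_timelines": "/tmp/timelines"},
    hop_limits={"max_hops": 2, "max_data": None, "friends": 10, "followers": 10,
                "in_reply_to_user_id": 5, "user_mention_id": 5},
)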
Example #2
0
def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir={}, hop_out_limits={}, collection_limits={}):
    """
    This function creates a network based on Twitter friends

    @param user_seed           - List of user names
    @param host                - Your host IP
    @param port                - Your port
    @param timeline_start_date - Start date (a datetime.date object) for the timelines in the collection
    @param save_dir     - Set locations for the profile and timeline directory to save .JSONs. The default will be your current working directory.
                          EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons',
                                          'twitter_timelines': '/dir/to/save/timeline/jsons'}
    @param hop_out_limits - Specify your graph constraints with the variable hop_out_limits. First determine the maximum
                          number of hops to make the graph with 'max_hops', then decide the maximum amount of data to
                          collect in 'max_data'. This is the combined size of the profile and timeline .JSON files. Set it to
                          'None' if you don't want to limit the amount of data collected. Next, set limits (per
                          individual) on how many friends, followers, replied-to users, and mentioned users to include
                          in the next hop. You can specify values in [0, Inf) or None. Specifying 'None' means that you
                          do not wish to limit the collection, and the graph will be expanded on as many of these edges as
                          possible. Occasionally, you may get back fewer edges for a user than the limit you set. Note
                          that friends and followers will be saved in the fields 'friends_list' and 'followers_list'
                          automatically. The reply and mention users are saved in timelines.
                          EX. hop_out_limits = {'max_hops': 2,              # Maximum number of hops in graph
                                                'max_data': None,           # Maximum amount of data (in GB)
                                                'friends': 0,               # Maximum friends per user to include in next hop
                                                'followers': None,          # Maximum followers per user to include in next hop
                                                'in_reply_to_user_id': 17,  # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop
                                                'user_mention_id': 21}      # Maximum 'user_mention_id' per user's timeline to include in next hop

    @param collection_limits - Suppose that you want to store friends or followers, but do not want to expand the graph
                    based on them. Specify limitations on collecting friends and followers below. Notice that reply and mention users
                    are saved in the timelines. The largest possible length of 'friends_list' will be the greater of the hop-out limit and the
                    collection limit, i.e. MAX(hop_out_limits['friends'], collection_limits['friends']). The same description applies to
                    'followers_list'.
                    EX. collection_limits = {'friends': 0,      # Maximum number of friends per user to save within the profile .JSON
                                         'followers': None}     # Maximum number of followers per user to save within the profile .JSON
    """
    # CHECK PARAMETERS
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles'])
    if not os.path.isdir(save_dir['twitter_profiles']):
        print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles'])
        os.mkdir(save_dir['twitter_profiles'])
    if ('twitter_timelines' not in save_dir.keys()) or (save_dir['twitter_timelines'].strip() == ''):
        save_dir['twitter_timelines'] = os.path.join(os.getcwd(), 'timelines')
        print "\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format(save_dir['twitter_timelines'])
    if not os.path.isdir(save_dir['twitter_timelines']):
        print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_timelines'])
        os.mkdir(save_dir['twitter_timelines'])
    # Check data amount and quit if graph has reached limit
    if ('max_data' in hop_out_limits) and (hop_out_limits['max_data'] is not None):
        data_vol = measure_data(user_dir=save_dir['twitter_profiles'], timeline_dir=save_dir['twitter_timelines'])
        if (data_vol > hop_out_limits['max_data']):
            print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(data_vol, hop_out_limits['max_data'])
            return
    # Check hop_out_limits dictionary
    if 'max_hops' not in hop_out_limits:
        hop_out_limits['max_hops'] = 6
        print "\tNo value was specified for hop_out_limits['max_hops'], the maximin number of hops in graph, so it will be set to {}.".format(hop_out_limits['max_hops'])
    if 'max_data' not in hop_out_limits:
        hop_out_limits['max_data'] = 2
        print "\tNo value was specified for hop_out_limits['max_data'], the maximin amount of data collected (in GB), so it will be set to {}.".format(hop_out_limits['max_data'])
    if 'friends' not in hop_out_limits:
        hop_out_limits['friends'] = 0
        print "\tNo value was specified for hop_out_limits['friends'], max friends per user to include in next hop, so it will be set to 0."
    if 'followers' not in hop_out_limits:
        hop_out_limits['followers'] = 0
        print "\tNo value was specified for hop_out_limits['followers'], max followers per user to include in next hop, so it will be set to 0."
    if 'in_reply_to_user_id' not in hop_out_limits:
        hop_out_limits['in_reply_to_user_id'] = 0
        print "\tNo value was specified for hop_out_limits['in_reply_to_user_id'], max 'in_reply_to_user_id' per user's timeline to include in next hop, so it will be set to 0."
    if 'user_mention_id' not in hop_out_limits:
        hop_out_limits['user_mention_id'] = 0
        print "\tNo value was specified for hop_out_limits['user_mention_id'], max 'user_mention_id' per user's timeline to include in next hop, so it will be set to 0."
    # Check collection_limits dictionary
    if 'friends' not in collection_limits:
        collection_limits['friends'] = 0
        print "\tNo value was specified for collection_limits['friends'], max number of friends per user to save with the profile .JSON, so it will be set to 0."
    if 'followers' not in collection_limits:
        collection_limits['followers'] = 0
        print "\tNo value was specified for collection_limits['followers'], max number of followers per user to save with the profile .JSON, so it will be set to 0."
    # DETERMINE COLLECTION PARAMETERS
    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = load_place_savers(save_dir['twitter_profiles'])
    print "\tAs of now {} user profiles have been collected and saved to {}".format(len(place_savers['finished_users']), save_dir['twitter_profiles'])
    print "\tThe current hop is {}".format(place_savers['cur_hop'])
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    print "\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop'])
    print "\tSo far we plan to collect {} users in hop {}".format(len(place_savers['next_user_list']), place_savers['cur_hop'] + 1)
    # Determine limits for friends/followers collection -
    if None in [hop_out_limits['friends'], collection_limits['friends']]:
        MAX_FRIENDS = None
    else:
        MAX_FRIENDS = max(hop_out_limits['friends'], collection_limits['friends'])
    if None in [hop_out_limits['followers'], collection_limits['followers']]:
        MAX_FOLLOWERS = None
    else:
        MAX_FOLLOWERS = max(hop_out_limits['followers'], collection_limits['followers'])
    # Create proxies dictionary
    proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # API AUTHORIZATION
    print "\nAPI Authorization"
    auth = pyTweet.get_authorization(twitter_keys)
    # BUILD THE GRAPH
    print "\nStart building the graph!"
    for i in range(place_savers['cur_hop'], hop_out_limits['max_hops']):
        print "\nGet information for the {}th-hop users. There are {} total users in this hop.".format(i, len(place_savers['cur_user_list']))
        print "Create the user list of the " + str(i+1) + "th-hop users."
        # Remove finished_users from next_user_list
        if (place_savers['cur_hop'] > 0):
            place_savers['cur_user_list'].difference_update(set(map(int, place_savers['finished_users'].keys())))
        # Separate list for faster results, and delete place_savers['cur_user_list'] to free space
        USERS = [list(place_savers['cur_user_list'])[z:z+100] for z in range(0, len(place_savers['cur_user_list']), 100)]
        del place_savers['cur_user_list']   # save space
        for j in range(len(USERS)):
            # Look up information of users, 100 at a time
            print "\tLook up user information"
            if i < 1:
                # The initial list contains user names or @handles
                user_info = pyTweet.user_lookup_usernames(user_list=USERS[j], proxies=proxies, auth=auth)
                USERS[j] = set([])
                for jj in range(len(user_info)):
                    USERS[j].add(int(user_info[jj]['id']))
            else:
                # All other lists will contain user ids
                user_info = pyTweet.user_lookup_userids(user_list=USERS[j], proxies=proxies, auth=auth)
            # Get friends, followers, and timelines of each user in user_info
            for k in range(len(user_info)):
                id = str(user_info[k]['id'])
                # Check to see that the user's friend/follower list hasn't already been collected
                if id in place_savers['finished_users'].keys():
                    # Load previously saved user data
                    pro_filename = os.path.join(save_dir['twitter_profiles'], 'userInfo_' + str(place_savers['finished_users'][id]) + '.json')
                    if os.path.getsize(pro_filename) == 0:
                        # File exists but it is empty
                        user_data = user_info[k]
                        user_data['khop'] = i
                        user_data['DOC'] = datetime.datetime.utcnow()
                        fast_save(filename=pro_filename, obj=user_data)
                    else:
                        try:
                            # Open and read profile .json
                            jfid = open(pro_filename)
                            user_data = ujson.load(jfid)
                            user_data['DOC'] = datetime.datetime.utcnow()
                            jfid.close()
                        except ValueError:
                            # Fail at opening profile .json, resave it
                            user_data = user_info[k]
                            user_data['khop'] = i
                            user_data['DOC'] = datetime.datetime.utcnow()
                            fast_save(filename=pro_filename, obj=user_data)
                else:
                    # The user's profile has not been collected...start now
                    place_savers['finished_users'][id] = str(uuid.uuid4())
                    pro_filename = os.path.join(save_dir['twitter_profiles'], 'userInfo_{}.json'.format(str(place_savers['finished_users'][id])))
                    # Add user information: hop, DOC
                    user_data = user_info[k]
                    user_data['khop'] = i
                    user_data['DOC'] = datetime.datetime.utcnow()
                    fast_save(filename=pro_filename, obj=user_data)
                print "\tSaved user {} information in {}.".format(id, pro_filename)
                # Collect user friends
                if 'friends_list' not in user_data:
                    friends_list = []
                    if (user_data['friends_count'] > 0) and ((MAX_FRIENDS is None) or (MAX_FRIENDS > 0)):
                        print "\tCollect friends for user {}.".format(id)
                        friends_list = pyTweet.get_user_friends(user_id=id, limit=MAX_FRIENDS, proxies=proxies, auth=auth)
                    user_data['friends_list'] = friends_list
                    fast_save(filename=pro_filename, obj=user_data)
                place_savers['next_user_list'].update(set(user_data['friends_list'][0:hop_out_limits['friends']]))    # Add friends to next_user_list
                save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
                # Collect user followers
                if 'followers_list' not in user_data:
                    followers_list = []
                    if (user_data['followers_count'] > 0) and ((MAX_FOLLOWERS is None) or (MAX_FOLLOWERS > 0)):
                        print "\tCollect followers for user {}.".format(id)
                        followers_list = pyTweet.get_user_followers(user_id=id, limit=MAX_FOLLOWERS, proxies=proxies, auth=auth)
                    user_data['followers_list'] = followers_list
                    fast_save(filename=pro_filename, obj=user_data)
                place_savers['next_user_list'].update(set(user_data['followers_list'][0:hop_out_limits['followers']]))  # Add followers to next_user_list
                save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
                # Collect timeline for user beginning from start_date
                tl_filename = os.path.join(save_dir['twitter_timelines'], 'timeline_{}.json'.format(place_savers['finished_users'][id]))
                if os.path.isfile(tl_filename):
                    print "\tThe timeline for user {} has already been collected.".format(id)
                    # Load timeline file
                    if os.path.getsize(tl_filename) == 0: continue      # Skip empty time lines
                    try:
                        jfid = open(tl_filename)
                        tldata = ujson.load(jfid)
                        jfid.close()
                    except (IOError, ValueError):
                        # Fail at opening file, recollect time line
                        print "\tCollect the timeline for user {}.".format(id)
                        tldata = pyTweet.collect_user_timeline(USER=id, USER_type='user_id', start_date=timeline_start_date, proxies=proxies, auth=auth)
                        for tl in range(len(tldata)):
                            tldata[tl]['DOC'] = datetime.datetime.utcnow()
                        fast_save(filename=tl_filename, obj=tldata)
                else:
                    print "\tCollect the timeline for user {}.".format(id)
                    tldata = pyTweet.collect_user_timeline(USER=id, USER_type='user_id', start_date=timeline_start_date, proxies=proxies, auth=auth)
                    for tl in range(len(tldata)):
                        tldata[tl]['DOC'] = datetime.datetime.utcnow()
                    fast_save(filename=tl_filename, obj=tldata)

                # Pull out user mentions, if applicable
                if ('user_mention_id' in hop_out_limits) and ((hop_out_limits['user_mention_id'] > 0) or (hop_out_limits['user_mention_id'] is None)):
                    print "\tAdd user mentions to the next hop"
                    tl_mentions = pyTweet.pull_timeline_entitites(timeline=tldata, type='user_mention_id', limit=hop_out_limits['user_mention_id'])
                    place_savers['next_user_list'].update(tl_mentions)
                    save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
                # Pull out user replies, if applicable
                if ('in_reply_to_user_id' in hop_out_limits) and ((hop_out_limits['in_reply_to_user_id'] > 0) or (hop_out_limits['in_reply_to_user_id'] is None)):
                    print "\tAdd replies to the next hop"
                    tl_replies = pyTweet.pull_timeline_entitites(timeline=tldata, type='in_reply_to_user_id', limit=hop_out_limits['in_reply_to_user_id'])
                    place_savers['next_user_list'].update(tl_replies)
                    save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
                # Check data amount and quit if graph has reached limit
                if ('max_data' in hop_out_limits) and (hop_out_limits['max_data'] is not None):
                    data_vol = measure_data(user_dir=save_dir['twitter_profiles'], timeline_dir=save_dir['twitter_timelines'])
                    if (data_vol > hop_out_limits['max_data']):
                        print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(data_vol, hop_out_limits['max_data'])
                        return
        # Remove finished_users from place_savers['next_user_list']
        place_savers['next_user_list'].difference_update(set(map(int, place_savers['finished_users'].keys())))
        # Prepare for next iteration of hop
        place_savers['cur_user_list'] = place_savers['next_user_list']
        place_savers['next_user_list'] = set([])
        place_savers['cur_hop'] += 1
        save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers)
        print "There are ", len(place_savers['cur_user_list']), " users in the next iteration of users."
    print "\nDone building graph!"
Example #3
0
def depth_first_cascade_search(user_seed, tl_start_date, tl_end_date, postgres_params, host, port, save_dir={}, hop_limits={}):
    """
    This function builds a network based on users relevant to seed keywords
    Requires that a PostgreSQL database already exist

    :param user_seed: List of user names
    :param tl_start_date: Start date (a datetime.date object) for the timelines in the collection
    :param tl_end_date: End date (datetime.date object) of timelines in collection
    :param postgres_params: Dictionary containing the fields 'dbname', 'user', and 'password' (and optionally 'host' and 'port') required to connect to the database
    :param host: Your host IP
    :param port: Your port
    :param save_dir: Directory storing sampling place savers and growth parameters
                    EX. save_dir = {'place_saver_filename': 'name of file'}
    :param hop_limits: Specify your graph constraints with the variable hop_limits. Set the maximum number of hops to
                       make a graph with 'max_hops'.
                          EX. hop_limits = {'max_hops': 2}              # Maximum number of hops in graph
    """
    # CHECK PARAMETERS
    print "\nCheck parameters"
    # Timeline start and end dates
    assert (isinstance(tl_start_date, datetime.date) and isinstance(tl_end_date, datetime.date)), "Both tl_start_date and tl_end_date must be datetime.date objects (i.e. tl_start_date = datetime.date(year=2014, month=1, day=1))."
    assert ((tl_end_date - tl_start_date) > datetime.timedelta(0)), "The end date must be later than the start date. Check the assignments of tl_start_date and tl_end_date."
    # Check PostgreSQL parameters
    assert (('dbname' in postgres_params.keys()) and ('user' in postgres_params.keys()) and ('password' in postgres_params.keys())), "Verify the parameters. The possible fields are 'dbname', 'user', 'password', 'host', and 'port'."
    try:
        conn = psycopg2.connect(" ".join(map(lambda x,y: "{}='{}'".format(x,y), postgres_params.keys(),postgres_params.values())))
        cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    except psycopg2.OperationalError:
        print "OperationalError: Check your login credentials.  Make sure the database exists as well."
        return
    # Check hop_limits dictionary
    if 'max_hops' not in hop_limits:
        hop_limits['max_hops'] = 5
        print "\tNo value was specified for hop_limits['max_hops'], the maximin number of hops in graph, so it will be set to {}.".format(hop_limits['max_hops'])
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles'])
    if not os.path.isdir(save_dir['twitter_profiles']):
        print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles'])
        os.mkdir(save_dir['twitter_profiles'])

    # SET UP SECONDARY PARAMETERS
    # Create proxies dictionary
    proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = breadth_first_sampling.load_place_savers(save_dir['twitter_profiles'])
    print "\tThe current hop is {}".format(place_savers['cur_hop'])
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    print "\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop'])
    breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
    # Load growth parameters
    growth_params = breadth_first_sampling.load_growth_params(save_dir['twitter_profiles'])

    # API AUTHORIZATION
    print "\nAPI Authorization"
    OAUTH = pyTweet.get_authorization(twitter_keys)

    # CONFIGURE SCHEMA FOR TF-IDF ANALYSIS
    print "\nConfigure database for TF-IDF analysis"
    json_to_database.configure_database_to_build_network(cur, conn)
    new_columns = [{'table': 'users', 'col': 'decision_tfidf', 'type': 'FLOAT'}]    # used
    for i in new_columns:
        try:
            json_to_database.make_sql_edit(cur, conn, "ALTER TABLE {} ADD {} {};".format(i['table'], i['col'], i['type']))
        except psycopg2.ProgrammingError:
            conn.rollback()

    # SAMPLING LOOP
    print "\nBegin collection"
    cur_hop = place_savers['cur_hop']
    for ii in range(cur_hop, hop_limits['max_hops']):
        print "\nWorking on collecting hop {} containing {} profiles.".format(ii, len(place_savers['cur_user_list']))
        if ii < 1:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='screen_name', hop=ii)
            # Replace user names in place_savers['cur_user_list'] with user IDs!
            user_id_set = set([])
            for jj in place_savers['cur_user_list']:
                cur.execute("SELECT user_id FROM users WHERE screen_name = '{}';".format(jj))
                user_id_set.add(cur.fetchone()[0])
            place_savers['cur_user_list'] = set(user_id_set)
            del user_id_set
            breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
        else:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii)
        # Do not expand users who have more than 1000 friends+followers
        json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);")

        # GET TIMELINES
        for jj in place_savers['cur_user_list']:
            _get_timeline_wrapper(cur=cur, conn=conn, user_id=jj, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
        # Add all of the hashtags from the seed of users
        if ii < 1:
            total_ht_h0 = 0
            print "\nAdd all of the hashtags from the seed of users"
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE WHERE (khop=0) AND (expand_user IS NULL);")
            cur.execute("SELECT DISTINCT user_id FROM users WHERE (khop=0) AND (expand_user IS TRUE);")
            q = cur.fetchall()
            for qq in q:
                nAdd = _add_all_hashtags(cur=cur, conn=conn, user_id=qq[0], tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii)
                total_ht_h0 = total_ht_h0 + nAdd
            print "Added {} hashtags from hop {}.".format(total_ht_h0, ii)
            cur.execute("SELECT COUNT(*) FROM users WHERE (khop=0) AND (expand_user=TRUE);")
            print "Expand {} users from hop {}".format(cur.fetchone()[0], ii)

        # SAVE GRAPH PARAMS
        growth_params['h{}_users.json'.format(ii)] = set(place_savers['cur_user_list'])
        growth_params['h{}_missing.json'.format(ii)] = set([])
        growth_params['h{}_extendTRUE.json'.format(ii)] = set([])
        growth_params['h{}_extendFALSE.json'.format(ii)] = set([])
        growth_params['h{}_extendNULL.json'.format(ii)] = set([])
        for uu in place_savers['cur_user_list']:
            cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu))
            q = cur.fetchone()
            if q is None:
                growth_params['h{}_missing.json'.format(ii)].add(uu)
                continue
            if q[0] is None:
                growth_params['h{}_extendNULL.json'.format(ii)].add(uu)
            elif q[0] is True:
                growth_params['h{}_extendTRUE.json'.format(ii)].add(uu)
            elif q[0] is False:
                growth_params['h{}_extendFALSE.json'.format(ii)].add(uu)
            else:
                print "ERROR in saving growth parameters! Invalid data type..."
                continue
        breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)

        # GET NEXT SET OF USERS
        if ii < (hop_limits['max_hops'] - 1):
            print "\nCHOOSE NEXT SET OF USERS FROM USER MENTIONS"
            new_um = set([])
            # Add user mentions to next hop
            for jj in place_savers['cur_user_list']:
                cur.execute(cur.mogrify("SELECT DISTINCT tweets.user_mentions FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE (users.user_id = %s) AND (tweets.created_at >= %s AND tweets.created_at <= %s) AND (users.expand_user IS TRUE) AND (tweets.user_mentions IS NOT NULL OR tweets.user_mentions != '{}');", (jj, tl_start_date, tl_end_date)))
                uids = cur.fetchall()
                for kk in uids:
                    new_um.update(set(kk[0]))
            print "There are {} user mentions from hop {}".format(len(new_um), ii)
            # Get user mention profiles
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=new_um, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            # Expand, or not, user mentions
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);")
            # Expand remaining user mentions
            growth_params['h{}_um_missing.json'.format(ii)] = set([])
            growth_params['h{}_um_extendTRUE.json'.format(ii)] = set([])
            growth_params['h{}_um_extendFALSE.json'.format(ii)] = set([])
            growth_params['h{}_um_extendNULL.json'.format(ii)] = set([])
            new_um_tracker = set(new_um)
            for uu in new_um_tracker:
                cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu))
                q = cur.fetchone()
                if q is None:
                    new_um.remove(uu)
                    growth_params['h{}_um_missing.json'.format(ii)].add(uu)
                    continue
                if q[0] is False:
                    new_um.remove(uu)
                    growth_params['h{}_um_extendFALSE.json'.format(ii)].add(uu)
                else:
                    json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE WHERE (user_id = {}) AND (expand_user IS NULL);".format(uu))
            new_um_tracker = set(new_um)
            for uu in new_um_tracker:
                cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu))
                q = cur.fetchone()
                if q is None:
                    # print "this is strange"
                    new_um.remove(uu)
                    growth_params['h{}_um_missing.json'.format(ii)].add(uu)
                    continue
                if q[0] is True:
                    growth_params['h{}_um_extendTRUE.json'.format(ii)].add(uu)
                if q[0] is None:
                    growth_params['h{}_um_extendNULL.json'.format(ii)].add(uu)
                    new_um.remove(uu)
                    print "This is not supposed to happen!!!"
            del new_um_tracker
            assert (len(growth_params['h{}_um_extendNULL.json'.format(ii)]) < 1), "There are user mentions assigned expand_user=NULL!"
            place_savers['next_user_list'].update(new_um)
            breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)

            print "\nCHOOSE NEXT SET OF USERS FROM FRIENDS AND FOLLOWERS"
            print "Collect friends"
            for jj in place_savers['cur_user_list']:
                cur.execute("SELECT expand_user FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (friends_count > 0) AND (friends_list IS NULL);".format(jj))
                q = cur.fetchone()
                if q is None:
                    continue
                if q[0] is True:
                    print "\tCollect friends for user {}.".format(jj)
                    friends_list = pyTweet.get_user_friends(user_id=jj, proxies=proxies, auth=OAUTH, limit=1000)
                    json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET friends_list = %s WHERE user_id = %s;", (friends_list, jj)))
            print "Collect followers"
            for jj in place_savers['cur_user_list']:
                cur.execute("SELECT expand_user FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (followers_count > 0) AND (followers_list IS NULL);".format(jj))
                q = cur.fetchone()
                if q is None:
                    continue
                if q[0] is True:
                    print "\tCollect followers for user {}.".format(jj)
                    followers_list = pyTweet.get_user_followers(user_id=jj, proxies=proxies, auth=OAUTH, limit=1000)
                    json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET followers_list = %s WHERE user_id = %s;", (followers_list, jj)))
            print "Get profiles and timelines of friends and followers"
            fids = set([])
            for jj in place_savers['cur_user_list']:
                cur.execute("SELECT friends_list,followers_list FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (((friends_list IS NOT NULL) AND (ARRAY_LENGTH(friends_list,1) > 0)) OR ((followers_list IS NOT NULL) AND (ARRAY_LENGTH(followers_list,1) > 0)));".format(jj))
                flist = cur.fetchone()
                if flist is None:
                    continue
                if flist[0] is not None:
                    fids.update(flist[0])
                if flist[1] is not None:
                    fids.update(flist[1])
            print "There are {} friends/followers of hop {}".format(len(fids), ii)
            # Get profiles of friends/followers
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=fids, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            # Filter with high degree rule and get timelines
            json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);")
            # Remove expand_user=FALSE from friend/follower list
            growth_params['h{}_frfo_missing.json'.format(ii)] = set([])
            growth_params['h{}_frfo_extendFALSE.json'.format(ii)] = set([])
            jj_list = list(fids)
            for jj in jj_list:
                cur.execute("SELECT expand_user,has_timeline FROM users WHERE user_id = {};".format(jj))
                q = cur.fetchone()
                if q is None:
                    growth_params['h{}_frfo_missing.json'.format(ii)].add(jj)
                    fids.remove(jj)
                    continue
                if q[0] is False:
                    fids.remove(jj)
                    growth_params['h{}_frfo_extendFALSE.json'.format(ii)].add(jj)
                    continue
                if (q[0] is not False) and (q[1] is None):
                    _get_timeline_wrapper(cur=cur, conn=conn, user_id=jj, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
            del jj_list
            # Find the most similar friends/followers, and expand the top 5%
            original_frfo_set = candid_tfidf.find_most_similar_followers(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, user_ids=fids, prev_users=place_savers['cur_user_list'])
            growth_params['h{}_frfo_extendTRUE.json'.format(ii)] = set(fids)
            growth_params['h{}_frfo_extendNULL.json'.format(ii)] = set(original_frfo_set.difference(fids))
            del original_frfo_set
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            place_savers['next_user_list'].update(fids)
            breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)

            if ii > 0:
                print "\nFIND USERS WITH AT LEAST ONE HASHTAG IN COMMON WITH TOPICS"
                new_relevant_users = _find_relevant_users(cur=cur, conn=conn, user_ids=growth_params['h{}_frfo_extendNULL.json'.format(ii)])
                growth_params['h{}_relevant_extendTRUE.json'.format(ii)] = set(new_relevant_users)
                breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
                place_savers['next_user_list'].update(new_relevant_users)
                breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)

        # PREPARE FOR NEXT HOP
        place_savers['cur_hop'] += 1
        place_savers['cur_user_list'] = set(place_savers['next_user_list'])
        place_savers['next_user_list'] = set([])
        breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
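The loop above checkpoints its progress through the place_savers and growth_params objects, advancing cur_hop and swapping next_user_list into cur_user_list after every hop. Below is a minimal sketch of that checkpoint structure; the key names come from the sampling code in these examples, while the snapshot helper and file name are illustrative assumptions rather than part of pyTweet's documented API.

import ujson

# Hypothetical snapshot of the sampling state used by the hop loops above.
place_savers = {
    'cur_hop': 0,                     # hop currently being collected
    'cur_user_list': set([]),         # users to collect in this hop
    'next_user_list': set([]),        # users queued for the next hop
    'finished_users': {},             # user_id -> profile JSON file name
    'unavailable_accounts': set([]),  # accounts that could not be resolved
}

def save_snapshot(obj, filename='place_savers_snapshot.json'):
    # Sets are not JSON serializable, so convert them to lists before writing
    serializable = {}
    for k, v in obj.items():
        serializable[k] = list(v) if isinstance(v, set) else v
    with open(filename, 'w') as fid:
        ujson.dump(serializable, fid)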
Example #4
def breadth_first_search(user_seed,
                         timeline_start_date,
                         host,
                         port,
                         save_dir={},
                         hop_limits={}):
    """
    This function creates a network based on Twitter friends

    :param user_seed: List of user names
    :param host: Your host IP
    :param port: Your port
    :param timeline_start_date: Start date (a datetime.date object) for timelines in the collection
    :param save_dir: Set locations for the profile and timeline directory to save .JSONs. The default will be your current working directory.
                          EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons',
                                          'twitter_timelines': '/dir/to/save/timeline/jsons'}

    :param hop_limits: Specify your graph constraints with the variable hop_limits. First determine the maximum
                          number of hops to make the graph with 'max_hops', then decide the maximum amount of data to
                          collect in 'max_data'. This is the combined size of the profile and timeline .JSON files. Set it
                          to 'None' if you don't want to limit the amount of data collected. Next, set limits (per
                          individual) on how many friends, followers, replied-to users, and mentioned users to include
                          in the next hop. You can specify values in [0, Inf) or None. Specifying 'None' means that you
                          do not wish to limit the collection, and the graph will expand along as many of these edges as
                          possible. Occasionally, you may get back fewer edges for a user than the limit you set. Note
                          that friends and followers are saved in the fields 'friends_list' and 'followers_list'
                          automatically. The reply and mention users are saved in timelines.
                          EX. hop_limits = {'max_hops': 2,             # Maximum number of hops in graph
                                                'max_data': None,           # Maximum amount of data (in GB)
                                                'friends': 0,               # Maximum friends per user to include in next hop
                                                'followers': None,          # Maximum followers per user to include in next hop
                                                'in_reply_to_user_id': 17,  # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop
                                                'user_mention_id': 21}      # Maximum 'user_mention_id' per user's timeline to include in next hop
    """
    # CHECK PARAMETERS
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles'
            not in save_dir.keys()) or (save_dir['twitter_profiles'].strip()
                                        == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(
            save_dir['twitter_profiles'])
    if not os.path.isdir(save_dir['twitter_profiles']):
        print "\tThe directory {} does not exist...creating it now".format(
            save_dir['twitter_profiles'])
        os.mkdir(save_dir['twitter_profiles'])
    if ('twitter_timelines'
            not in save_dir.keys()) or (save_dir['twitter_timelines'].strip()
                                        == ''):
        save_dir['twitter_timelines'] = os.path.join(os.getcwd(), 'timelines')
        print "\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format(
            save_dir['twitter_timelines'])
    if not os.path.isdir(save_dir['twitter_timelines']):
        print "\tThe directory {} does not exist...creating it now".format(
            save_dir['twitter_timelines'])
        os.mkdir(save_dir['twitter_timelines'])
    # Check hop_limits dictionary
    hop_limits_defaults = {
        'max_hops': [6, 'the maximum number of hops in the graph'],
        'max_data': [2, 'the maximum amount of data collected (in GB)'],
        'friends': [0, 'max friends per user to include in next hop'],
        'followers': [0, 'max followers per user to include in next hop'],
        'in_reply_to_user_id': [
            0,
            "max 'in_reply_to_user_id' per user's timeline to include in next hop"
        ],
        'user_mention_id': [
            0, "max 'user_mention_id' per user's timeline to include in next hop"
        ]
    }
    for kk in hop_limits_defaults.keys():
        if kk not in hop_limits:
            hop_limits[kk] = hop_limits_defaults[kk][0]
            print "\tNo Value was specified for hop_limits['{}'], {}, so it will be set to {}.".format(
                kk, hop_limits_defaults[kk][1], hop_limits_defaults[kk][0])
    # Check data amount and quit if graph has reached limit
    if ('max_data' in hop_limits) and (hop_limits['max_data'] is not None):
        data_vol = measure_data(user_dir=save_dir['twitter_profiles'],
                                timeline_dir=save_dir['twitter_timelines'])
        if (data_vol > hop_limits['max_data']):
            print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
                data_vol, hop_limits['max_data'])
            return

    # DETERMINE COLLECTION PARAMETERS
    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = load_place_savers(save_dir['twitter_profiles'])
    print "\tAs of now {} user profiles have been collected and saved to {}".format(
        len(place_savers['finished_users']), save_dir['twitter_profiles'])
    print "\tThe current hop is {}".format(place_savers['cur_hop'])
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    save_place_savers(user_dir=save_dir['twitter_profiles'],
                      place_saver_obj=place_savers)
    print "\tWe will collect {} users in hop {}".format(
        len(place_savers['cur_user_list']), place_savers['cur_hop'])
    # Load growth parameters
    growth_params = load_growth_params(save_dir['twitter_profiles'])
    # Create proxies dictionary
    proxies = {
        'http': 'http://%s:%s' % (host, port),
        'https': 'http://%s:%s' % (host, port)
    }
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # API authorization
    auth = pyTweet.get_authorization(twitter_keys)

    # BUILD THE GRAPH
    print "\nStart building the graph!"
    for khop in range(place_savers['cur_hop'], hop_limits['max_hops']):
        print "\nGet information for the {}th-hop users. There are {} total users in this hop.".format(
            khop, len(place_savers['cur_user_list']))
        print "Create the user list of the {}th-hop users as well.".format(
            khop + 1)

        # Get profile information of users in cur_user_list
        print "\nCOLLECT PROFILE INFORMATION FOR THE CURRENT SET OF USERS"
        if khop < 1:
            # Find profiles to collect
            profiles_to_collect = set(place_savers['cur_user_list'])
            for json_filename in place_savers['finished_users'].values():
                data = ujson.load(
                    open(
                        os.path.join(save_dir['twitter_profiles'],
                                     json_filename), 'r'))
                if data['screen_name'] in place_savers['cur_user_list']:
                    profiles_to_collect.discard(data['screen_name'])
            # Collect and save profiles
            user_info = []
            if len(profiles_to_collect) > 0:
                print "\nstart collecting profiles: {} profiles".format(
                    len(profiles_to_collect))
                user_info = pyTweet.user_lookup_usernames(
                    user_list=list(profiles_to_collect),
                    proxies=proxies,
                    auth=auth)
                if isinstance(user_info, dict) and ('errors'
                                                    in user_info.keys()):
                    print "\nThe initial seed cannot be collected..."
                    print "Twitter error message: ", user_info
                # Save profile information
                # print "user_info: ", user_info
                # print type(user_info)
                for udata in user_info:
                    # print "udata: ", udata
                    # print type(udata)
                    json_filename = _save_profile_json(
                        profile_struct=udata,
                        save_dir=save_dir['twitter_profiles'],
                        khop=khop)
                    place_savers['finished_users'][udata['id']] = json_filename
            # Convert screen names to user IDs in cur_user_list, identify unavailable accounts as well
            all_screennames = {}  # Keys are screen names, values are profile file names
            jsons = filter(lambda k: re.match('userInfo_*', k),
                           os.listdir(save_dir['twitter_profiles']))
            for jj in jsons:
                try:
                    full_filename = os.path.join(save_dir['twitter_profiles'],
                                                 jj)
                    if os.path.getsize(full_filename) != 0:
                        jfid = open(full_filename)
                        profile = ujson.load(jfid)
                        jfid.close()
                        all_screennames[profile['screen_name']] = jj
                except ValueError:
                    continue
            # Get corresponding user IDs for each screen name in cur_user_list
            cur_user_list_ids = set([])
            for scn_name in profiles_to_collect.union(
                    place_savers['cur_user_list']):
                if scn_name in all_screennames.keys():
                    jfid = open(
                        os.path.join(save_dir['twitter_profiles'],
                                     all_screennames[scn_name]))
                    profile = ujson.load(jfid)
                    jfid.close()
                    if 'id' in profile:
                        cur_user_list_ids.add(int(profile['id']))
                    else:
                        place_savers['unavailable_accounts'].add(scn_name)
                else:
                    place_savers['unavailable_accounts'].add(scn_name)
            print cur_user_list_ids
            del profiles_to_collect
            place_savers['cur_user_list'] = set(cur_user_list_ids)
        else:
            # Collect and save profiles
            profiles_to_collect = set(
                place_savers['cur_user_list']).difference(
                    set(map(int, place_savers['finished_users'].keys())))
            user_info = pyTweet.user_lookup_userids(
                user_list=list(profiles_to_collect),
                proxies=proxies,
                auth=auth)
            for udata in user_info:
                json_filename = _save_profile_json(
                    profile_struct=udata,
                    save_dir=save_dir['twitter_profiles'],
                    khop=khop)
                place_savers['finished_users'][udata['id']] = json_filename
            # Update current user list, and identify unavailable accounts
            new_cur_user_list = set([])
            for uid in profiles_to_collect.union(
                    set(place_savers['cur_user_list'])):
                if uid in place_savers['unavailable_accounts']:
                    continue
                if uid in place_savers['finished_users'].keys():
                    new_cur_user_list.add(uid)
                else:
                    place_savers['unavailable_accounts'].add(uid)
            place_savers['cur_user_list'] = set(new_cur_user_list)
            del new_cur_user_list

        # Save place saving variables
        growth_params['h{}_users.json'.format(khop)] = set(
            place_savers['cur_user_list'])
        save_growth_params(user_dir=save_dir['twitter_profiles'],
                           growth_obj=growth_params,
                           cur_hop=khop)
        save_place_savers(user_dir=save_dir['twitter_profiles'],
                          place_saver_obj=place_savers)

        # Get timelines for each user in user_info
        print "\nCOLLECT TIME LINES FOR CURRENT SET OF USERS"
        for uid in place_savers['cur_user_list']:
            if uid in place_savers['finished_users'].keys():
                profile_filename = place_savers['finished_users'][uid]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(
                    save_dir['twitter_timelines'],
                    "timeline_{}.json".format(uuid_profile))
                try:
                    tldata = ujson.load(open(profile_filename, 'r'))
                    tldata['id']  # raises KeyError if the profile JSON is incomplete
                except (IOError, KeyError):
                    # Re-collect the profile if its JSON is missing or incomplete
                    user_info = pyTweet.user_lookup_userids(user_list=[uid],
                                                            proxies=proxies,
                                                            auth=auth)
                    if (not isinstance(user_info, list)) or (
                            len(user_info) < 1) or ('id' not in user_info[0]):
                        continue
                    json_filename = _save_profile_json(
                        profile_struct=user_info[0],
                        save_dir=save_dir['twitter_profiles'],
                        khop=khop)
                    place_savers['finished_users'][uid] = json_filename
                    tldata = user_info[0]
                if ('has_timeline'
                        in tldata.keys()) and (tldata['has_timeline'] is True):
                    continue
                if not os.path.isfile(timeline_filename):
                    print "Collect the timeline for user {}.".format(uid)
                    tldata = pyTweet.collect_user_timeline(
                        USER=uid,
                        USER_type='user_id',
                        start_date=timeline_start_date,
                        proxies=proxies,
                        auth=auth)
                    for tl in range(len(tldata)):
                        tldata[tl]['DOC'] = datetime.datetime.utcnow(
                        ).strftime("%m-%d-%Y %H:%M:%S %z")
                        tldata[tl]['has_timeline'] = True
                    # Save the timeline to its own file; writing it to profile_filename
                    # would overwrite the user's profile JSON with tweet data
                    fast_save(filename=timeline_filename, obj=tldata)

        print "\nGet friends of each user in cur_user_list"
        if hop_limits['friends'] != 0:
            growth_params["h{}_friends.json".format(khop)] = set([])
            print "\nCOLLECT FRIENDS OF CURRENT USER SET"
            # print "place_savers['cur_user_list']: ", place_savers['cur_user_list']
            for jj in place_savers['cur_user_list']:
                profile_filename = os.path.join(
                    save_dir['twitter_profiles'],
                    place_savers['finished_users'][jj])
                try:
                    data = ujson.load(open(profile_filename, 'r'))
                    # print data['id']
                except (IOError, KeyError, TypeError):
                    user_info = pyTweet.user_lookup_userids(user_list=[jj],
                                                            proxies=proxies,
                                                            auth=auth)
                    if (not isinstance(user_info, list)) or (
                            len(user_info) < 1) or ('id' not in user_info[0]):
                        continue
                    json_filename = _save_profile_json(
                        profile_struct=user_info[0],
                        save_dir=save_dir['twitter_profiles'],
                        khop=khop)
                    place_savers['finished_users'][jj] = json_filename
                    data = user_info[0]
                if data['friends_count'] < 1:
                    data['friends_list'] = []
                    fast_save(filename=profile_filename, obj=data)
                    continue
                if 'friends_list' not in data.keys():
                    print "Collect friends for user {}".format(jj)
                    friends_list = pyTweet.get_user_friends(
                        user_id=jj,
                        limit=hop_limits['friends'],
                        proxies=proxies,
                        auth=auth)
                    data['friends_list'] = friends_list
                    fast_save(filename=profile_filename, obj=data)
                if hop_limits['friends'] < len(data['friends_list']):
                    place_savers['next_user_list'].update(
                        set(data['friends_list'][0:hop_limits['friends']]))
                else:
                    place_savers['next_user_list'].update(
                        set(data['friends_list']))
                growth_params["h{}_friends.json".format(khop)].update(
                    set(data['friends_list']))
                save_place_savers(user_dir=save_dir['twitter_profiles'],
                                  place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir['twitter_profiles'],
                               growth_obj=growth_params,
                               cur_hop=khop)

        print "\nGet followers of each user in the cur_user_list"
        if hop_limits['followers'] != 0:
            growth_params["h{}_followers.json".format(khop)] = set([])
            print "\nCOLLECT FOLLOWERS OF CURRENT USER SET"
            for jj in place_savers['cur_user_list']:
                # profile_filename = place_savers['finished_users'][jj]
                profile_filename = os.path.join(
                    save_dir['twitter_profiles'],
                    place_savers['finished_users'][jj])
                try:
                    data = ujson.load(open(profile_filename, 'r'))
                    data['id']
                except (IOError, KeyError):
                    user_info = pyTweet.user_lookup_userids(user_list=[jj],
                                                            proxies=proxies,
                                                            auth=auth)
                    if (not isinstance(user_info, list)) or (
                            len(user_info) < 1) or ('id' not in user_info[0]):
                        continue
                    _save_profile_json(profile_struct=user_info[0],
                                       save_dir=save_dir['twitter_profiles'],
                                       khop=khop)
                    data = user_info[0]
                if data['followers_count'] < 1:
                    data['followers_list'] = []
                    fast_save(filename=profile_filename, obj=data)
                    continue
                if 'followers_list' not in data.keys():
                    print "Collect followers for user {}".format(jj)
                    followers_list = pyTweet.get_user_followers(
                        user_id=jj,
                        limit=hop_limits['followers'],
                        proxies=proxies,
                        auth=auth)
                    data['followers_list'] = followers_list
                    fast_save(filename=profile_filename, obj=data)
                if hop_limits['followers'] < len(data['followers_list']):
                    place_savers['next_user_list'].update(
                        set(data['followers_list'][0:hop_limits['followers']]))
                else:
                    place_savers['next_user_list'].update(
                        set(data['followers_list']))
                growth_params["h{}_followers.json".format(khop)].update(
                    set(data['followers_list']))
                save_place_savers(user_dir=save_dir['twitter_profiles'],
                                  place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir['twitter_profiles'],
                               growth_obj=growth_params,
                               cur_hop=khop)

        # Pull out user mentions
        if ('user_mention_id'
                in hop_limits) and (hop_limits['user_mention_id'] != 0):
            print "\nCOLLECT USER MENTIONS OF CURRENT SET"
            growth_params["h{}_user_mentions.json".format(khop)] = set([])
            for jj in place_savers['cur_user_list']:
                profile_filename = place_savers['finished_users'][jj]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(
                    save_dir['twitter_timelines'],
                    "timeline_{}.json".format(uuid_profile))
                # Load or create the timeline JSON file
                if os.path.isfile(timeline_filename):
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    # Load the timeline data
                    try:
                        tldata = ujson.load(open(timeline_filename, 'r'))
                        if len(tldata) < 1:
                            continue
                        tldata[0]['text']
                    except (IOError, KeyError):
                        # Fix timeline file
                        _save_timeline_json(user_id=jj,
                                            filename=timeline_filename,
                                            start_date=timeline_start_date,
                                            proxies=proxies,
                                            auth=auth)
                        tldata = ujson.load(open(timeline_filename, 'r'))
                else:
                    # Get the timeline data
                    _save_timeline_json(user_id=jj,
                                        filename=timeline_filename,
                                        start_date=timeline_start_date,
                                        proxies=proxies,
                                        auth=auth)
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    tldata = ujson.load(open(timeline_filename, 'r'))
                    if len(tldata) < 1:
                        continue
                # Pull out user mentions
                tl_mentions = pyTweet.pull_timeline_entitites(
                    timeline=tldata,
                    type='user_mention_id',
                    limit=hop_limits['user_mention_id'])
                growth_params["h{}_user_mentions.json".format(khop)].update(
                    tl_mentions)
                place_savers['next_user_list'].update(tl_mentions)
                save_place_savers(user_dir=save_dir['twitter_profiles'],
                                  place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir['twitter_profiles'],
                               growth_obj=growth_params,
                               cur_hop=khop)

        # Pull out replies
        if ('in_reply_to_user_id'
                in hop_limits) and (hop_limits['in_reply_to_user_id'] != 0):
            print "\nCOLLECT USERS CURRENT SET REPLIES TO"
            growth_params["h{}_replies.json".format(khop)] = set([])
            for jj in place_savers['cur_user_list']:
                profile_filename = place_savers['finished_users'][jj]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(
                    save_dir['twitter_timelines'],
                    "timeline_{}.json".format(uuid_profile))
                # Load or create the timeline JSON file
                if os.path.isfile(timeline_filename):
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    # Load the timeline data
                    try:
                        tldata = ujson.load(open(timeline_filename, 'r'))
                        if len(tldata) < 1:
                            continue
                        tldata[0]['text']  # raises KeyError if the timeline JSON is malformed
                    except (IOError, KeyError):
                        # Fix timeline file
                        _save_timeline_json(user_id=jj,
                                            filename=timeline_filename,
                                            start_date=timeline_start_date,
                                            proxies=proxies,
                                            auth=auth)
                        tldata = ujson.load(open(timeline_filename, 'r'))
                    if len(tldata) < 1:
                        continue
                else:
                    # Get the timeline data
                    _save_timeline_json(user_id=jj,
                                        filename=timeline_filename,
                                        start_date=timeline_start_date,
                                        proxies=proxies,
                                        auth=auth)
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    tldata = ujson.load(open(timeline_filename, 'r'))
                    if len(tldata) < 1:
                        continue
                # Pull out replies
                tl_replies = pyTweet.pull_timeline_entitites(
                    timeline=tldata,
                    type='in_reply_to_user_id',
                    limit=hop_limits['in_reply_to_user_id'])
                place_savers['next_user_list'].update(tl_replies)
                growth_params["h{}_replies.json".format(khop)].update(
                    tl_replies)
                save_place_savers(user_dir=save_dir['twitter_profiles'],
                                  place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir['twitter_profiles'],
                               growth_obj=growth_params,
                               cur_hop=khop)

        # Check data limit
        if ('max_data' in hop_limits) and (hop_limits['max_data'] is not None):
            data_vol = measure_data(user_dir=save_dir['twitter_profiles'],
                                    timeline_dir=save_dir['twitter_timelines'])
            if (data_vol > hop_limits['max_data']):
                print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
                    data_vol, hop_limits['max_data'])
                return
        # Prepare for next iteration
        place_savers['cur_hop'] = khop + 1
        place_savers['cur_user_list'] = set(place_savers['next_user_list'])
        place_savers['next_user_list'] = set([])
        save_place_savers(user_dir=save_dir['twitter_profiles'],
                          place_saver_obj=place_savers)
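For reference, here is a hedged usage sketch of the breadth-first builder above. The seed names, proxy host and port, directories, and limits are placeholder values chosen to match the docstring; they are not values from the original project.

import datetime

if __name__ == '__main__':
    # Illustrative call only: every argument value below is a made-up placeholder.
    breadth_first_search(
        user_seed=['nasa', 'mit'],                      # hypothetical seed screen names
        timeline_start_date=datetime.date(2014, 1, 1),  # collect timelines from this date on
        host='127.0.0.1',                               # proxy host IP
        port=8080,                                      # proxy port
        save_dir={'twitter_profiles': '/data/profiles',
                  'twitter_timelines': '/data/timelines'},
        hop_limits={'max_hops': 2,
                    'max_data': None,
                    'friends': 100,
                    'followers': 100,
                    'in_reply_to_user_id': 10,
                    'user_mention_id': 10})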
Example #5
def depth_first_causal_search(user_seed, topic_seed, tl_start_date, tl_end_date, postgres_params, host, port, save_dir={}, hop_limits={}, collection_limits={}):
    """
    This function builds a network based on users relevant to seed keywords
    Requires that a PostgreSQL database already exist

    :param user_seed: List of user names
    :param topic_seed: List of seed topics
    :param tl_start_date: Start date (a datetime.date object) for timelines in the collection
    :param tl_end_date: End date (a datetime.date object) for timelines in the collection
    :param postgres_params: Dictionary containing the fields 'dbname', 'user', and 'password' (and optionally 'host' and 'port') needed to connect to the database
    :param host: Your host IP
    :param port: Your port
    :param save_dir: Directory to save sampling and growth parameters
    :param hop_limits: Specify your graph constrains with the variable hop_limits. Set the maximum number of hops to
                       make a graph with 'max_hops'.
                          EX. hop_limits = {'max_hops': 2}              # Maximum number of hops in graph
    :param collection_limits: Specify the term-frequency calculation and threshold percentile
                    EX. collection_limits = {'threshold_percentile': 0.05,  # Threshold percentile for ....
                                             'tf_type': 'raw'}              # TF calculation type
    """
    # CHECK PARAMETERS
    print "\nCheck parameters"
    # Timeline start and end dates
    assert (isinstance(tl_start_date, datetime.date) and isinstance(tl_end_date, datetime.date)), "Both tl_start_date and tl_end_date must be datetime.date objects (e.g. tl_start_date = datetime.date(year=2014, month=1, day=1))."
    assert ((tl_end_date - tl_start_date) > datetime.timedelta(0)), "The end date must be later than the start date. Check the assignments of tl_start_date and tl_end_date."
    # Check PostgreSQL parameters
    assert (('dbname' in postgres_params.keys()) and ('user' in postgres_params.keys()) and ('password' in postgres_params.keys())), "Verify the parameters. The possible fields are 'dbname', 'user', 'password', 'host', and 'port'."
    try:
        conn = psycopg2.connect(" ".join(map(lambda x,y: "{}='{}'".format(x,y), postgres_params.keys(),postgres_params.values())))
        cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    except psycopg2.OperationalError:
        print "OperationalError: Check your login credentials.  Make sure the database exists as well."
        return
    # Check hop_limits dictionary
    if 'max_hops' not in hop_limits:
        hop_limits['max_hops'] = 5
        print "\tNo value was specified for hop_limits['max_hops'], the maximin number of hops in graph, so it will be set to {}.".format(hop_limits['max_hops'])
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles'])
    if not os.path.isdir(save_dir['twitter_profiles']):
        print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles'])
        os.mkdir(save_dir['twitter_profiles'])
    # Check collection_limits dictionary
    if 'threshold_percentile' not in collection_limits:
        collection_limits['threshold_percentile'] = 0.05
        print "\tNo value was specified for collection_limits['threshold_percentile'], xxx, so it will be set to 0.05."
    assert (0 <= collection_limits['threshold_percentile'] <= 1), "The value collection_limits['threshold_percentile'] must fall within [0,1]."
    if 'tf_type' not in collection_limits:
        collection_limits['tf_type'] = 'raw'
        print "\tNo value was specified for collection_limits['tf_type'], method of calculating the term frequency, so it will be set to 'raw'."
    assert ((collection_limits['tf_type'] == 'raw') or (collection_limits['tf_type'] == 'augmented') or (collection_limits['tf_type'] == 'boolean')), "The value collection_limits['tf_type'] is not recognized. Please enter 'raw', 'boolean' or 'augmented' as its value."

    # SET UP SECONDARY PARAMETERS
    # Create proxies dictionary
    proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()

    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = breadth_first_sampling.load_place_savers(save_dir['twitter_profiles'])
    print "\tAs of now {} user profiles have been collected and saved to {}".format(len(place_savers['finished_users']), save_dir['twitter_profiles'])
    print "\tThe current hop is {}".format(place_savers['cur_hop'])
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    print "\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop'])

    # Load growth parameters
    growth_params = breadth_first_sampling.load_growth_params(save_dir['twitter_profiles'])

    # API AUTHORIZATION
    print "\nAPI Authorization"
    OAUTH = pyTweet.get_authorization(twitter_keys)
    print "Start with key {}".format(OAUTH['KEY_FILE'])

    # CONFIGURE SCHEMA FOR TF-IDF ANALYSIS
    print "\nConfigure database for TF-IDF analysis"
    json_to_database.configure_database_to_build_network(cur, conn)
    # Load topics
    for t in topic_seed:
        if (t is None) or (t.strip() == ''):
            continue
        json_to_database.make_sql_edit(cur, conn, "INSERT INTO topics (topic, khop) VALUES ('{}', -1);".format(t.strip()))
    # Add columns for this sampling method
    new_columns = [{'table': 'users', 'col': 'has_timeline_filter', 'type': 'BOOLEAN'},     # Indicates if a user's timeline has already been filtered
                   {'table': 'users', 'col': 'timeline_document', 'type': 'TEXT[]'},        # Document created from relevant tweets
                   {'table': 'topics', 'col': 'document_frequency', 'type': 'FLOAT'},       # Document frequency
                   {'table': 'users', 'col': 'decision_candid_tfdf_score', 'type': 'FLOAT'}]
    for i in new_columns:
        try:
            json_to_database.make_sql_edit(cur, conn, "ALTER TABLE {} ADD {} {};".format(i['table'], i['col'], i['type']))
            print "Add column {} to table {}.".format(i['col'], i['table'])
        except psycopg2.ProgrammingError:
            conn.rollback()
    new_ind = [{'table': 'users', 'col': 'has_timeline_filter'}]

    # SAMPLING LOOP
    print "\nBegin collection"
    cur_hop = place_savers['cur_hop']
    for ii in range(cur_hop, hop_limits['max_hops']):
        print "\nWorking on collecting hop {} containing {} profiles.".format(ii, len(place_savers['cur_user_list']))
        # GET PROFILE INFORMATION
        if ii < 1:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='screen_name', hop=ii)
        else:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii)
        growth_params['h{}_users.json'.format(ii)] = set(place_savers['cur_user_list'])
        breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)

        # GET TIMELINES
        cur.execute("SELECT user_id FROM users WHERE (has_timeline IS NULL) AND (expand_user = TRUE OR expand_user IS NULL) AND (khop = {});".format(ii))
        uids = cur.fetchall()
        for j in uids:
            # Get timeline
            _get_timeline_wrapper(cur=cur, conn=conn, user_id=j[0], tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
        # Filter users by timeline
        _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii)
        # Create documents from timelines
        candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
        # Expand relevant seed users
        json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user = TRUE WHERE khop = 0 AND expand_user IS NULL;")

        # GET NEXT SET OF USERS
        if ii < (hop_limits['max_hops'] - 1):
            # USER MENTIONS
            print "\nCHOOSE NEXT SET OF USERS FROM USER MENTIONS"
            new_um = set([])
            cur.execute(cur.mogrify("SELECT DISTINCT tweets.user_mentions FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE (tweets.created_at >= %s AND tweets.created_at <= %s) AND users.expand_user = TRUE AND users.khop = %s AND (tweets.user_mentions IS NOT NULL OR tweets.user_mentions != '{}');", (tl_start_date, tl_end_date, ii)))
            uids = cur.fetchall()
            for t in uids:
                new_um = new_um.union(set(t[0]))
            # Get user mention profiles
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=new_um, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            # Get timelines
            for um in new_um:
                _get_timeline_wrapper(cur=cur, conn=conn, user_id=um, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
            # Filter users by timeline
            _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii+1)
            # Create documents from timelines
            candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
            # User mentions who have expand_user = NULL, will be set to TRUE
            for um in new_um:
                json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user = TRUE WHERE expand_user IS NULL AND user_id = {};".format(um))
            growth_params['h{}_user_mentions.json'.format(ii)] = set(new_um)
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            del new_um

            # FRIENDS AND FOLLOWERS
            print "\nCHOOSE NEXT SET OF USERS FROM FRIENDS AND FOLLOWERS"
            print "Collect friends"
            cur.execute("SELECT user_id FROM users WHERE expand_user = TRUE AND khop = {} AND friends_count > 0 AND friends_list IS NOT NULL;".format(ii))
            hasfriends = cur.fetchall()
            growth_params['h{}_friends.json'.format(ii)] = set([])
            for u in hasfriends:
                print "\nCollect friends for user {}.".format(u[0])
                friends_list = pyTweet.get_user_friends(user_id=u[0], proxies=proxies, auth=OAUTH, limit=100)
                json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET friends_list = %s WHERE user_id = %s;", (friends_list, u[0])))
                growth_params['h{}_friends.json'.format(ii)].update(set(friends_list))
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            print "Collect followers"
            cur.execute("SELECT user_id FROM users WHERE expand_user = TRUE AND khop = {} AND followers_count > 0 AND followers_list IS NOT NULL;".format(ii))
            hasfollowers = cur.fetchall()
            growth_params['h{}_followers.json'.format(ii)] = set([])
            for u in hasfollowers:
                print "\nCollect followers for user {}.".format(u[0])
                followers_list = pyTweet.get_user_followers(user_id=u[0], proxies=proxies, auth=OAUTH, limit=100)
                json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET followers_list = %s WHERE user_id = %s;", (followers_list, u[0])))
                growth_params['h{}_followers.json'.format(ii)].update(set(followers_list))
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            print "Get profiles and timelines of friends and followers"
            cur.execute("SELECT user_id, friends_list,followers_list FROM users WHERE expand_user = TRUE AND khop = {} AND (ARRAY_LENGTH(friends_list, 1) > 0 OR ARRAY_LENGTH(followers_list, 1) > 0);".format(ii))
            flist = cur.fetchall()
            ids = set([])
            for f in flist:
                if f is not None:
                    if f[1] is not None:
                        ids.update(f[1])
                    if f[2] is not None:
                        ids.update(f[2])
            ids = list(ids)
            # Get profiles of friends/followers
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=ids, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            for i in range(len(ids)):
                print "\nGet timeline for friend/follower {}: {} out of {}".format(ids[i], i, len(ids))
                _get_timeline_wrapper(cur=cur, conn=conn, user_id=ids[i], tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
            # Filter profiles by timeline
            _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii+1)
            # Create documents from timelines
            candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
            # Compute CANDID information score, and discriminate users
            for f in flist:
                if (f is not None) and (f[0] is not None):
                    candid_tfidf.compute_candid_score(cur=cur, conn=conn, parent_id=f[0], tl_start_date=tl_start_date, tl_end_date=tl_end_date, threshold_percentile=collection_limits['threshold_percentile'], tf_type=collection_limits['tf_type'])

        # PREPARE FOR NEXT HOP
        place_savers['cur_hop'] = ii + 1
        place_savers['cur_user_list'] = set([])
        cur.execute("SELECT user_id FROM users WHERE khop = {} AND expand_user = TRUE;".format(ii + 1))
        new_profiles = cur.fetchall()
        for np in new_profiles:
            place_savers['cur_user_list'].add(np[0])
        breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
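Similarly, a hedged sketch of how the depth-first causal variant above might be invoked. The database credentials, seeds, dates, and thresholds are placeholders; only the parameters documented in the docstring and checked by the asserts are used.

import datetime

if __name__ == '__main__':
    # Illustrative call only: credentials, seeds, and dates are made-up placeholders.
    depth_first_causal_search(
        user_seed=['nasa'],                        # hypothetical seed screen names
        topic_seed=['mars', 'rover'],              # hypothetical seed topics
        tl_start_date=datetime.date(2014, 1, 1),
        tl_end_date=datetime.date(2014, 6, 1),
        postgres_params={'dbname': 'twitter', 'user': 'postgres',
                         'password': 'secret', 'host': 'localhost', 'port': 5432},
        host='127.0.0.1',                          # proxy host IP
        port=8080,                                 # proxy port
        save_dir={'twitter_profiles': '/data/profiles'},
        hop_limits={'max_hops': 2},
        collection_limits={'threshold_percentile': 0.05, 'tf_type': 'raw'})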