def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir={}, hop_out_limits={}, collection_limits={}): """ This function creates a network based on Twitter friends @param user_seed - List of user names @param host - @param port - @param timeline_start_date - Beginning of date (datetime.date object) of timelines in collection @param save_dir - Set locations for the profile and timeline directory to save .JSONs. The default will be your current working directory. EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons', 'twitter_timelines': '/dir/to/save/timeline/jsons'} @param hop_limits - Specify your graph constrains with the variable hop_out_limits. First determine the maximum number of hops to make the graph with 'max_hops', then decide the maximum amount of data to collect in 'max_data'. This will be the combined profile and timeline .JSON files. Set it to 'None' if you don't want to limit the amount of data collected. Next, set limits (per individual) on how many friends, followers, replied to users, and mentioned users to include in the next hop. You can specify values [0, Inf) or None. Specifying 'None' implies that you do not wish to limit the collection, and will expand the graph on as many as these edges as possible. Occasionlly, you may get back fewer edges for a user than the limit you set. Note that friends and followers will be saved in the fields 'friends_list' and 'followers_list' automatically. The reply and mention users are saved in timelines. EX. hop_out_limits = {'max_hops': 2, # Maximin number of hops in graph 'max_data': None, # Maximum amount of data (in GB) 'friends': 0, # Maximum friends per user to include in next hop 'followers': None, # Maximum followers per user to include in next hop 'in_reply_to_user_id': 17, # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop 'user_mention_id': 21} # Maximum 'user_mention_id' per user's timeline to include in next hop @param collection_limits - Suppose that you want to store friends or followers, but do not want to expand the graph based on them. Specify limitations on collecting friends and followers below. Notice that reply and mention users are saved in the timelines. The largest possible length of 'friends_list' will be the greater of hops out limit and collection limit, or MAX(hops_out_limit['friends'], collection_limits['friends']). The same description follows for 'followers_list'. EX. 
collection_limits = {'friends': 0, # Maximum number of friends per user to save within the profile .JSON 'followers': None} # Maximum number of followers per user to save within the profile .JSON """ # CHECK PARAMETERS # Check save_dir dictionary fields, create directories if they do not already exist if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''): save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles') print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles']) if not os.path.isdir(save_dir['twitter_profiles']): print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles']) os.mkdir(save_dir['twitter_profiles']) if ('twitter_timelines' not in save_dir.keys()) or (save_dir['twitter_timelines'].strip() == ''): save_dir['twitter_timelines'] = os.path.join(os.getcwd(), 'timelines') print "\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format(save_dir['twitter_timelines']) if not os.path.isdir(save_dir['twitter_timelines']): print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_timelines']) os.mkdir(save_dir['twitter_timelines']) # Check data amount and quit if graph has reached limit if ('max_data' in hop_out_limits) and (hop_out_limits['max_data'] is not None): data_vol = measure_data(user_dir=save_dir['twitter_profiles'], timeline_dir=save_dir['twitter_timelines']) if (data_vol > hop_out_limits['max_data']): print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(data_vol, hop_out_limits['max_data']) return # Check hop_out_limits dictionary if 'max_hops' not in hop_out_limits: hop_out_limits['max_hops'] = 6 print "\tNo value was specified for hop_out_limits['max_hops'], the maximin number of hops in graph, so it will be set to {}.".format(hop_out_limits['max_hops']) if 'max_data' not in hop_out_limits: hop_out_limits['max_data'] = 2 print "\tNo value was specified for hop_out_limits['max_data'], the maximin amount of data collected (in GB), so it will be set to {}.".format(hop_out_limits['max_data']) if 'friends' not in hop_out_limits: hop_out_limits['friends'] = 0 print "\tNo value was specified for hop_out_limits['friends'], max friends per user to include in next hop, so it will be set to 0." if 'followers' not in hop_out_limits: hop_out_limits['followers'] = 0 print "\tNo value was specified for hop_out_limits['followers'], max followers per user to include in next hop, so it will be set to 0." if 'in_reply_to_user_id' not in hop_out_limits: hop_out_limits['in_reply_to_user_id'] = 0 print "\tNo value was specified for hop_out_limits['in_reply_to_user_id'], max 'in_reply_to_user_id' per user's timeline to include in next hop, so it will be set to 0." if 'user_mention_id' not in hop_out_limits: hop_out_limits['user_mention_id'] = 0 print "\tNo value was specified for hop_out_limits['user_mention_id'], max 'user_mention_id' per user's timeline to include in next hop, so it will be set to 0." # Check collection_limits dictionary if 'friends' not in collection_limits: collection_limits['friends'] = 0 print "\tNo value was specified for collection_limits['friends'], max number of friends per user to save with the profile .JSON, so it will be set to 0." 
if 'followers' not in collection_limits: collection_limits['followers'] = 0 print "\tNo value was specified for collection_limits['followers'], max number of followers per user to save with the profile .JSON, so it will be set to 0." # DETERMINE COLLECTION PARAMETERS # Load place_savers dictionary print "\nGetting information of current hop and finished users..." place_savers = load_place_savers(save_dir['twitter_profiles']) print "\tAs of now {} user profiles have been collected and saved to {}".format(len(place_savers['finished_users']), save_dir['twitter_profiles']) print "\tThe current hop is {}".format(place_savers['cur_hop']) if place_savers['cur_hop'] < 1: place_savers['cur_user_list'] = set(user_seed) print "\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop']) print "\tSo far we plan to collect {} users in hop {}".format(len(place_savers['next_user_list']), place_savers['cur_hop'] + 1) # Determine limits for friends/followers collection - if None in [hop_out_limits['friends'], collection_limits['friends']]: MAX_FRIENDS = None else: MAX_FRIENDS = max(hop_out_limits['friends'], collection_limits['friends']) if None in [hop_out_limits['followers'], collection_limits['followers']]: MAX_FOLLOWERS = None else: MAX_FOLLOWERS = max(hop_out_limits['followers'], collection_limits['followers']) # Create proxies dictionary proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)} # Load twitter keys twitter_keys = pyTweet.load_twitter_api_key_set() # API AUTHORIZATION print "\nAPI Authorization" auth = pyTweet.get_authorization(twitter_keys) # BUILD THE GRAPH print "\nStart building the graph!" for i in range(place_savers['cur_hop'], hop_out_limits['max_hops']): print "\nGet information for the {}th-hop users. There are {} total users in this hop.".format(i, len(place_savers['cur_user_list'])) print "Create the user list of the " + str(i+1) + "th-hop users." 
# Remove finished_users from next_user_list if (place_savers['cur_hop'] > 0): place_savers['cur_user_list'].difference_update(set(map(int, place_savers['finished_users'].keys()))) # Separate list for faster results, and delete place_savers['cur_user_list'] to free space USERS = [list(place_savers['cur_user_list'])[z:z+100] for z in range(0, len(place_savers['cur_user_list']), 100)] del place_savers['cur_user_list'] # save space for j in range(len(USERS)): # Look up information of users, 100 at a time print "\tLook up user information" if i < 1: # The initial list contain user names or @handles user_info = pyTweet.user_lookup_usernames(user_list=USERS[j], proxies=proxies, auth=auth) USERS[j] = set([]) for jj in range(len(user_info)): USERS[j].add(int(str(user_info[jj]['id']))) else: # All other lists will contain user ids user_info = pyTweet.user_lookup_userids(user_list=USERS[j], proxies=proxies, auth=auth) # Get friends, followers, and timelines of each user in user_info for k in range(len(user_info)): id = str(user_info[k]['id']) # Check to see that the user's friend/follower list hasn't already been collected if id in place_savers['finished_users'].keys(): # Load previously saved user data pro_filename = os.path.join(save_dir['twitter_profiles'], 'userInfo_' + str(place_savers['finished_users'][id]) + '.json') if os.path.getsize(pro_filename) == 0: # File exists but it is empty user_data = user_info[k] user_data['khop'] = i user_data['DOC'] = datetime.datetime.utcnow() fast_save(filename=pro_filename, obj=user_data) else: try: # Open and read profile .json jfid = open(pro_filename) user_data = ujson.load(jfid) user_data['DOC'] = datetime.datetime.utcnow() jfid.close() except ValueError: # Fail at opening profile .json, resave it user_data = user_info[k] user_data['khop'] = i user_data['DOC'] = datetime.datetime.utcnow() fast_save(filename=pro_filename, obj=user_data) else: # The user's profile has not been collected...start now place_savers['finished_users'][id] = str(uuid.uuid4()) pro_filename = os.path.join(save_dir['twitter_profiles'], 'userInfo_{}.json'.format(str(place_savers['finished_users'][id]))) # Add user information: hop, DOC user_data = user_info[k] user_data['khop'] = i user_data['DOC'] = datetime.datetime.utcnow() fast_save(filename=pro_filename, obj=user_data) print "\tSaved user {} information in {}.".format(id, pro_filename) # Collect user friends if 'friends_list' not in user_data: friends_list = [] if (user_data['friends_count'] > 0) and ((MAX_FRIENDS is None) or (MAX_FRIENDS > 0)): print "\tCollect friends for user {}.".format(id) friends_list = pyTweet.get_user_friends(user_id=id, limit=MAX_FRIENDS, proxies=proxies, auth=auth) user_data['friends_list'] = friends_list fast_save(filename=pro_filename, obj=user_data) place_savers['next_user_list'].difference_update(set(user_data['friends_list'][0:hop_out_limits['friends']])) # Add friends to next_user_list save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers) # Collect user followers if 'followers_list' not in user_data: followers_list = [] if (user_data['followers_count'] > 0) and ((MAX_FOLLOWERS is None) or (MAX_FOLLOWERS > 0)): print "\tCollect followers for user {}.".format(id) followers_list = pyTweet.get_user_followers(user_id=id, limit=MAX_FOLLOWERS, proxies=proxies, auth=auth) user_data['followers_list'] = followers_list fast_save(filename=pro_filename, obj=user_data) place_savers['next_user_list'].difference_update(set(user_data['followers_list'][0:hop_out_limits['followers']])) 
# Add followers to next_user_list save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers) # Collect timeline for user beginning from start_date tl_filename = os.path.join(save_dir['twitter_timelines'], 'timeline_{}.json'.format(place_savers['finished_users'][id])) if os.path.isfile(tl_filename): print "\tThe timeline for user {} has already been collected.".format(id) # Load timeline file if os.path.getsize(tl_filename) == 0: continue # Skip empty time lines try: jfid = open(tl_filename) tldata = ujson.load(jfid) jfid.close() except (IOError, ValueError): # Fail at opening file, recollect time line print "\tCollect the timeline for user {}.".format(id) tldata = pyTweet.collect_user_timeline(USER=id, USER_type='user_id', start_date=timeline_start_date, proxies=proxies, auth=auth) for tl in range(len(tldata)): tldata[tl]['DOC'] = datetime.datetime.utcnow() fast_save(filename=tl_filename, obj=tldata) else: print "\tCollect the timeline for user {}.".format(id) tldata = pyTweet.collect_user_timeline(USER=id, USER_type='user_id', start_date=timeline_start_date, proxies=proxies, auth=auth) for tl in range(len(tldata)): tldata[tl]['DOC'] = datetime.datetime.utcnow() fast_save(filename=tl_filename, obj=tldata) # Pull out user mentions, if applicable if ('user_mention_id' in hop_out_limits) and ((hop_out_limits['user_mention_id'] > 0) or (hop_out_limits['user_mention_id'] is None)): print "\tAdd user mentions to the next hop" tl_mentions = pyTweet.pull_timeline_entitites(timeline=tldata, type='user_mention_id', limit=hop_out_limits['user_mention_id']) place_savers['next_user_list'].update(tl_mentions) save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers) # Pull out user replies, if applicable if ('in_reply_to_user_id' in hop_out_limits) and ((hop_out_limits['in_reply_to_user_id'] > 0) or (hop_out_limits['in_reply_to_user_id'] is None)): print "\tAdd replies to the next hop" tl_replies = pyTweet.pull_timeline_entitites(timeline=tldata, type='in_reply_to_user_id', limit=hop_out_limits['in_reply_to_user_id']) place_savers['next_user_list'].update(tl_replies) save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers) # Check data amount and quit if graph has reached limit if ('max_data' in hop_out_limits) and (hop_out_limits['max_data'] is not None): data_vol = measure_data(user_dir=save_dir['twitter_profiles'], timeline_dir=save_dir['twitter_timelines']) if (data_vol > hop_out_limits['max_data']): print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(data_vol, hop_out_limits['max_data']) return # Remove finished_users from place_savers['next_user_list'] place_savers['next_user_list'].difference_update(set(map(int, place_savers['finished_users'].keys()))) # Prepare for next iteration of hop place_savers['cur_user_list'] = place_savers['next_user_list'] place_savers['next_user_list'] = set([]) place_savers['cur_hop'] += 1 save_place_savers(user_dir=save_dir['twitter_profiles'], place_savers=place_savers) print "There are ", len(place_savers['cur_user_list']), " users in the next iteration of users." print "\nDone building graph!"
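# ---------------------------------------------------------------------------
# Hedged illustration (not part of pyTweet): the breadth_first_search variant
# above combines hop_out_limits and collection_limits into MAX_FRIENDS and
# MAX_FOLLOWERS, taking the larger of the two and treating None as
# "unlimited", so a single friends/followers call can serve both graph
# expansion and storage.  The sketch below restates that rule in isolation.
# ---------------------------------------------------------------------------
def _effective_limit_example(hop_out_limit, collection_limit):
    """Combine a hop-out limit and a collection limit into one collection cap.

    Returns None (collect everything) if either limit is None; otherwise the
    larger of the two, so the saved list is long enough for both purposes.
    """
    if (hop_out_limit is None) or (collection_limit is None):
        return None
    return max(hop_out_limit, collection_limit)

# Example: _effective_limit_example(0, 200)   -> 200
#          _effective_limit_example(None, 50) -> None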
def depth_first_cascade_search(user_seed, tl_start_date, tl_end_date, postgres_params, host, port, save_dir={}, hop_limits={}):
    """
    This function builds a network based on users relevant to seed keywords
    Requires that a PostgreSQL database already exist

    :param user_seed: List of user names
    :param tl_start_date: Beginning date (datetime.date object) of timelines in collection
    :param tl_end_date: End date (datetime.date object) of timelines in collection
    :param postgres_params: Dictionary containing the fields 'dbname', 'user', and 'password' (and optionally 'host'
                            and 'port') required to connect to the database
    :param host: Your host IP
    :param port: Your port
    :param save_dir: Directory storing sampling place savers and growth parameters
                     EX. save_dir = {'place_saver_filename': 'name of file'}
    :param hop_limits: Specify your graph constraints with the variable hop_limits. Set the maximum number of hops
                       to make a graph with 'max_hops'.
                       EX. hop_limits = {'max_hops': 2}  # Maximum number of hops in graph
    """
    # CHECK PARAMETERS
    print "\nCheck parameters"
    # Timeline start and end dates
    assert (isinstance(tl_start_date, datetime.date) and isinstance(tl_end_date, datetime.date)), \
        "Both tl_start_date and tl_end_date must be datetime.date objects (i.e. tl_start_date = datetime.date(year=2014, month=1, day=1))."
    assert ((tl_end_date - tl_start_date) > datetime.timedelta(0)), \
        "The end date must be later than the start date. Check the assignments of tl_start_date and tl_end_date."
    # Check PostgreSQL parameters
    assert (('dbname' in postgres_params.keys()) and ('user' in postgres_params.keys()) and ('password' in postgres_params.keys())), \
        "Verify the parameters. The possible fields are 'dbname', 'user', 'password', 'host', and 'port'."
    try:
        conn = psycopg2.connect(" ".join(map(lambda x, y: "{}='{}'".format(x, y), postgres_params.keys(), postgres_params.values())))
        cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    except psycopg2.OperationalError:
        print "OperationalError: Check your login credentials. Make sure the database exists as well."
        return
    # Check hop_limits dictionary
    if 'max_hops' not in hop_limits:
        hop_limits['max_hops'] = 5
        print "\tNo value was specified for hop_limits['max_hops'], the maximum number of hops in the graph, so it will be set to {}.".format(hop_limits['max_hops'])
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles'])
    if not os.path.isdir(save_dir['twitter_profiles']):
        print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles'])
        os.mkdir(save_dir['twitter_profiles'])
    # SET UP SECONDARY PARAMETERS
    # Create proxies dictionary
    proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
place_savers = breadth_first_sampling.load_place_savers(save_dir['twitter_profiles']) print "\tThe current hop is {}".format(place_savers['cur_hop']) if place_savers['cur_hop'] < 1: place_savers['cur_user_list'] = set(user_seed) print "\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop']) breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) # Load growth parameters growth_params = breadth_first_sampling.load_growth_params(save_dir['twitter_profiles']) # API AUTHORIZATION print "\nAPI Authorization" OAUTH = pyTweet.get_authorization(twitter_keys) # CONFIGURE SCHEMA FOR TF-IDF ANALYSIS print "\nConfigure database for TF-IDF analysis" json_to_database.configure_database_to_build_network(cur, conn) new_columns = [{'table': 'users', 'col': 'decision_tfidf', 'type': 'FLOAT'}] # used for i in new_columns: try: json_to_database.make_sql_edit(cur, conn, "ALTER TABLE {} ADD {} {};".format(i['table'], i['col'], i['type'])) except psycopg2.ProgrammingError: conn.rollback() # SAMPLING LOOP print "\nBegin collection" cur_hop = place_savers['cur_hop'] for ii in range(cur_hop, hop_limits['max_hops']): print "\nWorking on collecting hop {} containing {} profiles.".format(ii, len(place_savers['cur_user_list'])) if ii < 1: _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='screen_name', hop=ii) # Replace user names in place_savers['cur_user_list'] with user IDs! user_id_set = set([]) for jj in place_savers['cur_user_list']: cur.execute("SELECT user_id FROM users WHERE screen_name = '{}';".format(jj)) user_id_set.add(cur.fetchone()[0]) place_savers['cur_user_list'] = set(user_id_set) del user_id_set breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) else: _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii) # Do not expand users who have more than 1000 friends+followers json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);") # GET TIMELINES for jj in place_savers['cur_user_list']: _get_timeline_wrapper(cur=cur, conn=conn, user_id=jj, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH) # Add all of the hashtags from the seed of users if ii < 1: total_ht_h0 = 0 print "\nAdd all of the hashtags from the seed of users" json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE WHERE (khop=0) AND (expand_user IS NULL);") cur.execute("SELECT DISTINCT user_id FROM users WHERE (khop=0) AND (expand_user IS TRUE);") q = cur.fetchall() for qq in q: nAdd = _add_all_hashtags(cur=cur, conn=conn, user_id=qq[0], tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii) total_ht_h0 = total_ht_h0 + nAdd print "Added {} hashtags from hop {}.".format(total_ht_h0, ii) cur.execute("SELECT COUNT(*) FROM users WHERE (khop=0) AND (expand_user=TRUE);") print "Expand {} users from hop {}".format(cur.fetchone()[0], ii) # SAVE GRAPH PARAMS growth_params['h{}_users.json'.format(ii)] = set(place_savers['cur_user_list']) growth_params['h{}_missing.json'.format(ii)] = set([]) growth_params['h{}_extendTRUE.json'.format(ii)] = set([]) growth_params['h{}_extendFALSE.json'.format(ii)] = set([]) growth_params['h{}_extendNULL.json'.format(ii)] = set([]) for uu in place_savers['cur_user_list']: 
cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu)) q = cur.fetchone() if q is None: growth_params['h{}_missing.json'.format(ii)].add(uu) continue if q[0] is None: growth_params['h{}_extendNULL.json'.format(ii)].add(uu) elif q[0] is True: growth_params['h{}_extendTRUE.json'.format(ii)].add(uu) elif q[0] is False: growth_params['h{}_extendFALSE.json'.format(ii)].add(uu) else: print "ERROR in saving growth parameters! Invalid data type..." continue breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii) # GET NEXT SET OF USERS if ii < (hop_limits['max_hops'] - 1): print "\nCHOOSE NEXT SET OF USERS FROM USER MENTIONS" new_um = set([]) # Add user mentions to next hop for jj in place_savers['cur_user_list']: cur.execute(cur.mogrify("SELECT DISTINCT tweets.user_mentions FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE (users.user_id = %s) AND (tweets.created_at >= %s AND tweets.created_at <= %s) AND (users.expand_user IS TRUE) AND (tweets.user_mentions IS NOT NULL OR tweets.user_mentions != '{}');", (jj, tl_start_date, tl_end_date))) uids = cur.fetchall() for kk in uids: new_um.update(set(kk[0])) print "There are {} user mentions from hop {}".format(len(new_um), ii) # Get user mention profiles _get_profiles_wrapper(cur=cur, conn=conn, user_list=new_um, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1) # Expand, or not, user mentions json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);") # Expand remaining user mentions growth_params['h{}_um_missing.json'.format(ii)] = set([]) growth_params['h{}_um_extendTRUE.json'.format(ii)] = set([]) growth_params['h{}_um_extendFALSE.json'.format(ii)] = set([]) growth_params['h{}_um_extendNULL.json'.format(ii)] = set([]) new_um_tracker = set(new_um) for uu in new_um_tracker: cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu)) q = cur.fetchone() if q is None: new_um.remove(uu) growth_params['h{}_um_missing.json'.format(ii)].add(uu) continue if q[0] is False: new_um.remove(uu) growth_params['h{}_um_extendFALSE.json'.format(ii)].add(uu) else: json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=TRUE WHERE (user_id = {}) AND (expand_user IS NULL);".format(uu)) new_um_tracker = set(new_um) for uu in new_um_tracker: cur.execute("SELECT expand_user FROM users WHERE user_id = {};".format(uu)) q = cur.fetchone() if q is None: # print "this is strange" new_um.remove(uu) growth_params['h{}_um_missing.json'.format(ii)].add(uu) continue if q[0] is True: growth_params['h{}_um_extendTRUE.json'.format(ii)].add(uu) if q[0] is None: growth_params['h{}_um_extendNULL.json'.format(ii)].add(uu) new_um.remove(uu) print "This is not supposed to happen!!!" del new_um_tracker assert (len(growth_params['h{}_um_extendNULL.json'.format(ii)]) < 1), "There are user mentions assigned expand_user=NULL!" 
place_savers['next_user_list'].update(new_um) breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii) print "\nCHOOSE NEXT SET OF USERS FROM FRIENDS AND FOLLOWERS" print "Collect friends" for jj in place_savers['cur_user_list']: cur.execute("SELECT expand_user FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (friends_count > 0) AND (friends_list IS NULL);".format(jj)) q = cur.fetchone() if q is None: continue if q[0] is True: print "\tCollect friends for user {}.".format(jj) friends_list = pyTweet.get_user_friends(user_id=jj, proxies=proxies, auth=OAUTH, limit=1000) json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET friends_list = %s WHERE user_id = %s;", (friends_list, jj))) print "Collect followers" for jj in place_savers['cur_user_list']: cur.execute("SELECT expand_user FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (followers_count > 0) AND (followers_list IS NULL);".format(jj)) q = cur.fetchone() if q is None: continue if q[0] is True: print "\tCollect followers for user {}.".format(jj) followers_list = pyTweet.get_user_followers(user_id=jj, proxies=proxies, auth=OAUTH, limit=1000) json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET followers_list = %s WHERE user_id = %s;", (followers_list, jj))) print "Get profiles and timelines of friends and followers" fids = set([]) for jj in place_savers['cur_user_list']: cur.execute("SELECT friends_list,followers_list FROM users WHERE (user_id = {}) AND (expand_user IS TRUE) AND (((friends_list IS NOT NULL) AND (ARRAY_LENGTH(friends_list,1) > 0)) OR ((followers_list IS NOT NULL) AND (ARRAY_LENGTH(followers_list,1) > 0)));".format(jj)) flist = cur.fetchone() if flist is None: continue if flist[0] is not None: fids.update(flist[0]) if flist[1] is not None: fids.update(flist[1]) print "There are {} friends/followers of hop {}".format(len(fids), ii) # Get profiles of friends/followers _get_profiles_wrapper(cur=cur, conn=conn, user_list=fids, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1) # Filter with high degree rule and get timelines json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user=FALSE WHERE (friends_count+followers_count > 1000) AND (expand_user IS NULL);") # Remove expand_user=FALSE from friend/follower list growth_params['h{}_frfo_missing.json'.format(ii)] = set([]) growth_params['h{}_frfo_extendFALSE.json'.format(ii)] = set([]) jj_list = list(fids) for jj in jj_list: cur.execute("SELECT expand_user,has_timeline FROM users WHERE user_id = {};".format(jj)) q = cur.fetchone() if q is None: growth_params['h{}_frfo_missing.json'.format(ii)].add(jj) fids.remove(jj) continue if q[0] is False: fids.remove(jj) growth_params['h{}_frfo_extendFALSE.json'.format(ii)].add(jj) continue if (q[0] is not False) and (q[1] is None): _get_timeline_wrapper(cur=cur, conn=conn, user_id=jj, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH) del jj_list # Find the most similar friends/followers, and expand the top 5% original_frfo_set = candid_tfidf.find_most_similar_followers(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, user_ids=fids, prev_users=place_savers['cur_user_list']) growth_params['h{}_frfo_extendTRUE.json'.format(ii)] = set(fids) growth_params['h{}_frfo_extendNULL.json'.format(ii)] = set(original_frfo_set.difference(fids)) del original_frfo_set 
breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii) place_savers['next_user_list'].update(fids) breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) if ii > 0: print "\nFIND USERS WITH AT LEAST ONE HASHTAG IN COMMON WITH TOPICS" new_relevant_users = _find_relevant_users(cur=cur, conn=conn, user_ids=growth_params['h{}_frfo_extendNULL.json'.format(ii)]) growth_params['h{}_relevant_extendTRUE.json'.format(ii)] = set(new_relevant_users) breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii) place_savers['next_user_list'].update(new_relevant_users) breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) # PREPARE FOR NEXT HOP place_savers['cur_hop'] += 1 place_savers['cur_user_list'] = set(place_savers['next_user_list']) place_savers['next_user_list'] = set([]) breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
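# ---------------------------------------------------------------------------
# Hedged illustration (not part of pyTweet): depth_first_cascade_search above
# expects postgres_params to hold libpq connection fields and joins them into
# a DSN string ("dbname='...' user='...' ...") before calling
# psycopg2.connect.  The sketch below shows an equivalent connection made with
# keyword arguments instead of a hand-built string; the credential values are
# placeholders, not defaults of this module.
# ---------------------------------------------------------------------------
def _example_postgres_connection():
    import psycopg2
    import psycopg2.extras
    postgres_params = {'dbname': 'twitter_sample',   # hypothetical example values
                       'user': 'collector',
                       'password': 'secret',
                       'host': 'localhost',
                       'port': 5432}
    conn = psycopg2.connect(**postgres_params)
    cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    return conn, cur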
def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir={}, hop_limits={}): """ This function creates a network based on Twitter friends :param user_seed: List of user names :param host: Your host IP :param port: Your port :param timeline_start_date: Beginning of date (datetime.date object) of timelines in collection :param save_dir: Set locations for the profile and timeline directory to save .JSONs. The default will be your current working directory. EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons', 'twitter_timelines': '/dir/to/save/timeline/jsons'} :param hop_limits: Specify your graph constrains with the variable hop_limits. First determine the maximum number of hops to make the graph with 'max_hops', then decide the maximum amount of data to collect in 'max_data'. This will be the combined profile and timeline .JSON files. Set it to 'None' if you don't want to limit the amount of data collected. Next, set limits (per individual) on how many friends, followers, replied to users, and mentioned users to include in the next hop. You can specify values [0, Inf) or None. Specifying 'None' implies that you do not wish to limit the collection, and will expand the graph on as many as these edges as possible. Occasionlly, you may get back fewer edges for a user than the limit you set. Note that friends and followers will be saved in the fields 'friends_list' and 'followers_list' automatically. The reply and mention users are saved in timelines. EX.hop_limits = {'max_hops': 2, # Maximin number of hops in graph 'max_data': None, # Maximum amount of data (in GB) 'friends': 0, # Maximum friends per user to include in next hop 'followers': None, # Maximum followers per user to include in next hop 'in_reply_to_user_id': 17, # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop 'user_mention_id': 21} # Maximum 'user_mention_id' per user's timeline to include in next hop """ # CHECK PARAMETERS # Check save_dir dictionary fields, create directories if they do not already exist if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''): save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles') print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format( save_dir['twitter_profiles']) if not os.path.isdir(save_dir['twitter_profiles']): print "\tThe directory {} does not exist...creating it now".format( save_dir['twitter_profiles']) os.mkdir(save_dir['twitter_profiles']) if ('twitter_timelines' not in save_dir.keys()) or (save_dir['twitter_timelines'].strip() == ''): save_dir['twitter_timelines'] = os.path.join(os.getcwd(), 'timelines') print "\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format( save_dir['twitter_timelines']) if not os.path.isdir(save_dir['twitter_timelines']): print "\tThe directory {} does not exist...creating it now".format( save_dir['twitter_timelines']) os.mkdir(save_dir['twitter_timelines']) # Checkhop_limits dictionary hop_limits_defaults = { 'max_hops': [6, 'the maximin number of hops in graph'], 'max_data': [2, 'the maximin amount of data collected (in GB)'], 'friends': [0, 'max friends per user to include in next hop'], 'followers': [0, 'max followers per user to include in next hop'], 'in_reply_to_user_id': [ 0, "max 'in_reply_to_user_id' per user's timeline to include in next hop" ], 'user_mention_id': [0, "max 'user_mention_id' per user's timeline to include in next ho"] } for kk in 
hop_limits_defaults.keys(): if kk not in hop_limits: hop_limits[kk] = hop_limits_defaults[kk][0] print "\tNo Value was specified for hop_limits['{}'], {}, so it will be set to {}.".format( kk, hop_limits_defaults[kk][1], hop_limits_defaults[kk][0]) # Check data amount and quit if graph has reached limit if ('max_data' in hop_limits) and (hop_limits['max_data'] is not None): data_vol = measure_data(user_dir=save_dir['twitter_profiles'], timeline_dir=save_dir['twitter_timelines']) if (data_vol > hop_limits['max_data']): print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format( data_vol, hop_limits['max_data']) return # DETERMINE COLLECTION PARAMETERS # Load place_savers dictionary print "\nGetting information of current hop and finished users..." place_savers = load_place_savers(save_dir['twitter_profiles']) print "\tAs of now {} user profiles have been collected and saved to {}".format( len(place_savers['finished_users']), save_dir['twitter_profiles']) print "\tThe current hop is {}".format(place_savers['cur_hop']) if place_savers['cur_hop'] < 1: place_savers['cur_user_list'] = set(user_seed) save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) print "\tWe will collect {} users in hop {}".format( len(place_savers['cur_user_list']), place_savers['cur_hop']) # Load growth parametes growth_params = load_growth_params(save_dir['twitter_profiles']) # Create proxies dictionary proxies = { 'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port) } # Load twitter keys twitter_keys = pyTweet.load_twitter_api_key_set() # API authorization auth = pyTweet.get_authorization(twitter_keys) # BUILD THE GRAPH print "\nStart building the graph!" for khop in range(place_savers['cur_hop'], hop_limits['max_hops']): print "\nGet information for the {}th-hop users. There are {} total users in this hop.".format( khop, len(place_savers['cur_user_list'])) print "Create the user list of the {}th-hop users as well.".format( khop + 1) # Get profile information of users in cur_user_list print "\nCOLLECT PROFILE INFORMATION FOR THE CURRENT SET OF USERS" if khop < 1: # Find profiles to collect profiles_to_collect = set(place_savers['cur_user_list']) for json_filename in place_savers['finished_users'].values(): data = ujson.load( open( os.path.join(save_dir['twitter_profiles'], json_filename), 'r')) if data['screen_name'] in place_savers['cur_user_list']: profiles_to_collect.discard(data['screen_name']) # Collect and save profiles user_info = [] if len(profiles_to_collect) > 0: print "\nstart collecting profiles: {} profiles".format( len(profiles_to_collect)) user_info = pyTweet.user_lookup_usernames( user_list=list(profiles_to_collect), proxies=proxies, auth=auth) if isinstance(user_info, dict) and ('errors' in user_info.keys()): print "\nThe initial seed cannot be collected..." 
print "Twitter error message: ", user_info # Save profile information # print "user_info: ", user_info # print type(user_info) for udata in user_info: # print "udata: ", udata # print type(udata) json_filename = _save_profile_json( profile_struct=udata, save_dir=save_dir['twitter_profiles'], khop=khop) place_savers['finished_users'][udata['id']] = json_filename # Convert screen names to user IDs in cur_user_list, identify unavailable accounts as well all_screennames = { } # Keys are screen names and values are file name jsons = filter(lambda k: re.match('userInfo_*', k), os.listdir(save_dir['twitter_profiles'])) for jj in jsons: try: full_filename = os.path.join(save_dir['twitter_profiles'], jj) if os.path.getsize(full_filename) != 0: jfid = open(full_filename) profile = ujson.load(jfid) jfid.close() all_screennames[profile['screen_name']] = jj except ValueError: continue # Get corresponding user IDs for each screen name in cur_user_list cur_user_list_ids = set([]) for scn_name in profiles_to_collect.union( place_savers['cur_user_list']): if scn_name in all_screennames.keys(): jfid = open( os.path.join(save_dir['twitter_profiles'], all_screennames[scn_name])) profile = ujson.load(jfid) jfid.close() if 'id' in profile: cur_user_list_ids.add(int(profile['id'])) else: place_savers['unavailable_accounts'].add(scn_name) else: place_savers['unavailable_accounts'].add(scn_name) print cur_user_list_ids del profiles_to_collect place_savers['cur_user_list'] = set(cur_user_list_ids) else: # Collect and save profiles profiles_to_collect = set( place_savers['cur_user_list']).difference( set(map(int, place_savers['finished_users'].keys()))) user_info = pyTweet.user_lookup_userids( user_list=list(profiles_to_collect), proxies=proxies, auth=auth) for udata in user_info: json_filename = _save_profile_json( profile_struct=udata, save_dir=save_dir['twitter_profiles'], khop=khop) place_savers['finished_users'][udata['id']] = json_filename # Update current user list, and identify unavailable accounts new_cur_user_list = set([]) for uid in profiles_to_collect.union( set(place_savers['cur_user_list'])): if uid in place_savers['unavailable_accounts']: continue if uid in place_savers['finished_users'].keys(): new_cur_user_list.add(uid) else: place_savers['unavailable_accounts'].add(uid) place_savers['cur_user_list'] = set(new_cur_user_list) del new_cur_user_list # Save place saving variables growth_params['h{}_users.json'.format(khop)] = set( place_savers['cur_user_list']) save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=khop) save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) # Get timelines for each user in user_info print "\nCOLLECT TIME LINES FOR CURRENT SET OF USERS" for uid in place_savers['cur_user_list']: if uid in place_savers['finished_users'].keys(): profile_filename = place_savers['finished_users'][uid] uuid_profile = os.path.basename(profile_filename)[9:-5] timeline_filename = os.path.join( save_dir['twitter_timelines'], "timeline_{}.json".format(uuid_profile)) try: tldata = ujson.load(open(profile_filename, 'r')) tldata['id'] except (IOError, KeyError): # Collect user data user_info = pyTweet.user_lookup_userids(user_list=[uid], proxies=proxies, auth=auth) if (user_info is not dict) or ('id' not in user_info.keys()): continue json_filename = _save_profile_json( profile_struct=user_info[0], save_dir=save_dir['twitter_profiles'], khop=khop) place_savers['finished_users'][uid] = json_filename if ('has_timeline' in 
tldata.keys()) and (tldata['has_timeline'] is True): continue if not os.path.isfile(timeline_filename): print "Collect the timeline for user {}.".format(uid) tldata = pyTweet.collect_user_timeline( USER=uid, USER_type='user_id', start_date=timeline_start_date, proxies=proxies, auth=auth) for tl in range(len(tldata)): tldata[tl]['DOC'] = datetime.datetime.utcnow( ).strftime("%m-%d-%Y %H:%M:%S %z") tldata[tl]['has_timeline'] = True fast_save(filename=profile_filename, obj=tldata) print "\nGet friends of each user in cur_user_list" if hop_limits['friends'] != 0: growth_params["h{}_friends.json".format(khop)] = set([]) print "\nCOLLECT FRIENDS OF CURRENT USER SET" # print "place_savers['cur_user_list']: ", place_savers['cur_user_list'] for jj in place_savers['cur_user_list']: profile_filename = os.path.join( save_dir['twitter_profiles'], place_savers['finished_users'][jj]) try: data = ujson.load(open(profile_filename, 'r')) # print data['id'] except (IOError, KeyError, TypeError): user_info = pyTweet.user_lookup_userids(user_list=[uid], proxies=proxies, auth=auth) if (user_info is not dict) or ('id' not in user_info.keys()): continue _save_profile_json(profile_struct=user_info[0], save_dir=save_dir['twitter_profiles'], khop=khop) json_filename = _save_profile_json( profile_struct=user_info[0], save_dir=save_dir['twitter_profiles'], khop=khop) place_savers['finished_users'][uid] = json_filename if data['friends_count'] < 1: data['friends_list'] = [] fast_save(filename=profile_filename, obj=data) continue if 'friends_list' not in data.keys(): print "Collect friends for user {}".format(jj) friends_list = pyTweet.get_user_friends( user_id=jj, limit=hop_limits['friends'], proxies=proxies, auth=auth) data['friends_list'] = friends_list fast_save(filename=profile_filename, obj=data) if hop_limits['friends'] < len(data['friends_list']): place_savers['next_user_list'].update( set(data['friends_list'][0:len(hop_limits['friends']) - 1])) else: place_savers['next_user_list'].update( set(data['friends_list'])) growth_params["h{}_friends.json".format(khop)].update( set(data['friends_list'])) save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=khop) print "\nGet followers of each user in the cur_user_list" if hop_limits['followers'] != 0: growth_params["h{}_followers.json".format(khop)] = set([]) print "\nCOLLECT FOLLOWERS OF CURRENT USER SET" for jj in place_savers['cur_user_list']: # profile_filename = place_savers['finished_users'][jj] profile_filename = os.path.join( save_dir['twitter_profiles'], place_savers['finished_users'][jj]) try: data = ujson.load(open(profile_filename, 'r')) data['id'] except (IOError, KeyError): user_info = pyTweet.user_lookup_userids(user_list=[uid], proxies=proxies, auth=auth) if (user_info is not dict) or ('id' not in user_info.keys()): continue _save_profile_json(profile_struct=user_info[0], save_dir=save_dir['twitter_profiles'], khop=khop) if data['followers_count'] < 1: data['followers_list'] = [] fast_save(filename=profile_filename, obj=data) continue if 'followers_list' not in data.keys(): print "Collect followers for user {}".format(jj) friends_list = pyTweet.get_user_friends( user_id=jj, limit=hop_limits['followers'], proxies=proxies, auth=auth) data['followers_list'] = friends_list fast_save(filename=profile_filename, obj=data) if hop_limits['followers'] < len(data['followers_list']): place_savers['next_user_list'].update( 
                growth_params["h{}_followers.json".format(khop)].update(set(data['followers_list']))
            save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=khop)

        # Pull out user mentions
        if ('user_mention_id' in hop_limits) and (hop_limits['user_mention_id'] != 0):
            print "\nCOLLECT USER MENTIONS OF CURRENT SET"
            growth_params["h{}_user_mentions.json".format(khop)] = set([])
            for jj in place_savers['cur_user_list']:
                profile_filename = place_savers['finished_users'][jj]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(save_dir['twitter_timelines'], "timeline_{}.json".format(uuid_profile))
                # Load or create the timeline JSON file
                if os.path.isfile(timeline_filename):
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    # Load the timeline data
                    try:
                        tldata = ujson.load(open(timeline_filename, 'r'))
                        if len(tldata) < 1:
                            continue
                        tldata[0]['text']
                    except (IOError, KeyError):
                        # Fix the timeline file
                        _save_timeline_json(user_id=jj, filename=timeline_filename, start_date=timeline_start_date, proxies=proxies, auth=auth)
                        tldata = ujson.load(open(timeline_filename, 'r'))
                else:
                    # Get the timeline data
                    _save_timeline_json(user_id=jj, filename=timeline_filename, start_date=timeline_start_date, proxies=proxies, auth=auth)
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    tldata = ujson.load(open(timeline_filename, 'r'))
                    if len(tldata) < 1:
                        continue
                # Pull out user mentions from the timeline
                tl_mentions = pyTweet.pull_timeline_entitites(timeline=tldata, type='user_mention_id', limit=hop_limits['user_mention_id'])
                growth_params["h{}_user_mentions.json".format(khop)].update(tl_mentions)
                place_savers['next_user_list'].update(tl_mentions)
            save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
            save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=khop)

        # Pull out replies
        if ('in_reply_to_user_id' in hop_limits) and (hop_limits['in_reply_to_user_id'] != 0):
            print "\nCOLLECT USERS CURRENT SET REPLIES TO"
            growth_params["h{}_replies.json".format(khop)] = set([])
            for jj in place_savers['cur_user_list']:
                profile_filename = place_savers['finished_users'][jj]
                uuid_profile = os.path.basename(profile_filename)[9:-5]
                timeline_filename = os.path.join(save_dir['twitter_timelines'], "timeline_{}.json".format(uuid_profile))
                # Load or create the timeline JSON file
                if os.path.isfile(timeline_filename):
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    # Load the timeline data
                    try:
                        tldata = ujson.load(open(timeline_filename, 'r'))
                        if len(tldata) < 1:
                            continue
                        tldata[0]['text']
                    except (IOError, KeyError):
                        # Fix the timeline file
                        _save_timeline_json(user_id=jj, filename=timeline_filename, start_date=timeline_start_date, proxies=proxies, auth=auth)
                        tldata = ujson.load(open(timeline_filename, 'r'))
                        if len(tldata) < 1:
                            continue
                else:
                    # Get the timeline data
                    _save_timeline_json(user_id=jj, filename=timeline_filename, start_date=timeline_start_date, proxies=proxies, auth=auth)
                    if os.path.getsize(timeline_filename) == 0:
                        continue
                    tldata = ujson.load(open(timeline_filename, 'r'))
                    if len(tldata) < 1:
                        continue
                # Pull out replied-to users from the timeline
                tl_replies = pyTweet.pull_timeline_entitites(timeline=tldata, type='in_reply_to_user_id', limit=hop_limits['in_reply_to_user_id'])
                place_savers['next_user_list'].update(tl_replies)
growth_params["h{}_replies.json".format(khop)].update( tl_replies) save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers) save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=khop) # Check data limit if ('max_data' in hop_limits) and (hop_limits['max_data'] is not None): data_vol = measure_data(user_dir=save_dir['twitter_profiles'], timeline_dir=save_dir['twitter_timelines']) if (data_vol > hop_limits['max_data']): print "The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format( data_vol, hop_limits['max_data']) return # Prepare for next iteration place_savers['cur_hop'] = khop + 1 place_savers['cur_user_list'] = set(place_savers['next_user_list']) place_savers['next_user_list'] = set([]) save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
def depth_first_causal_search(user_seed, topic_seed, tl_start_date, tl_end_date, postgres_params, host, port,
                              save_dir={}, hop_limits={}, collection_limits={}):
    """
    This function builds a network based on users relevant to a set of seed keywords. It requires that a PostgreSQL
    database already exist.

    :param user_seed: List of user names
    :param topic_seed: List of seed topics
    :param tl_start_date: Start date (datetime.date object) of timelines in collection
    :param tl_end_date: End date (datetime.date object) of timelines in collection
    :param postgres_params: Dictionary of parameters required to connect to the database. The fields 'dbname', 'user'
                            and 'password' are required; 'host' and 'port' are optional.
    :param host: Your host IP
    :param port: Your port
    :param save_dir: Directory to save sampling and growth parameters
    :param hop_limits: Specify your graph constraints with the variable hop_limits. Set the maximum number of hops to
                       make a graph with 'max_hops'.
                       EX. hop_limits = {'max_hops': 2}    # Maximum number of hops in graph
    :param collection_limits: Specify the term-frequency calculation type and the threshold percentile used to score
                              users for relevance.
                              EX. collection_limits = {'threshold_percentile': 0.05,    # Threshold percentile for the relevance score
                                                       'tf_type': 'raw'}                # TF calculation type
    """
    # CHECK PARAMETERS
    print "\nCheck parameters"
    # Timeline start and end dates
    assert (isinstance(tl_start_date, datetime.date) and isinstance(tl_end_date, datetime.date)), \
        "Both tl_start_date and tl_end_date must be datetime.date objects (i.e. tl_start_date = datetime.date(year=2014, month=1, day=1))."
    assert ((tl_end_date - tl_start_date) > datetime.timedelta(0)), \
        "The end date must be later than the start date. Check the assignments of tl_start_date and tl_end_date."
    # Check PostgreSQL parameters
    assert (('dbname' in postgres_params.keys()) and ('user' in postgres_params.keys()) and ('password' in postgres_params.keys())), \
        "Verify the PostgreSQL parameters. The possible fields are 'dbname', 'user', 'password', 'host', and 'port'."
    try:
        conn = psycopg2.connect(" ".join(map(lambda x, y: "{}='{}'".format(x, y), postgres_params.keys(), postgres_params.values())))
        cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    except psycopg2.OperationalError:
        print "OperationalError: Check your login credentials. Make sure the database exists as well."
        return
    # Check hop_limits dictionary
    if 'max_hops' not in hop_limits:
        hop_limits['max_hops'] = 5
        print "\tNo value was specified for hop_limits['max_hops'], the maximum number of hops in the graph, so it will be set to {}.".format(hop_limits['max_hops'])
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir.keys()) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print "\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles'])
    if not os.path.isdir(save_dir['twitter_profiles']):
        print "\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles'])
        os.mkdir(save_dir['twitter_profiles'])
    # Check collection_limits dictionary
    if 'threshold_percentile' not in collection_limits:
        collection_limits['threshold_percentile'] = 0.05
        print "\tNo value was specified for collection_limits['threshold_percentile'], the threshold percentile for the relevance score, so it will be set to 0.05."
    assert (0 <= collection_limits['threshold_percentile'] <= 1), \
        "The value collection_limits['threshold_percentile'] must fall within [0, 1]."
    if 'tf_type' not in collection_limits:
        collection_limits['tf_type'] = 'raw'
        print "\tNo value was specified for collection_limits['tf_type'], the method of calculating the term frequency, so it will be set to 'raw'."
    assert (collection_limits['tf_type'] in ('raw', 'augmented', 'boolean')), \
        "The value collection_limits['tf_type'] is not recognized. Please enter 'raw', 'boolean' or 'augmented' as its value."

    # SET UP SECONDARY PARAMETERS
    # Create proxies dictionary
    proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)}
    # Load Twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()

    # Load place_savers dictionary
    print "\nGetting information of current hop and finished users..."
    place_savers = breadth_first_sampling.load_place_savers(save_dir['twitter_profiles'])
    print "\tAs of now {} user profiles have been collected and saved to {}".format(len(place_savers['finished_users']), save_dir['twitter_profiles'])
    print "\tThe current hop is {}".format(place_savers['cur_hop'])
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    print "\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop'])
    # Load growth parameters
    growth_params = breadth_first_sampling.load_growth_params(save_dir['twitter_profiles'])

    # API AUTHORIZATION
    print "\nAPI Authorization"
    OAUTH = pyTweet.get_authorization(twitter_keys)
    print "Start with key {}".format(OAUTH['KEY_FILE'])

    # CONFIGURE SCHEMA FOR TF-IDF ANALYSIS
    print "\nConfigure database for TF-IDF analysis"
    json_to_database.configure_database_to_build_network(cur, conn)
    # Load topics
    for t in topic_seed:
        if (t is None) or (t.strip() == ''):
            continue
        json_to_database.make_sql_edit(cur, conn, "INSERT INTO topics (topic, khop) VALUES ('{}', -1);".format(t.strip()))
    # Add columns for this sampling method
    new_columns = [{'table': 'users', 'col': 'has_timeline_filter', 'type': 'BOOLEAN'},     # Indicates if a user's timeline has already been filtered
                   {'table': 'users', 'col': 'timeline_document', 'type': 'TEXT[]'},        # Document created from relevant tweets
                   {'table': 'topics', 'col': 'document_frequency', 'type': 'FLOAT'},       # Document frequency
                   {'table': 'users', 'col': 'decision_candid_tfdf_score', 'type': 'FLOAT'}]
    for i in new_columns:
        try:
            json_to_database.make_sql_edit(cur, conn, "ALTER TABLE {} ADD {} {};".format(i['table'], i['col'], i['type']))
            print "Add column {} to table {}.".format(i['col'], i['table'])
        except psycopg2.ProgrammingError:
            conn.rollback()
    new_ind = [{'table': 'users', 'col': 'has_timeline_filter'}]

    # SAMPLING LOOP
    print "\nBegin collection"
    cur_hop = place_savers['cur_hop']
    for ii in range(cur_hop, hop_limits['max_hops']):
        print "\nWorking on collecting hop {} containing {} profiles.".format(ii, len(place_savers['cur_user_list']))

        # GET PROFILE INFORMATION
        if ii < 1:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='screen_name', hop=ii)
        else:
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=place_savers['cur_user_list'], proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii)
        growth_params['h{}_users.json'.format(ii)] = set(place_savers['cur_user_list'])
        breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)

        # GET TIMELINES
        cur.execute("SELECT user_id FROM users WHERE (has_timeline IS NULL) AND (expand_user = TRUE OR expand_user IS NULL) AND (khop = {});".format(ii))
        uids = cur.fetchall()
        for j in uids:
            # Get timeline
            _get_timeline_wrapper(cur=cur, conn=conn, user_id=j[0], tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
        # Filter users by timeline
        _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii)
        # Create documents from timelines
        candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
        # Expand relevant seed users
        json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user = TRUE WHERE khop = 0 AND expand_user IS NULL;")

        # GET NEXT SET OF USERS
        if ii < (hop_limits['max_hops'] - 1):
            # USER MENTIONS
            print "\nCHOOSE NEXT SET OF USERS FROM USER MENTIONS"
            new_um = set([])
            cur.execute(cur.mogrify("SELECT DISTINCT tweets.user_mentions FROM tweets INNER JOIN users ON users.user_id=tweets.user_id WHERE (tweets.created_at >= %s AND tweets.created_at <= %s) AND users.expand_user = TRUE AND users.khop = %s AND (tweets.user_mentions IS NOT NULL OR tweets.user_mentions != '{}');", (tl_start_date, tl_end_date, ii)))
            uids = cur.fetchall()
            for t in uids:
                new_um = new_um.union(set(t[0]))
            # Get user mention profiles
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=new_um, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            # Get timelines
            for um in new_um:
                _get_timeline_wrapper(cur=cur, conn=conn, user_id=um, tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
            # Filter users by timeline
            _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii+1)
            # Create documents from timelines
            candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
            # User mentions that have expand_user = NULL will be set to TRUE
            for um in new_um:
                json_to_database.make_sql_edit(cur, conn, "UPDATE users SET expand_user = TRUE WHERE expand_user IS NULL AND user_id = {};".format(um))
            growth_params['h{}_user_mentions.json'.format(ii)] = set(new_um)
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            del new_um

            # FRIENDS AND FOLLOWERS
            print "\nCHOOSE NEXT SET OF USERS FROM FRIENDS AND FOLLOWERS"
            print "Collect friends"
            cur.execute("SELECT user_id FROM users WHERE expand_user = TRUE AND khop = {} AND friends_count > 0 AND friends_list IS NOT NULL;".format(ii))
            hasfriends = cur.fetchall()
            growth_params['h{}_friends.json'.format(ii)] = set([])
            for u in hasfriends:
                print "\nCollect friends for user {}.".format(u[0])
                friends_list = pyTweet.get_user_friends(user_id=u[0], proxies=proxies, auth=OAUTH, limit=100)
                json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET friends_list = %s WHERE user_id = %s;", (friends_list, u[0])))
                growth_params['h{}_friends.json'.format(ii)].update(set(friends_list))
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            print "Collect followers"
            cur.execute("SELECT user_id FROM users WHERE expand_user = TRUE AND khop = {} AND followers_count > 0 AND followers_list IS NOT NULL;".format(ii))
            hasfollowers = cur.fetchall()
            growth_params['h{}_followers.json'.format(ii)] = set([])
            for u in hasfollowers:
                print "\nCollect followers for user {}.".format(u[0])
                followers_list = pyTweet.get_user_followers(user_id=u[0], proxies=proxies, auth=OAUTH, limit=100)
                json_to_database.make_sql_edit(cur, conn, cur.mogrify("UPDATE users SET followers_list = %s WHERE user_id = %s;", (followers_list, u[0])))
                growth_params['h{}_followers.json'.format(ii)].update(set(followers_list))
            breadth_first_sampling.save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=ii)
            print "Get profiles and timelines of friends and followers"
            cur.execute("SELECT user_id, friends_list, followers_list FROM users WHERE expand_user = TRUE AND khop = {} AND (ARRAY_LENGTH(friends_list, 1) > 0 OR ARRAY_LENGTH(followers_list, 1) > 0);".format(ii))
            flist = cur.fetchall()
            ids = set([])
            for f in flist:
                if f is not None:
                    if f[1] is not None:
                        ids.update(f[1])
                    if f[2] is not None:
                        ids.update(f[2])
            ids = list(ids)
            # Get profiles of friends/followers
            _get_profiles_wrapper(cur=cur, conn=conn, user_list=ids, proxies=proxies, auth=OAUTH, list_type='user_id', hop=ii+1)
            for i in range(len(ids)):
                print "\nGet timeline for friend/follower {}: {} out of {}".format(ids[i], i, len(ids))
                _get_timeline_wrapper(cur=cur, conn=conn, user_id=ids[i], tl_start_date=tl_start_date, proxies=proxies, auth=OAUTH)
            # Filter profiles by timeline
            _filter_by_timeline(cur=cur, conn=conn, tl_start_date=tl_start_date, tl_end_date=tl_end_date, khop=ii+1)
            # Create documents from timelines
            candid_tfidf.create_documents(cur, conn, tl_start_date, tl_end_date)
            # Compute the CANDID information score, and discriminate users
            for f in flist:
                if (f is not None) and (f[0] is not None):
                    candid_tfidf.compute_candid_score(cur=cur, conn=conn, parent_id=f[0], tl_start_date=tl_start_date, tl_end_date=tl_end_date, threshold_percentile=collection_limits['threshold_percentile'], tf_type=collection_limits['tf_type'])

        # PREPARE FOR NEXT HOP
        place_savers['cur_hop'] = ii + 1
        place_savers['cur_user_list'] = set([])
        cur.execute("SELECT user_id FROM users WHERE khop = {} AND expand_user = TRUE;".format(ii + 1))
        new_profiles = cur.fetchall()
        for np in new_profiles:
            place_savers['cur_user_list'].add(np[0])
        breadth_first_sampling.save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
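

# ---------------------------------------------------------------------------
# Minimal usage sketch for depth_first_causal_search (not part of the original
# module). The database credentials, seed users/topics, proxy, and limits are
# hypothetical placeholders; the PostgreSQL database named below must already
# exist, as the function requires.
def _example_depth_first_causal_search():
    postgres_params = {'dbname': 'twitter_graph',             # hypothetical database name
                       'user': 'postgres',
                       'password': 'postgres'}
    depth_first_causal_search(user_seed=['example_user_1', 'example_user_2'],
                              topic_seed=['flood', 'evacuation'],           # hypothetical seed topics
                              tl_start_date=datetime.date(2015, 1, 1),
                              tl_end_date=datetime.date(2015, 6, 1),
                              postgres_params=postgres_params,
                              host='127.0.0.1', port='8080',                # hypothetical HTTP proxy
                              save_dir={'twitter_profiles': os.path.join(os.getcwd(), 'profiles')},
                              hop_limits={'max_hops': 2},
                              collection_limits={'threshold_percentile': 0.05,
                                                 'tf_type': 'raw'})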