def _get_timeline_wrapper(cur, conn, user_id, tl_start_date, proxies, auth): """ This function is a wrapper for grabbing user timelines. :param cur: Cursor to database :param conn: Connection to databaase :param user_id: Twitter user ID :param tl_start_date: Start date of timeline, datetime object :param proxies: proxy dictionary, ex. {'http': 'http://%s:%s' % (HOST, PORT), 'https': 'http://%s:%s' % (HOST, PORT)} :param auth: Twitter application authentication, see the get_authorization method """ if (user_id is None): return if isinstance(user_id, basestring) and (user_id.strip() == ''): return # Has timeline already been collected? try: cur.execute("SELECT expand_user, has_timeline FROM users WHERE user_id = {};".format(user_id)) q = cur.fetchone() except psycopg2.ProgrammingError: print "strange programming error" print "user id is ", user_id print type(user_id) # sys.utc(1) except psycopg2.InternalError: conn.rollback() cur.execute("SELECT expand_user, has_timeline FROM users WHERE user_id = {};".format(user_id)) q = cur.fetchone() if (q is None) or (q[0] is False) or (q[1] is not None): # User is not in database, or user shouldn't be expanded, or TL has already been collected return # Is the user's profile protected or deleted? 
cur.execute("SELECT profile_id FROM lost_profiles WHERE user_id = {};".format(user_id)) qq = cur.fetchone() if qq is not None: return # check to see if user already has timeline if (q[0] is not False) and (q[1] is None): print '\tGet user timeline for user ', user_id TL = pyTweet.collect_user_timeline(USER=user_id, USER_type='user_id', start_date=tl_start_date, proxies=proxies, auth=auth) # Ignore empty TL if TL == []: json_to_database.make_sql_edit(cur, conn, "UPDATE users SET has_timeline=FALSE, timeline_is_relevant=FALSE WHERE user_id = {};".format(user_id)) return # add date of collection to time line for tl in range(len(TL)): TL[tl]['DOC'] = datetime.datetime.utcnow() # Update has_tl tweetAdded = json_to_database.add_timeline(cur=cur, conn=conn, tldata=TL) if tweetAdded: json_to_database.make_sql_edit(cur, conn, "UPDATE users SET has_timeline = TRUE WHERE user_id = {};".format(user_id)) else: json_to_database.make_sql_edit(cur, conn, "UPDATE users SET has_timeline = FALSE, timeline_is_relevant = FALSE WHERE user_id = {};".format(user_id))
def _save_timeline_json(user_id, filename, start_date, proxies, auth): """ This function fixes a timeline JSON in case that its previous saving process was interrupted. :param user_id: Twitter user ID :param filename: Filename to save timeline JSON :param start_date: Start date of timeline, datetime object :param proxies: proxy dictionary, ex. {'http': 'http://%s:%s' % (HOST, PORT), 'https': 'http://%s:%s' % (HOST, PORT)} :param auth: Twitter application authentication, see the get_authorization method """ assert "timeline" in filename, "The file is named {}. Are you sure that this is a valid timeline JSON?" tl_info = pyTweet.collect_user_timeline( USER=user_id, USER_type="user_id", start_date=start_date, proxies=proxies, auth=auth ) for tt in tl_info: tt["DOC"] = datetime.datetime.utcnow().strftime("%m-%d-%Y %H:%M:%S %z") fast_save(filename=filename, obj=tl_info)
def _save_timeline_json(user_id, filename, start_date, proxies, auth): """ This function fixes a timeline JSON in case that its previous saving process was interrupted. :param user_id: Twitter user ID :param filename: Filename to save timeline JSON :param start_date: Start date of timeline, datetime object :param proxies: proxy dictionary, ex. {'http': 'http://%s:%s' % (HOST, PORT), 'https': 'http://%s:%s' % (HOST, PORT)} :param auth: Twitter application authentication, see the get_authorization method """ assert ( 'timeline' in filename ), "The file is named {}. Are you sure that this is a valid timeline JSON?" tl_info = pyTweet.collect_user_timeline(USER=user_id, USER_type='user_id', start_date=start_date, proxies=proxies, auth=auth) for tt in tl_info: tt['DOC'] = datetime.datetime.utcnow().strftime("%m-%d-%Y %H:%M:%S %z") fast_save(filename=filename, obj=tl_info)
def _bfs_read_json(path):
    """Load one JSON document with ujson, closing the file handle even if parsing fails."""
    with open(path, "r") as fid:
        return ujson.load(fid)


def _bfs_cap(items, limit):
    """Return the leading ``limit`` items of a list; a limit of None means no cap."""
    if limit is None or limit >= len(items):
        return items
    return items[:limit]


def _bfs_profile_path(profiles_dir, stored_name):
    """Resolve a finished_users entry (bare file name or full path) to a path in the profile directory."""
    return os.path.join(profiles_dir, os.path.basename(stored_name))


def _bfs_timeline_path(timelines_dir, stored_name):
    """Build the timeline path that shares its identifier with the given profile file name."""
    # [9:-5] drops a 9-character prefix and a 5-character suffix from the profile
    # file name (presumably 'userInfo_' and '.json' -- confirm against _save_profile_json).
    return os.path.join(timelines_dir, "timeline_{}.json".format(os.path.basename(stored_name)[9:-5]))


def _bfs_load_timeline(user_id, timeline_filename, start_date, proxies, auth):
    """Return a non-empty saved timeline, collecting it first when missing or unreadable; None if there is nothing to use."""
    if os.path.isfile(timeline_filename):
        if os.path.getsize(timeline_filename) == 0:
            return None
        try:
            tldata = _bfs_read_json(timeline_filename)
            if len(tldata) < 1:
                return None
            tldata[0]["text"]  # probe: a usable timeline entry carries a 'text' field
            return tldata
        except (IOError, KeyError):
            pass  # fall through and rebuild the timeline file
    _save_timeline_json(
        user_id=user_id, filename=timeline_filename, start_date=start_date, proxies=proxies, auth=auth
    )
    if os.path.getsize(timeline_filename) == 0:
        return None
    tldata = _bfs_read_json(timeline_filename)
    if len(tldata) < 1:
        return None
    return tldata


def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir=None, hop_limits=None):
    """
    This function creates a network based on Twitter friends

    Progress is checkpointed through the place-saver and growth-parameter files
    in the profile directory, so an interrupted run resumes at the saved hop.
    Dictionaries passed for save_dir and hop_limits are completed in place with
    the defaults described below.

    :param user_seed: List of user names
    :param timeline_start_date: Beginning of date (datetime.date object) of timelines in collection
    :param host: Your host IP
    :param port: Your port
    :param save_dir: Set locations for the profile and timeline directory to save .JSONs. The default will
        be your current working directory.
        EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons',
                        'twitter_timelines': '/dir/to/save/timeline/jsons'}
    :param hop_limits: Specify your graph constraints with the variable hop_limits. First determine the
        maximum number of hops to make the graph with 'max_hops', then decide the maximum amount of data to
        collect in 'max_data'. This will be the combined profile and timeline .JSON files. Set it to 'None'
        if you don't want to limit the amount of data collected. Next, set limits (per individual) on how
        many friends, followers, replied to users, and mentioned users to include in the next hop. You can
        specify values [0, Inf) or None. Specifying 'None' implies that you do not wish to limit the
        collection, and will expand the graph on as many as these edges as possible. Occasionally, you may
        get back fewer edges for a user than the limit you set. Note that friends and followers will be
        saved in the fields 'friends_list' and 'followers_list' automatically. The reply and mention users
        are saved in timelines.
        EX. hop_limits = {'max_hops': 2,              # Maximum number of hops in graph
                          'max_data': None,           # Maximum amount of data (in GB)
                          'friends': 0,               # Maximum friends per user to include in next hop
                          'followers': None,          # Maximum followers per user to include in next hop
                          'in_reply_to_user_id': 17,  # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop
                          'user_mention_id': 21}      # Maximum 'user_mention_id' per user's timeline to include in next hop
    """
    # Fresh dictionaries per call: the defaults used to be shared mutable objects
    # that this function writes into.
    if save_dir is None:
        save_dir = {}
    if hop_limits is None:
        hop_limits = {}

    # CHECK PARAMETERS
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ("twitter_profiles" not in save_dir) or (save_dir["twitter_profiles"].strip() == ""):
        save_dir["twitter_profiles"] = os.path.join(os.getcwd(), "profiles")
        print("\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(
            save_dir["twitter_profiles"]
        ))
    if not os.path.isdir(save_dir["twitter_profiles"]):
        print("\tThe directory {} does not exist...creating it now".format(save_dir["twitter_profiles"]))
        os.mkdir(save_dir["twitter_profiles"])
    if ("twitter_timelines" not in save_dir) or (save_dir["twitter_timelines"].strip() == ""):
        save_dir["twitter_timelines"] = os.path.join(os.getcwd(), "timelines")
        print("\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format(
            save_dir["twitter_timelines"]
        ))
    if not os.path.isdir(save_dir["twitter_timelines"]):
        print("\tThe directory {} does not exist...creating it now".format(save_dir["twitter_timelines"]))
        os.mkdir(save_dir["twitter_timelines"])
    profiles_dir = save_dir["twitter_profiles"]
    timelines_dir = save_dir["twitter_timelines"]

    # Check hop_limits dictionary
    hop_limits_defaults = {
        "max_hops": [6, "the maximin number of hops in graph"],
        "max_data": [2, "the maximin amount of data collected (in GB)"],
        "friends": [0, "max friends per user to include in next hop"],
        "followers": [0, "max followers per user to include in next hop"],
        "in_reply_to_user_id": [0, "max 'in_reply_to_user_id' per user's timeline to include in next hop"],
        "user_mention_id": [0, "max 'user_mention_id' per user's timeline to include in next ho"],
    }
    for kk in hop_limits_defaults:
        if kk not in hop_limits:
            hop_limits[kk] = hop_limits_defaults[kk][0]
            print("\tNo Value was specified for hop_limits['{}'], {}, so it will be set to {}.".format(
                kk, hop_limits_defaults[kk][1], hop_limits_defaults[kk][0]
            ))

    # Check data amount and quit if graph has reached limit
    if hop_limits["max_data"] is not None:
        data_vol = measure_data(user_dir=profiles_dir, timeline_dir=timelines_dir)
        if data_vol > hop_limits["max_data"]:
            print("The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
                data_vol, hop_limits["max_data"]
            ))
            return

    # DETERMINE COLLECTION PARAMETERS
    # Load place_savers dictionary
    print("\nGetting information of current hop and finished users...")
    place_savers = load_place_savers(profiles_dir)
    print("\tAs of now {} user profiles have been collected and saved to {}".format(
        len(place_savers["finished_users"]), profiles_dir
    ))
    print("\tThe current hop is {}".format(place_savers["cur_hop"]))
    if place_savers["cur_hop"] < 1:
        place_savers["cur_user_list"] = set(user_seed)
        save_place_savers(user_dir=profiles_dir, place_saver_obj=place_savers)
    print("\tWe will collect {} users in hop {}".format(len(place_savers["cur_user_list"]), place_savers["cur_hop"]))
    # Load growth parameters
    growth_params = load_growth_params(profiles_dir)
    # Create proxies dictionary
    proxies = {"http": "http://%s:%s" % (host, port), "https": "http://%s:%s" % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # API authorization
    auth = pyTweet.get_authorization(twitter_keys)

    # (edge name, line printed before the section, section banner, per-user progress message)
    edge_sections = (
        ("friends", "\nGet friends of each user in cur_user_list",
         "\nCOLLECT FRIENDS OF CURRENT USER SET", "Collect friends for user {}"),
        ("followers", "\nGet followers of each user in the cur_user_list",
         "\nCOLLECT FOLLOWERS OF CURRENT USER SET", "Collect followers for user {}"),
    )
    # (timeline entity type, growth-parameter key template, section banner)
    entity_sections = (
        ("user_mention_id", "h{}_user_mentions.json", "\nCOLLECT USER MENTIONS OF CURRENT SET"),
        ("in_reply_to_user_id", "h{}_replies.json", "\nCOLLECT USERS CURRENT SET REPLIES TO"),
    )

    # BUILD THE GRAPH
    print("\nStart building the graph!")
    for khop in range(place_savers["cur_hop"], hop_limits["max_hops"]):
        print("\nGet information for the {}th-hop users. There are {} total users in this hop.".format(
            khop, len(place_savers["cur_user_list"])
        ))
        print("Create the user list of the {}th-hop users as well.".format(khop + 1))

        # Get profile information of users in cur_user_list
        print("\nCOLLECT PROFILE INFORMATION FOR THE CURRENT SET OF USERS")
        if khop < 1:
            # Hop 0 works on screen names: skip the ones whose profile is already on disk
            profiles_to_collect = set(place_savers["cur_user_list"])
            for json_filename in place_savers["finished_users"].values():
                data = _bfs_read_json(os.path.join(profiles_dir, json_filename))
                if data["screen_name"] in place_savers["cur_user_list"]:
                    profiles_to_collect.discard(data["screen_name"])
            # Collect and save profiles
            user_info = []
            if len(profiles_to_collect) > 0:
                print("\nstart collecting profiles: {} profiles".format(len(profiles_to_collect)))
                user_info = pyTweet.user_lookup_usernames(
                    user_list=list(profiles_to_collect), proxies=proxies, auth=auth
                )
                if isinstance(user_info, dict) and ("errors" in user_info):
                    print("\nThe initial seed cannot be collected...")
                    print("Twitter error message:  {}".format(user_info))
                    # An error payload is not a list of profiles; stop here rather
                    # than iterate over its keys as if they were profile records.
                    return
            # Save profile information
            for udata in user_info:
                json_filename = _save_profile_json(profile_struct=udata, save_dir=profiles_dir, khop=khop)
                place_savers["finished_users"][udata["id"]] = json_filename
            # Convert screen names to user IDs in cur_user_list, identify unavailable accounts as well
            ids_by_screen_name = {}  # screen name -> profile 'id' (None when the profile has no 'id')
            for jj in filter(lambda k: re.match("userInfo_*", k), os.listdir(profiles_dir)):
                full_filename = os.path.join(profiles_dir, jj)
                if os.path.getsize(full_filename) == 0:
                    continue
                try:
                    profile = _bfs_read_json(full_filename)
                except ValueError:
                    continue  # unparsable profile file
                ids_by_screen_name[profile["screen_name"]] = profile["id"] if "id" in profile else None
            # Get corresponding user IDs for each screen name in cur_user_list
            cur_user_list_ids = set()
            for scn_name in profiles_to_collect.union(place_savers["cur_user_list"]):
                if ids_by_screen_name.get(scn_name) is not None:
                    cur_user_list_ids.add(int(ids_by_screen_name[scn_name]))
                else:
                    place_savers["unavailable_accounts"].add(scn_name)
            print(cur_user_list_ids)
            del profiles_to_collect
            place_savers["cur_user_list"] = set(cur_user_list_ids)
        else:
            # Collect and save profiles
            profiles_to_collect = set(place_savers["cur_user_list"]).difference(
                set(map(int, place_savers["finished_users"].keys()))
            )
            user_info = pyTweet.user_lookup_userids(user_list=list(profiles_to_collect), proxies=proxies, auth=auth)
            for udata in user_info:
                json_filename = _save_profile_json(profile_struct=udata, save_dir=profiles_dir, khop=khop)
                place_savers["finished_users"][udata["id"]] = json_filename
            # Update current user list, and identify unavailable accounts
            new_cur_user_list = set()
            for uid in profiles_to_collect.union(set(place_savers["cur_user_list"])):
                if uid in place_savers["unavailable_accounts"]:
                    continue
                if uid in place_savers["finished_users"]:
                    new_cur_user_list.add(uid)
                else:
                    place_savers["unavailable_accounts"].add(uid)
            place_savers["cur_user_list"] = new_cur_user_list

        # Save place saving variables
        growth_params["h{}_users.json".format(khop)] = set(place_savers["cur_user_list"])
        save_growth_params(user_dir=profiles_dir, growth_obj=growth_params, cur_hop=khop)
        save_place_savers(user_dir=profiles_dir, place_saver_obj=place_savers)

        # Get timelines for each user in cur_user_list
        print("\nCOLLECT TIME LINES FOR CURRENT SET OF USERS")
        for uid in place_savers["cur_user_list"]:
            if uid not in place_savers["finished_users"]:
                continue
            stored_name = place_savers["finished_users"][uid]
            profile_filename = _bfs_profile_path(profiles_dir, stored_name)
            timeline_filename = _bfs_timeline_path(timelines_dir, stored_name)
            try:
                profile = _bfs_read_json(profile_filename)
                profile["id"]
            except (IOError, KeyError, TypeError, ValueError):
                # NOTE(review): the previous code re-looked the user up here, but its
                # guard (`user_info is not dict`) was always true, so the user was
                # always skipped. The skip is kept; the discarded lookup is dropped.
                continue
            if profile.get("has_timeline") is True:
                continue
            if not os.path.isfile(timeline_filename):
                print("Collect the timeline for user {}.".format(uid))
                tldata = pyTweet.collect_user_timeline(
                    USER=uid, USER_type="user_id", start_date=timeline_start_date, proxies=proxies, auth=auth
                )
                for tweet in tldata:
                    tweet["DOC"] = datetime.datetime.utcnow().strftime("%m-%d-%Y %H:%M:%S %z")
                    # NOTE(review): flag is set per tweet as before; it may have been meant for the profile.
                    tweet["has_timeline"] = True
                # Write the timeline to its own file; it used to overwrite the profile JSON.
                fast_save(filename=timeline_filename, obj=tldata)

        # Expand on friends, then followers
        for edge, intro, banner, progress in edge_sections:
            print(intro)
            limit = hop_limits[edge]
            if limit == 0:
                continue
            count_key = edge + "_count"
            list_key = edge + "_list"
            growth_key = "h{}_{}.json".format(khop, edge)
            growth_params[growth_key] = set()
            print(banner)
            for jj in place_savers["cur_user_list"]:
                if jj not in place_savers["finished_users"]:
                    continue
                profile_filename = _bfs_profile_path(profiles_dir, place_savers["finished_users"][jj])
                try:
                    data = _bfs_read_json(profile_filename)
                    data["id"]
                except (IOError, KeyError, TypeError):
                    # NOTE(review): same always-skipping recovery path as in the timeline loop.
                    continue
                if data[count_key] < 1:
                    data[list_key] = []
                    fast_save(filename=profile_filename, obj=data)
                    continue
                if list_key not in data:
                    print(progress.format(jj))
                    # get_user_friends / get_user_followers: followers used to be fetched
                    # with the friends endpoint.
                    fetch_edges = getattr(pyTweet, "get_user_" + edge)
                    data[list_key] = fetch_edges(user_id=jj, limit=limit, proxies=proxies, auth=auth)
                    fast_save(filename=profile_filename, obj=data)
                # Only the first `limit` entries seed the next hop (None means all of them)
                place_savers["next_user_list"].update(_bfs_cap(data[list_key], limit))
                growth_params[growth_key].update(data[list_key])
            save_place_savers(user_dir=profiles_dir, place_saver_obj=place_savers)
            save_growth_params(user_dir=profiles_dir, growth_obj=growth_params, cur_hop=khop)

        # Expand on user mentions, then replies, pulled from the saved timelines
        for entity_type, growth_template, banner in entity_sections:
            if hop_limits[entity_type] == 0:
                continue
            print(banner)
            growth_key = growth_template.format(khop)
            growth_params[growth_key] = set()
            for jj in place_savers["cur_user_list"]:
                if jj not in place_savers["finished_users"]:
                    continue
                timeline_filename = _bfs_timeline_path(timelines_dir, place_savers["finished_users"][jj])
                tldata = _bfs_load_timeline(
                    user_id=jj, timeline_filename=timeline_filename, start_date=timeline_start_date,
                    proxies=proxies, auth=auth,
                )
                if tldata is None:
                    continue
                found = pyTweet.pull_timeline_entitites(
                    timeline=tldata, type=entity_type, limit=hop_limits[entity_type]
                )
                growth_params[growth_key].update(found)
                place_savers["next_user_list"].update(found)
            save_place_savers(user_dir=profiles_dir, place_saver_obj=place_savers)
            save_growth_params(user_dir=profiles_dir, growth_obj=growth_params, cur_hop=khop)

        # Check data limit
        if hop_limits["max_data"] is not None:
            data_vol = measure_data(user_dir=profiles_dir, timeline_dir=timelines_dir)
            if data_vol > hop_limits["max_data"]:
                print("The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
                    data_vol, hop_limits["max_data"]
                ))
                return

        # Prepare for next iteration
        place_savers["cur_hop"] = khop + 1
        place_savers["cur_user_list"] = set(place_savers["next_user_list"])
        place_savers["next_user_list"] = set()
        save_place_savers(user_dir=profiles_dir, place_saver_obj=place_savers)
def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir=None, hop_out_limits=None, collection_limits=None):
    """
    This function creates a network based on Twitter friends

    Users are looked up 100 at a time. For each user the profile, friends,
    followers and timeline are saved, and the configured edges are added to the
    user list of the next hop. Dictionaries passed for save_dir, hop_out_limits
    and collection_limits are completed in place with the defaults described
    below.

    @param user_seed - List of user names
    @param timeline_start_date - Beginning of date (datetime.date object) of timelines in collection
    @param host - Proxy host, used to build the proxies dictionary
    @param port - Proxy port, used to build the proxies dictionary
    @param save_dir - Set locations for the profile and timeline directory to save .JSONs. The default will be
        your current working directory.
        EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons',
                        'twitter_timelines': '/dir/to/save/timeline/jsons'}
    @param hop_out_limits - Specify your graph constraints with the variable hop_out_limits. First determine the
        maximum number of hops to make the graph with 'max_hops', then decide the maximum amount of data to
        collect in 'max_data'. This will be the combined profile and timeline .JSON files. Set it to 'None' if
        you don't want to limit the amount of data collected. Next, set limits (per individual) on how many
        friends, followers, replied to users, and mentioned users to include in the next hop. You can specify
        values [0, Inf) or None. Specifying 'None' implies that you do not wish to limit the collection, and
        will expand the graph on as many as these edges as possible. Occasionally, you may get back fewer edges
        for a user than the limit you set. Note that friends and followers will be saved in the fields
        'friends_list' and 'followers_list' automatically. The reply and mention users are saved in timelines.
        EX. hop_out_limits = {'max_hops': 2,              # Maximum number of hops in graph
                              'max_data': None,           # Maximum amount of data (in GB)
                              'friends': 0,               # Maximum friends per user to include in next hop
                              'followers': None,          # Maximum followers per user to include in next hop
                              'in_reply_to_user_id': 17,  # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop
                              'user_mention_id': 21}      # Maximum 'user_mention_id' per user's timeline to include in next hop
    @param collection_limits - Suppose that you want to store friends or followers, but do not want to expand the
        graph based on them. Specify limitations on collecting friends and followers below. Notice that reply and
        mention users are saved in the timelines. The largest possible length of 'friends_list' will be the
        greater of hops out limit and collection limit, or
        MAX(hop_out_limits['friends'], collection_limits['friends']). The same description follows for
        'followers_list'.
        EX. collection_limits = {'friends': 0,       # Maximum number of friends per user to save within the profile .JSON
                                 'followers': None}  # Maximum number of followers per user to save within the profile .JSON
    """
    # Fresh dictionaries per call: the defaults used to be shared mutable objects
    # that this function writes into.
    if save_dir is None:
        save_dir = {}
    if hop_out_limits is None:
        hop_out_limits = {}
    if collection_limits is None:
        collection_limits = {}

    # CHECK PARAMETERS
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print("\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(save_dir['twitter_profiles']))
    if not os.path.isdir(save_dir['twitter_profiles']):
        print("\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles']))
        os.mkdir(save_dir['twitter_profiles'])
    if ('twitter_timelines' not in save_dir) or (save_dir['twitter_timelines'].strip() == ''):
        save_dir['twitter_timelines'] = os.path.join(os.getcwd(), 'timelines')
        print("\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format(save_dir['twitter_timelines']))
    if not os.path.isdir(save_dir['twitter_timelines']):
        print("\tThe directory {} does not exist...creating it now".format(save_dir['twitter_timelines']))
        os.mkdir(save_dir['twitter_timelines'])
    profiles_dir = save_dir['twitter_profiles']
    timelines_dir = save_dir['twitter_timelines']

    # Check hop_out_limits and collection_limits dictionaries: (dictionary, key, default, message)
    limit_defaults = (
        (hop_out_limits, 'max_hops', 6, "\tNo value was specified for hop_out_limits['max_hops'], the maximin number of hops in graph, so it will be set to {}."),
        (hop_out_limits, 'max_data', 2, "\tNo value was specified for hop_out_limits['max_data'], the maximin amount of data collected (in GB), so it will be set to {}."),
        (hop_out_limits, 'friends', 0, "\tNo value was specified for hop_out_limits['friends'], max friends per user to include in next hop, so it will be set to 0."),
        (hop_out_limits, 'followers', 0, "\tNo value was specified for hop_out_limits['followers'], max followers per user to include in next hop, so it will be set to 0."),
        (hop_out_limits, 'in_reply_to_user_id', 0, "\tNo value was specified for hop_out_limits['in_reply_to_user_id'], max 'in_reply_to_user_id' per user's timeline to include in next hop, so it will be set to 0."),
        (hop_out_limits, 'user_mention_id', 0, "\tNo value was specified for hop_out_limits['user_mention_id'], max 'user_mention_id' per user's timeline to include in next hop, so it will be set to 0."),
        (collection_limits, 'friends', 0, "\tNo value was specified for collection_limits['friends'], max number of friends per user to save with the profile .JSON, so it will be set to 0."),
        (collection_limits, 'followers', 0, "\tNo value was specified for collection_limits['followers'], max number of followers per user to save with the profile .JSON, so it will be set to 0."),
    )
    for limits, key, default, message in limit_defaults:
        if key not in limits:
            limits[key] = default
            print(message.format(default))

    # Check data amount and quit if graph has reached limit.
    # Done after the defaults are applied so that the default 'max_data' is enforced too.
    if hop_out_limits['max_data'] is not None:
        data_vol = measure_data(user_dir=profiles_dir, timeline_dir=timelines_dir)
        if data_vol > hop_out_limits['max_data']:
            print("The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(data_vol, hop_out_limits['max_data']))
            return

    # DETERMINE COLLECTION PARAMETERS
    # Load place_savers dictionary
    print("\nGetting information of current hop and finished users...")
    place_savers = load_place_savers(profiles_dir)
    print("\tAs of now {} user profiles have been collected and saved to {}".format(len(place_savers['finished_users']), profiles_dir))
    print("\tThe current hop is {}".format(place_savers['cur_hop']))
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
    print("\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop']))
    print("\tSo far we plan to collect {} users in hop {}".format(len(place_savers['next_user_list']), place_savers['cur_hop'] + 1))

    # Determine limits for friends/followers collection: None (unlimited) wins,
    # otherwise the larger of the hop-out limit and the collection limit.
    if None in [hop_out_limits['friends'], collection_limits['friends']]:
        MAX_FRIENDS = None
    else:
        MAX_FRIENDS = max(hop_out_limits['friends'], collection_limits['friends'])
    if None in [hop_out_limits['followers'], collection_limits['followers']]:
        MAX_FOLLOWERS = None
    else:
        MAX_FOLLOWERS = max(hop_out_limits['followers'], collection_limits['followers'])

    # Create proxies dictionary
    proxies = {'http': 'http://%s:%s' % (host, port), 'https': 'http://%s:%s' % (host, port)}
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # API AUTHORIZATION
    print("\nAPI Authorization")
    auth = pyTweet.get_authorization(twitter_keys)

    # BUILD THE GRAPH
    print("\nStart building the graph!")
    for i in range(place_savers['cur_hop'], hop_out_limits['max_hops']):
        print("\nGet information for the {}th-hop users. There are {} total users in this hop.".format(i, len(place_savers['cur_user_list'])))
        print("Create the user list of the " + str(i+1) + "th-hop users.")

        # Remove finished_users from cur_user_list
        if place_savers['cur_hop'] > 0:
            place_savers['cur_user_list'].difference_update(set(map(int, place_savers['finished_users'].keys())))

        # Separate list for faster results, and delete place_savers['cur_user_list'] to free space
        pending_users = list(place_savers['cur_user_list'])
        USERS = [pending_users[z:z+100] for z in range(0, len(pending_users), 100)]
        del pending_users
        del place_savers['cur_user_list']  # save space

        for batch in USERS:
            # Look up information of users, 100 at a time
            print("\tLook up user information")
            if i < 1:
                # The initial list contain user names or @handles
                user_info = pyTweet.user_lookup_usernames(user_list=batch, proxies=proxies, auth=auth)
            else:
                # All other lists will contain user ids
                user_info = pyTweet.user_lookup_userids(user_list=batch, proxies=proxies, auth=auth)

            # Get friends, followers, and timelines of each user in user_info
            for profile_struct in user_info:
                user_key = str(profile_struct['id'])

                # Check to see that the user's friend/follower list hasn't already been collected
                user_data = None
                if user_key in place_savers['finished_users']:
                    # Load previously saved user data
                    pro_filename = os.path.join(profiles_dir, 'userInfo_' + str(place_savers['finished_users'][user_key]) + '.json')
                    if os.path.getsize(pro_filename) != 0:
                        try:
                            # Open and read profile .json
                            with open(pro_filename) as jfid:
                                user_data = ujson.load(jfid)
                            user_data['DOC'] = datetime.datetime.utcnow()
                        except ValueError:
                            user_data = None  # Fail at reading profile .json, resave it below
                    if user_data is None:
                        # File exists but it is empty or unreadable: rewrite it from the fresh lookup
                        user_data = profile_struct
                        user_data['khop'] = i
                        user_data['DOC'] = datetime.datetime.utcnow()
                        fast_save(filename=pro_filename, obj=user_data)
                else:
                    # The user's profile has not been collected...start now
                    place_savers['finished_users'][user_key] = str(uuid.uuid4())
                    pro_filename = os.path.join(profiles_dir, 'userInfo_{}.json'.format(str(place_savers['finished_users'][user_key])))
                    # Add user information: hop, DOC
                    user_data = profile_struct
                    user_data['khop'] = i
                    user_data['DOC'] = datetime.datetime.utcnow()
                    fast_save(filename=pro_filename, obj=user_data)
                    print("\tSaved user {} information in {}.".format(user_key, pro_filename))

                # Collect user friends
                if 'friends_list' not in user_data:
                    friends_list = []
                    if (user_data['friends_count'] > 0) and ((MAX_FRIENDS is None) or (MAX_FRIENDS > 0)):
                        print("\tCollect friends for user {}.".format(user_key))
                        friends_list = pyTweet.get_user_friends(user_id=user_key, limit=MAX_FRIENDS, proxies=proxies, auth=auth)
                    user_data['friends_list'] = friends_list
                    fast_save(filename=pro_filename, obj=user_data)
                # Add friends to next_user_list (this used to call difference_update, which removed them)
                place_savers['next_user_list'].update(user_data['friends_list'][0:hop_out_limits['friends']])
                save_place_savers(user_dir=profiles_dir, place_savers=place_savers)

                # Collect user followers
                if 'followers_list' not in user_data:
                    followers_list = []
                    if (user_data['followers_count'] > 0) and ((MAX_FOLLOWERS is None) or (MAX_FOLLOWERS > 0)):
                        print("\tCollect followers for user {}.".format(user_key))
                        followers_list = pyTweet.get_user_followers(user_id=user_key, limit=MAX_FOLLOWERS, proxies=proxies, auth=auth)
                    user_data['followers_list'] = followers_list
                    fast_save(filename=pro_filename, obj=user_data)
                # Add followers to next_user_list (this used to call difference_update, which removed them)
                place_savers['next_user_list'].update(user_data['followers_list'][0:hop_out_limits['followers']])
                save_place_savers(user_dir=profiles_dir, place_savers=place_savers)

                # Collect timeline for user beginning from start_date
                tl_filename = os.path.join(timelines_dir, 'timeline_{}.json'.format(place_savers['finished_users'][user_key]))
                tldata = None
                if os.path.isfile(tl_filename):
                    print("\tThe timeline for user {} has already been collected.".format(user_key))
                    if os.path.getsize(tl_filename) == 0:
                        continue  # Skip empty time lines
                    try:
                        with open(tl_filename) as jfid:
                            tldata = ujson.load(jfid)
                    except (IOError, ValueError):
                        tldata = None  # Fail at opening file, recollect time line below
                if tldata is None:
                    print("\tCollect the timeline for user {}.".format(user_key))
                    tldata = pyTweet.collect_user_timeline(USER=user_key, USER_type='user_id', start_date=timeline_start_date, proxies=proxies, auth=auth)
                    for tweet in tldata:
                        tweet['DOC'] = datetime.datetime.utcnow()
                    fast_save(filename=tl_filename, obj=tldata)

                # Pull out user mentions, if applicable (None means unlimited)
                mention_limit = hop_out_limits['user_mention_id']
                if (mention_limit is None) or (mention_limit > 0):
                    print("\tAdd user mentions to the next hop")
                    tl_mentions = pyTweet.pull_timeline_entitites(timeline=tldata, type='user_mention_id', limit=mention_limit)
                    place_savers['next_user_list'].update(tl_mentions)
                    save_place_savers(user_dir=profiles_dir, place_savers=place_savers)

                # Pull out user replies, if applicable (None means unlimited)
                reply_limit = hop_out_limits['in_reply_to_user_id']
                if (reply_limit is None) or (reply_limit > 0):
                    print("\tAdd replies to the next hop")
                    tl_replies = pyTweet.pull_timeline_entitites(timeline=tldata, type='in_reply_to_user_id', limit=reply_limit)
                    place_savers['next_user_list'].update(tl_replies)
                    save_place_savers(user_dir=profiles_dir, place_savers=place_savers)

        # Check data amount and quit if graph has reached limit
        if hop_out_limits['max_data'] is not None:
            data_vol = measure_data(user_dir=profiles_dir, timeline_dir=timelines_dir)
            if data_vol > hop_out_limits['max_data']:
                print("The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(data_vol, hop_out_limits['max_data']))
                return

        # Remove finished_users from place_savers['next_user_list']
        place_savers['next_user_list'].difference_update(set(map(int, place_savers['finished_users'].keys())))

        # Prepare for next iteration of hop
        place_savers['cur_user_list'] = place_savers['next_user_list']
        place_savers['next_user_list'] = set()
        place_savers['cur_hop'] += 1
        save_place_savers(user_dir=profiles_dir, place_savers=place_savers)
        print("There are  {}  users in the next iteration of users.".format(len(place_savers['cur_user_list'])))
    print("\nDone building graph!")
def _bfs_load_json(filename):
    """Read and return the JSON document stored in ``filename``, closing the file afterwards."""
    with open(filename, 'r') as jfid:
        return ujson.load(jfid)


def _bfs_load_profile(profile_filename):
    """Return the profile saved in ``profile_filename``, or None if it is unreadable or has no 'id' field."""
    try:
        profile = _bfs_load_json(profile_filename)
        profile['id']
    except (IOError, KeyError, TypeError, ValueError):
        return None
    return profile


def _bfs_get_profile(uid, profile_dir, place_savers, khop, proxies, auth):
    """Return ``(profile_filename, profile)`` for a finished user, re-collecting a broken profile; None on failure."""
    if uid not in place_savers['finished_users']:
        return None
    # os.path.join leaves an absolute file name untouched, so this works whether
    # 'finished_users' stores bare file names or full paths
    profile_filename = os.path.join(profile_dir, place_savers['finished_users'][uid])
    profile = _bfs_load_profile(profile_filename)
    if profile is not None:
        return profile_filename, profile
    # The saved profile is missing or corrupt: look the user up again
    user_info = pyTweet.user_lookup_userids(user_list=[uid], proxies=proxies, auth=auth)
    # A dict here is an error response (see the seed lookup); a usable answer is a
    # non-empty sequence of profile dictionaries
    if isinstance(user_info, dict) or (not user_info):
        return None
    if (not isinstance(user_info[0], dict)) or ('id' not in user_info[0]):
        return None
    json_filename = _save_profile_json(profile_struct=user_info[0], save_dir=profile_dir, khop=khop)
    place_savers['finished_users'][uid] = json_filename
    profile_filename = os.path.join(profile_dir, json_filename)
    profile = _bfs_load_profile(profile_filename)
    if profile is None:
        return None
    return profile_filename, profile


def _bfs_timeline_filename(uid, timeline_dir, place_savers):
    """Return the timeline .JSON path paired with the saved profile of ``uid``, or None if it has no profile."""
    if uid not in place_savers['finished_users']:
        return None
    profile_filename = place_savers['finished_users'][uid]
    # Drops the first 9 and last 5 characters of the file name; this assumes
    # profile files are named 'userInfo_<uuid>.json'
    uuid_profile = os.path.basename(profile_filename)[9:-5]
    return os.path.join(timeline_dir, "timeline_{}.json".format(uuid_profile))


def _bfs_load_timeline(uid, timeline_filename, start_date, proxies, auth):
    """Return the saved timeline of ``uid``, collecting or repairing the file when needed; [] if it has no tweets."""
    if os.path.isfile(timeline_filename):
        if os.path.getsize(timeline_filename) == 0:
            return []
        try:
            tldata = _bfs_load_json(timeline_filename)
            # Check for an empty list before indexing the first tweet
            if len(tldata) < 1:
                return []
            tldata[0]['text']
            return tldata
        except (IOError, KeyError, ValueError):
            pass  # Broken timeline file, collect it again below
    _save_timeline_json(user_id=uid, filename=timeline_filename, start_date=start_date, proxies=proxies, auth=auth)
    if (not os.path.isfile(timeline_filename)) or (os.path.getsize(timeline_filename) == 0):
        return []
    return _bfs_load_json(timeline_filename)


def _bfs_limit_edges(edges, limit):
    """Return at most ``limit`` entries of ``edges`` as a set; ``limit=None`` keeps every entry."""
    if limit is None:
        return set(edges)
    return set(edges[0:limit])


def _bfs_collect_edges(edge, lookup, khop, save_dir, hop_limits, place_savers, growth_params, proxies, auth):
    """Collect the ``edge`` ('friends' or 'followers') lists of the current users and queue them for the next hop."""
    count_key = '{}_count'.format(edge)
    list_key = '{}_list'.format(edge)
    growth_key = "h{}_{}.json".format(khop, edge)
    growth_params[growth_key] = set([])
    print("\nCOLLECT {} OF CURRENT USER SET".format(edge.upper()))
    for jj in place_savers['cur_user_list']:
        found = _bfs_get_profile(jj, save_dir['twitter_profiles'], place_savers, khop, proxies, auth)
        if found is None:
            continue
        profile_filename, data = found
        if data[count_key] < 1:
            # Nothing to collect, record the empty list so the user is not queried again
            data[list_key] = []
            fast_save(filename=profile_filename, obj=data)
            continue
        if list_key not in data:
            print("Collect {} for user {}".format(edge, jj))
            data[list_key] = lookup(user_id=jj, limit=hop_limits[edge], proxies=proxies, auth=auth)
            fast_save(filename=profile_filename, obj=data)
        # Only the first hop_limits[edge] users expand the graph, but every collected edge is tracked
        place_savers['next_user_list'].update(_bfs_limit_edges(data[list_key], hop_limits[edge]))
        growth_params[growth_key].update(set(data[list_key]))
        # Save after every user so an interrupted collection can be resumed
        save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
        save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=khop)


def _bfs_collect_timeline_entities(entity_type, growth_name, khop, timeline_start_date, save_dir, hop_limits,
                                   place_savers, growth_params, proxies, auth):
    """Pull ``entity_type`` user IDs out of the current users' timelines and queue them for the next hop."""
    growth_key = "h{}_{}.json".format(khop, growth_name)
    growth_params[growth_key] = set([])
    for jj in place_savers['cur_user_list']:
        timeline_filename = _bfs_timeline_filename(jj, save_dir['twitter_timelines'], place_savers)
        if timeline_filename is None:
            continue
        tldata = _bfs_load_timeline(jj, timeline_filename, timeline_start_date, proxies, auth)
        if not tldata:
            continue  # Skip empty time lines
        entities = pyTweet.pull_timeline_entitites(timeline=tldata, type=entity_type, limit=hop_limits[entity_type])
        growth_params[growth_key].update(entities)
        place_savers['next_user_list'].update(entities)
        # Save after every user so an interrupted collection can be resumed
        save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
        save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=khop)


def _bfs_data_limit_reached(save_dir, hop_limits):
    """Return True, after reporting it, when the saved data volume exceeds ``hop_limits['max_data']``."""
    if ('max_data' not in hop_limits) or (hop_limits['max_data'] is None):
        return False
    data_vol = measure_data(user_dir=save_dir['twitter_profiles'], timeline_dir=save_dir['twitter_timelines'])
    if data_vol > hop_limits['max_data']:
        print("The maximum amount of data has beek collected: {} GB with a limit of {} GB.".format(
            data_vol, hop_limits['max_data']))
        return True
    return False


def breadth_first_search(user_seed, timeline_start_date, host, port, save_dir=None, hop_limits=None):
    """
    This function creates a network based on Twitter friends, followers, mentions and replies. Starting from the
    seed users it collects profiles and timelines hop by hop, and saves its progress after every step so that an
    interrupted collection resumes at the current hop.

    :param user_seed: List of user names (screen names) that make up hop 0
    :param timeline_start_date: Beginning of date (datetime.date object) of timelines in collection
    :param host: Your host IP, used to build the proxies dictionary
    :param port: Your port, used to build the proxies dictionary
    :param save_dir: Set locations for the profile and timeline directory to save .JSONs. The default will be the
                     directories 'profiles' and 'timelines' in your current working directory. Missing fields are
                     filled in on the dictionary that is passed in.
                     EX. save_dir = {'twitter_profiles': '/dir/to/save/profile/jsons',
                                     'twitter_timelines': '/dir/to/save/timeline/jsons'}
    :param hop_limits: Specify your graph constraints with the variable hop_limits. First determine the maximum
                       number of hops to make the graph with 'max_hops', then decide the maximum amount of data to
                       collect in 'max_data'. This will be the combined profile and timeline .JSON files. Set it to
                       None if you don't want to limit the amount of data collected. Next, set limits (per
                       individual) on how many friends, followers, replied to users, and mentioned users to include
                       in the next hop. You can specify values [0, Inf) or None. Specifying None implies that you do
                       not wish to limit the collection, and will expand the graph on as many of these edges as
                       possible. Occasionally, you may get back fewer edges for a user than the limit you set. Note
                       that friends and followers will be saved in the fields 'friends_list' and 'followers_list'
                       automatically. The reply and mention users are saved in timelines. Missing fields are filled
                       in with defaults on the dictionary that is passed in.
                       EX. hop_limits = {'max_hops': 2,              # Maximum number of hops in graph
                                         'max_data': None,           # Maximum amount of data (in GB)
                                         'friends': 0,               # Maximum friends per user to include in next hop
                                         'followers': None,          # Maximum followers per user to include in next hop
                                         'in_reply_to_user_id': 17,  # Maximum 'in_reply_to_user_id' per user's timeline to include in next hop
                                         'user_mention_id': 21}      # Maximum 'user_mention_id' per user's timeline to include in next hop
    :return: None. All results are written to the directories in save_dir.
    """
    # Fresh dictionaries per call: a {} default would be shared (and mutated) across calls
    if save_dir is None:
        save_dir = {}
    if hop_limits is None:
        hop_limits = {}

    # CHECK PARAMETERS
    # Check save_dir dictionary fields, create directories if they do not already exist
    if ('twitter_profiles' not in save_dir) or (save_dir['twitter_profiles'].strip() == ''):
        save_dir['twitter_profiles'] = os.path.join(os.getcwd(), 'profiles')
        print("\tNo directory was specified for save_dir['twitter_profiles'] so it will be set to {}.".format(
            save_dir['twitter_profiles']))
    if not os.path.isdir(save_dir['twitter_profiles']):
        print("\tThe directory {} does not exist...creating it now".format(save_dir['twitter_profiles']))
        os.makedirs(save_dir['twitter_profiles'])  # makedirs also creates missing parent directories
    if ('twitter_timelines' not in save_dir) or (save_dir['twitter_timelines'].strip() == ''):
        save_dir['twitter_timelines'] = os.path.join(os.getcwd(), 'timelines')
        print("\tNo directory was specified for save_dir['twitter_timelines'] so it will be set to {}.".format(
            save_dir['twitter_timelines']))
    if not os.path.isdir(save_dir['twitter_timelines']):
        print("\tThe directory {} does not exist...creating it now".format(save_dir['twitter_timelines']))
        os.makedirs(save_dir['twitter_timelines'])

    # Check hop_limits dictionary, values are [default, description]
    hop_limits_defaults = {
        'max_hops': [6, 'the maximin number of hops in graph'],
        'max_data': [2, 'the maximin amount of data collected (in GB)'],
        'friends': [0, 'max friends per user to include in next hop'],
        'followers': [0, 'max followers per user to include in next hop'],
        'in_reply_to_user_id': [0, "max 'in_reply_to_user_id' per user's timeline to include in next hop"],
        'user_mention_id': [0, "max 'user_mention_id' per user's timeline to include in next ho"]
    }
    for kk in hop_limits_defaults.keys():
        if kk not in hop_limits:
            hop_limits[kk] = hop_limits_defaults[kk][0]
            print("\tNo Value was specified for hop_limits['{}'], {}, so it will be set to {}.".format(
                kk, hop_limits_defaults[kk][1], hop_limits_defaults[kk][0]))

    # Check data amount and quit if graph has reached limit
    if _bfs_data_limit_reached(save_dir, hop_limits):
        return

    # DETERMINE COLLECTION PARAMETERS
    # Load place_savers dictionary
    print("\nGetting information of current hop and finished users...")
    place_savers = load_place_savers(save_dir['twitter_profiles'])
    print("\tAs of now {} user profiles have been collected and saved to {}".format(
        len(place_savers['finished_users']), save_dir['twitter_profiles']))
    print("\tThe current hop is {}".format(place_savers['cur_hop']))
    if place_savers['cur_hop'] < 1:
        place_savers['cur_user_list'] = set(user_seed)
        save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)
    print("\tWe will collect {} users in hop {}".format(len(place_savers['cur_user_list']), place_savers['cur_hop']))
    # Load growth parameters
    growth_params = load_growth_params(save_dir['twitter_profiles'])
    # Create proxies dictionary
    proxies = {
        'http': 'http://%s:%s' % (host, port),
        'https': 'http://%s:%s' % (host, port)
    }
    # Load twitter keys
    twitter_keys = pyTweet.load_twitter_api_key_set()
    # API authorization
    auth = pyTweet.get_authorization(twitter_keys)

    # BUILD THE GRAPH
    print("\nStart building the graph!")
    for khop in range(place_savers['cur_hop'], hop_limits['max_hops']):
        print("\nGet information for the {}th-hop users. There are {} total users in this hop.".format(
            khop, len(place_savers['cur_user_list'])))
        print("Create the user list of the {}th-hop users as well.".format(khop + 1))

        # Get profile information of users in cur_user_list
        print("\nCOLLECT PROFILE INFORMATION FOR THE CURRENT SET OF USERS")
        if khop < 1:
            # Hop 0 holds screen names. Find profiles to collect: every seed without a saved profile
            profiles_to_collect = set(place_savers['cur_user_list'])
            for json_filename in place_savers['finished_users'].values():
                data = _bfs_load_profile(os.path.join(save_dir['twitter_profiles'], json_filename))
                if (data is not None) and ('screen_name' in data):
                    profiles_to_collect.discard(data['screen_name'])
            # Collect and save profiles
            user_info = []
            if len(profiles_to_collect) > 0:
                print("\nstart collecting profiles: {} profiles".format(len(profiles_to_collect)))
                user_info = pyTweet.user_lookup_usernames(user_list=list(profiles_to_collect), proxies=proxies,
                                                          auth=auth)
                if isinstance(user_info, dict) and ('errors' in user_info.keys()):
                    print("\nThe initial seed cannot be collected...")
                    print("Twitter error message:  {}".format(user_info))
                    # An error response holds no profiles: stop here instead of iterating over its keys
                    return
            # Save profile information
            for udata in user_info:
                json_filename = _save_profile_json(profile_struct=udata, save_dir=save_dir['twitter_profiles'],
                                                   khop=khop)
                place_savers['finished_users'][udata['id']] = json_filename
            # Convert screen names to user IDs in cur_user_list, identify unavailable accounts as well
            screen_name_ids = {}  # Keys are screen names and values are user IDs
            for jj in os.listdir(save_dir['twitter_profiles']):
                if not jj.startswith('userInfo_'):
                    continue
                full_filename = os.path.join(save_dir['twitter_profiles'], jj)
                if os.path.getsize(full_filename) == 0:
                    continue
                profile = _bfs_load_profile(full_filename)
                if (profile is not None) and ('screen_name' in profile):
                    screen_name_ids[profile['screen_name']] = int(profile['id'])
            # Get corresponding user IDs for each screen name in cur_user_list
            cur_user_list_ids = set([])
            for scn_name in profiles_to_collect.union(place_savers['cur_user_list']):
                if scn_name in screen_name_ids:
                    cur_user_list_ids.add(screen_name_ids[scn_name])
                else:
                    place_savers['unavailable_accounts'].add(scn_name)
            print(cur_user_list_ids)
            del profiles_to_collect
            place_savers['cur_user_list'] = set(cur_user_list_ids)
        else:
            # All later hops hold user IDs. Collect and save profiles that are not finished yet
            profiles_to_collect = set(place_savers['cur_user_list']).difference(
                set(map(int, place_savers['finished_users'].keys())))
            user_info = pyTweet.user_lookup_userids(user_list=list(profiles_to_collect), proxies=proxies, auth=auth)
            for udata in user_info:
                json_filename = _save_profile_json(profile_struct=udata, save_dir=save_dir['twitter_profiles'],
                                                   khop=khop)
                place_savers['finished_users'][udata['id']] = json_filename
            # Update current user list, and identify unavailable accounts
            new_cur_user_list = set([])
            for uid in profiles_to_collect.union(set(place_savers['cur_user_list'])):
                if uid in place_savers['unavailable_accounts']:
                    continue
                if uid in place_savers['finished_users']:
                    new_cur_user_list.add(uid)
                else:
                    place_savers['unavailable_accounts'].add(uid)
            place_savers['cur_user_list'] = set(new_cur_user_list)
            del new_cur_user_list

        # Save place saving variables
        growth_params['h{}_users.json'.format(khop)] = set(place_savers['cur_user_list'])
        save_growth_params(user_dir=save_dir['twitter_profiles'], growth_obj=growth_params, cur_hop=khop)
        save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)

        # Get timelines for each user in cur_user_list
        print("\nCOLLECT TIME LINES FOR CURRENT SET OF USERS")
        for uid in place_savers['cur_user_list']:
            found = _bfs_get_profile(uid, save_dir['twitter_profiles'], place_savers, khop, proxies, auth)
            if found is None:
                continue
            profile = found[1]
            if ('has_timeline' in profile) and (profile['has_timeline'] is True):
                continue
            # Derive the timeline name after the profile lookup, which may have re-saved the profile
            timeline_filename = _bfs_timeline_filename(uid, save_dir['twitter_timelines'], place_savers)
            if os.path.isfile(timeline_filename):
                continue
            print("Collect the timeline for user {}.".format(uid))
            tldata = pyTweet.collect_user_timeline(USER=uid, USER_type='user_id', start_date=timeline_start_date,
                                                   proxies=proxies, auth=auth)
            for tweet in tldata:
                tweet['DOC'] = datetime.datetime.utcnow().strftime("%m-%d-%Y %H:%M:%S %z")
                tweet['has_timeline'] = True
            # The timeline goes into its own file; writing it to the profile file would destroy the profile
            fast_save(filename=timeline_filename, obj=tldata)

        print("\nGet friends of each user in cur_user_list")
        if hop_limits['friends'] != 0:
            _bfs_collect_edges('friends', pyTweet.get_user_friends, khop, save_dir, hop_limits, place_savers,
                               growth_params, proxies, auth)

        print("\nGet followers of each user in the cur_user_list")
        if hop_limits['followers'] != 0:
            # Followers need the followers endpoint, not the friends one
            _bfs_collect_edges('followers', pyTweet.get_user_followers, khop, save_dir, hop_limits, place_savers,
                               growth_params, proxies, auth)

        # Pull out user mentions
        if ('user_mention_id' in hop_limits) and (hop_limits['user_mention_id'] != 0):
            print("\nCOLLECT USER MENTIONS OF CURRENT SET")
            _bfs_collect_timeline_entities('user_mention_id', 'user_mentions', khop, timeline_start_date, save_dir,
                                           hop_limits, place_savers, growth_params, proxies, auth)

        # Pull out replies
        if ('in_reply_to_user_id' in hop_limits) and (hop_limits['in_reply_to_user_id'] != 0):
            print("\nCOLLECT USERS CURRENT SET REPLIES TO")
            _bfs_collect_timeline_entities('in_reply_to_user_id', 'replies', khop, timeline_start_date, save_dir,
                                           hop_limits, place_savers, growth_params, proxies, auth)

        # Check data limit
        if _bfs_data_limit_reached(save_dir, hop_limits):
            return

        # Prepare for next iteration
        place_savers['cur_hop'] = khop + 1
        place_savers['cur_user_list'] = set(place_savers['next_user_list'])
        place_savers['next_user_list'] = set([])
        save_place_savers(user_dir=save_dir['twitter_profiles'], place_saver_obj=place_savers)