def main(topology):
    # the input to main is the path to the topology file
    # the output of this script is two json files saved inside the downloaded tweets directory:
    # one json file has all the active users, the other has all the inactive users from the topology
    # user activity is based on status count and availability of tweets (public vs private)
    #
    # this script can be stopped and restarted in the middle of a run without losing progress
    inactive_users = read_json('dnld_tweets/inactive_users.json')
    active_users = read_json('dnld_tweets/active_users.json')

    twpy_api = auth.get_access_creds()
    tweets_dir = './dnld_tweets/'

    # put every user (non-repeating) from the topology file into a set
    with open(topology, 'r') as inp_file:
        comm_set = set(user for community in inp_file for user in ast.literal_eval(community))

    # create directory for storing tweets
    if not os.path.exists(os.path.dirname(tweets_dir)):
        os.makedirs(os.path.dirname(tweets_dir), 0o755)

    # download tweets for every user in the set and separate active users
    # from inactive users based on status count and availability
    bar = pyprind.ProgPercent(len(comm_set), track_time=True, title='Downloading Tweets')
    while comm_set:
        user = comm_set.pop()
        bar.update(item_id=str(user) + '\t')

        if str(user) in inactive_users or str(user) in active_users:
            continue

        # skip user if they don't exist or are inactive
        status_count = user_status_count(user, twpy_api)
        if status_count <= 10:
            inactive_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        # skip user if their tweets have already been downloaded
        if os.path.exists(os.path.join(tweets_dir, str(user))):
            active_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        tweets = get_tweets(user, twpy_api)

        if tweets:
            tweet_filename = tweets_dir + str(user)
            write_tweets(tweets, tweet_filename)
            active_users[str(user)] = status_count
        else:
            inactive_users[str(user)] = 0

        write_json(tweets_dir, active_users, inactive_users)
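# NOTE: the read_json/write_json helpers used above are not shown in this file. The following
# is a minimal sketch under the assumption that read_json returns an empty dict for a missing
# file and that write_json takes the tweets directory plus both user dicts -- this checkpoint
# behaviour is what allows the script to be stopped and restarted without losing progress.
import json
import os

def read_json(filepath):
    # return the saved dict, or an empty dict if the checkpoint file does not exist yet
    if not os.path.exists(filepath):
        return {}
    with open(filepath, 'r') as inp:
        return json.load(inp)

def write_json(tweets_dir, active_users, inactive_users):
    # persist both user dicts so progress survives an interrupted run
    with open(os.path.join(tweets_dir, 'active_users.json'), 'w') as out:
        json.dump(active_users, out)
    with open(os.path.join(tweets_dir, 'inactive_users.json'), 'w') as out:
        json.dump(inactive_users, out)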
def main(topology):
    inactive_users = read_json('dnld_tweets/inactive_users.json')
    active_users = read_json('dnld_tweets/active_users.json')

    _, app_auths = auth.get_access_creds()
    tweets_dir = './dnld_tweets/'

    with open(topology, 'r') as inp_file:
        comm_set = set(user for community in inp_file for user in ast.literal_eval(community))

    if not os.path.exists(os.path.dirname(tweets_dir)):
        os.makedirs(os.path.dirname(tweets_dir), 0o755)

    bar = pyprind.ProgPercent(len(comm_set), track_time=True, title='Downloading Tweets')
    while comm_set:
        user = comm_set.pop()
        bar.update(item_id=user)

        if str(user) in inactive_users:
            continue

        api = auth.manage_auth_handlers(app_auths)

        # skip user if they don't exist or are inactive
        status_count = user_status_count(user, api)
        if status_count <= 10:
            inactive_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        # skip user if you've already downloaded their tweets
        if os.path.exists(os.path.join(tweets_dir, str(user))):
            active_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        tweets = get_tweets(user, api)

        if tweets:
            tweet_filename = tweets_dir + str(user)
            write_tweets(tweets, tweet_filename)
            active_users[str(user)] = status_count
        else:
            inactive_users[str(user)] = 0

        write_json(tweets_dir, active_users, inactive_users)
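# NOTE: auth.manage_auth_handlers is defined in the auth module, not here. A plausible sketch,
# assuming app_auths is a list of pre-built tweepy.API objects (one per set of app-only
# credentials): hand out the next handler on every call so requests are spread across the
# separate rate limits of each credential set.
import itertools

_auth_cycle = None

def manage_auth_handlers(app_auths):
    # lazily build an infinite round-robin iterator over the available API handlers
    global _auth_cycle
    if _auth_cycle is None:
        _auth_cycle = itertools.cycle(app_auths)
    return next(_auth_cycle)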
def main():
    search_dir = 'twitter_geo_searches/'
    if not os.path.exists(os.path.dirname(search_dir)):
        os.makedirs(os.path.dirname(search_dir), 0o755)

    twpy_api = auth.get_access_creds()
    pool = multiprocessing.Pool(max(1, multiprocessing.cpu_count() - 1))

    # set up the command line arguments
    parser = argparse.ArgumentParser(
        description='Get twitter user ids and their follower ids using Tweepy and save in different formats')
    subparsers = parser.add_subparsers(dest='mode')

    search_parser = subparsers.add_parser(
        'search',
        help='Gather Twitter user ids and followers by city, state and radius')
    search_parser.add_argument('-c', '--city', required=True, action='store', dest='city',
                               help='City to search for Twitter user ids. REQUIRED')
    search_parser.add_argument('-s', '--state', required=True, action='store', dest='state',
                               help='State to search for Twitter user ids. REQUIRED')
    search_parser.add_argument('-r', '--radius', required=True, action='store', dest='radius',
                               help='Radius to search Twitter API for user ids (miles or kilometers -- ex: 50mi or 50km). REQUIRED')
    search_parser.add_argument('-f', '--filename', required=True, action='store', dest='filename',
                               help='Name of output file for networkx graph data. REQUIRED')

    netx_parser = subparsers.add_parser(
        'netx',
        help='Perform operations on an already generated networkx graph')
    netx_parser.add_argument('-q', '--clique', action='store_true',
                             help='Find cliques with networkx')
    netx_parser.add_argument('-x', '--clq_filename', action='store',
                             help='Provide a filename for the serialized output of find_cliques')
    netx_parser.add_argument('-g', '--graph_filename', required=True, action='store', dest='graph_filename',
                             help='Networkx input data filename. REQUIRED')
    netx_parser.add_argument('-o', '--out_filename', required=True, action='store', dest='out_filename',
                             help='Networkx output data filename. REQUIRED')
    netx_parser.add_argument('-k', '--comm', action='store_true',
                             help='Find communities with networkx')
    netx_parser.add_argument('-p', '--print_graph', action='store_true',
                             help='Print networkx graph')

    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    if not args.mode:
        print('ERROR: No arguments provided. Use -h or --help for help')
        return

    if args.mode == 'search':
        city = args.city
        state = args.state
        search_radius = args.radius
        search_filename = args.filename + '.json'

        # gets the first 50 zip codes by city and state
        zip_search = SearchEngine()
        zipcodes = zip_search.by_city_and_state(city, state, returns=50)

        user_ids = []
        user_followers = []

        # gets the user ids at each geo-location for the retrieved zip codes
        bar = pyprind.ProgPercent(len(zipcodes), track_time=True, title='Finding user ids')
        for zipcode in zipcodes:
            bar.update(item_id=str(zipcode.zipcode) + '\t')
            latitude = zipcode.lat
            longitude = zipcode.lng
            user_ids.extend(get_user_ids(twpy_api, latitude, longitude, search_radius))

        # gets the followers of all the retrieved user ids, n depths deep
        n = 2
        for i in range(0, n):
            user_ids, user_followers = get_user_followers(twpy_api, set(user_ids))

        filename = os.path.join(search_dir, search_filename)
        save_user_follower_networkx_graph(user_followers, filename)

    if args.mode == 'netx':
        graph_filename = os.path.join(search_dir, args.graph_filename + '.json')
        output_filename = os.path.join(search_dir, args.out_filename + '.json')
        graph = open_nx_graph(graph_filename)
        cliques = []

        if args.clique:
            for clique in pool.map(gather_cliques, nx.find_cliques(graph)):
                cliques.append([int(member) for member in clique])

            with open(output_filename, 'w') as output:
                for clique in cliques:
                    output.write('%s,\n' % (clique))
        elif args.comm:
            if args.clq_filename:
                clique_filename = os.path.join(search_dir, args.clq_filename + '.json')
                # load the clique topology file
                with open(clique_filename, 'r') as find_cliques_file:
                    cliques = [clique for cliques in find_cliques_file for clique in ast.literal_eval(cliques)]

            with open(output_filename, "w") as output:
                for node in pool.map(gather_cliques, community.girvan_newman(graph)):
                    print(node)
                    #output.write(str([int(item) for item in node]) + ', \n')
        elif args.print_graph:
            nx.draw(graph)
            plt.show()

    print("Job complete")
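# NOTE: get_user_ids is a helper defined elsewhere. A minimal sketch of one way to implement it
# with the older Tweepy v3 API.search endpoint (geocode takes "lat,long,radius"); the wildcard
# query string, the 100-tweet page size, and the 1000-item cap are illustrative assumptions.
import tweepy

def get_user_ids(twpy_api, latitude, longitude, radius):
    # collect the ids of users who recently tweeted near the given coordinates
    user_ids = []
    geocode = '%s,%s,%s' % (latitude, longitude, radius)
    try:
        for tweet in tweepy.Cursor(twpy_api.search, q='*', geocode=geocode, count=100).items(1000):
            user_ids.append(tweet.user.id)
    except tweepy.TweepError as err:
        print('Tweepy error while searching %s: %s' % (geocode, err))
    return list(set(user_ids))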
def main():
    # set up the command line arguments
    parser = argparse.ArgumentParser(
        description='Get twitter user ids and their follower ids using Tweepy and save in different formats')
    subparsers = parser.add_subparsers(dest='mode')

    search_parser = subparsers.add_parser(
        'search',
        help='Gather Twitter user ids by city, state and radius')
    search_parser.add_argument('-c', '--city', required=True, action='store', dest='city',
                               help='City to search for Twitter user ids. REQUIRED')
    search_parser.add_argument('-s', '--state', required=True, action='store', dest='state',
                               help='State to search for Twitter user ids. REQUIRED')
    search_parser.add_argument('-r', '--radius', required=True, action='store', dest='radius',
                               help='Radius to search Twitter API for user ids (miles or kilometers -- ex: 50mi or 50km). REQUIRED')
    search_parser.add_argument('-d', '--depth', required=True, action='store', dest='depth',
                               help='How far to traverse into user follower relationships when gathering users. REQUIRED')
    search_parser.add_argument('-f', '--filename', required=True, action='store', dest='filename',
                               help='Name of output file to store gathered users in. REQUIRED')
    search_parser.add_argument('-z', '--creds', required=True, action='store', dest='creds',
                               help='Path to Twitter developer access credentials. REQUIRED')

    continue_parser = subparsers.add_parser(
        'getfws',
        help='Takes an already gathered jsonified list of users and retrieves their followers')
    continue_parser.add_argument('-f', '--filename', action='store',
                                 help='Filename of the previously saved Twitter user ids in .json format')
    continue_parser.add_argument('-d', '--depth', required=True, action='store', dest='depth',
                                 help='How far to traverse into user follower relationships when searching for followers. REQUIRED')
    continue_parser.add_argument('-z', '--creds', required=True, action='store', dest='creds',
                                 help='Path to Twitter developer access credentials. REQUIRED')

    convert_parser = subparsers.add_parser(
        'convert',
        help='Convert a user followers dict to a users list and save the file. This is the file format used when continuing the get followers function and in get_community_tweets.py')
    convert_parser.add_argument('-i', '--input_file', action='store',
                                help='Filename of the previously saved followers dictionary')
    convert_parser.add_argument('-o', '--out_file', action='store',
                                help='Filename to store the output. Just the filename, no path is needed. The output file will be saved in the folder of the input file')

    netx_parser = subparsers.add_parser(
        'netx',
        help='Create cliques or communities from user follower data')
    group = netx_parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-q', '--gen_cliques', required=False, action='store_true', dest='gen_cliques',
                       help='Generate cliques from user followers dictionary')
    group.add_argument('-c', '--gen_comms', required=False, action='store_true', dest='gen_comms',
                       help='Generate communities from user followers dictionary')
    netx_parser.add_argument('-n', '--min_size', action='store', dest='min_size',
                             nargs='?', type=int, const=1, default=4,
                             help='Constraint for min size of clique or community (default is 4)')
    netx_parser.add_argument('-i', '--in_filename', required=True, action='store', dest='in_filename',
                             help='User followers dictionary file. REQUIRED')
    netx_parser.add_argument('-o', '--out_filename', required=True, action='store', dest='out_filename',
                             help='Output topology filename. REQUIRED')

    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    if args.mode == 'convert':
        working_dir = get_directory_of_file(args.input_file)
        convert_followers_to_users(args.input_file, args.out_file, working_dir)

    if args.mode == 'getfws':
        twpy_api = auth.get_access_creds(args.creds)
        if not twpy_api:
            print('Error: Twitter developer access credentials denied')
            return

        working_dir = get_directory_of_file(args.filename)
        user_ids = read_json(args.filename)
        if not user_ids:
            print('Error: No users found in provided file')
            return

        # gets the followers of all the retrieved user ids 'depth' number of times
        collect_user_followers(args.depth, twpy_api, working_dir, args.filename, user_ids)

    if args.mode == 'search':
        twpy_api = auth.get_access_creds(args.creds)
        if not twpy_api:
            print('Error: Twitter developer access credentials denied')
            return

        working_dir = get_directory_of_file(args.filename)

        # gets the first 50 zip codes by city and state
        zip_search = SearchEngine()
        zipcodes = zip_search.by_city_and_state(args.city, args.state, returns=50)

        user_ids = []
        user_followers = []

        # gets the user ids at each geo-location for the retrieved zip codes
        bar = pyprind.ProgPercent(len(zipcodes), track_time=True, title='Finding user ids')
        for zipcode in zipcodes:
            bar.update(item_id='zip code:' + str(zipcode.zipcode) + '\t')
            user_ids.extend(get_user_ids(twpy_api, zipcode.lat, zipcode.lng, args.radius))

        write_json(args.filename, list(set(user_ids)))

    if args.mode == 'netx':
        user_followers = read_json(args.in_filename)
        pythonify_dict(user_followers)
        print("Number of followers: " + str(len(user_followers)))
        output_filename = args.out_filename + '.json'
        graph = build_netx_graph(user_followers)

        if args.gen_cliques:
            generate_cliques(graph, output_filename, args.min_size)
        if args.gen_comms:
            generate_communities(graph, output_filename, args.min_size)
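# NOTE: build_netx_graph and generate_cliques are defined elsewhere. A minimal sketch, assuming
# user_followers maps each user id to a list of follower ids and the output topology file
# stores one clique (a json list of ids) per line:
import json
import networkx as nx

def build_netx_graph(user_followers):
    # undirected graph with an edge between each user and each of their followers
    graph = nx.Graph()
    for user, followers in user_followers.items():
        graph.add_edges_from((user, follower) for follower in followers)
    return graph

def generate_cliques(graph, output_filename, min_size):
    # write every maximal clique that meets the size constraint, one per line
    with open(output_filename, 'w') as output:
        for clique in nx.find_cliques(graph):
            if len(clique) >= min_size:
                output.write(json.dumps(clique) + '\n')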
def main():
    # the output of this script is two json files saved inside the downloaded tweets directory:
    # one json file has all the active users, the other has all the inactive users from the topology
    # user activity is based on status count and availability of tweets (public vs private)
    # the script can be stopped and restarted in the middle of a run without losing progress
    parser = argparse.ArgumentParser(
        description='Get tweets of all twitter user ids in the provided topology file')
    parser.add_argument('-f', '--users_file', required=True, action='store', dest='users_file',
                        help='Location of file with user ids')
    parser.add_argument('-c', '--dev_creds', required=True, action='store', dest='dev_creds',
                        help='Location of file containing Twitter developer access credentials')
    parser.add_argument('-o', '--output_dir', required=True, action='store', dest='output_dir',
                        help='Name of the directory you want to download Tweets to')
    parser.add_argument('-n', '--num_tweets', action='store', dest='num_tweets',
                        nargs='?', type=int, const=1, default=3200,
                        help='Number of tweets to download from each user (default is 3200)')

    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    tweets_dir = args.output_dir

    # create directory for storing tweets
    if not os.path.exists(os.path.dirname(tweets_dir)):
        os.makedirs(os.path.dirname(tweets_dir), 0o755)

    inactive_users = read_json(os.path.join(tweets_dir, 'inactive_users.json'))
    active_users = read_json(os.path.join(tweets_dir, 'active_users.json'))

    twpy_api = auth.get_access_creds(args.dev_creds)
    if not twpy_api:
        print('Error: Twitter developer access credentials denied')
        return

    # open the list of user ids; this file should already be a non-repeating set
    comm_set = set(read_json(args.users_file))

    # download tweets for every user in the set and separate active users
    # from inactive users based on status count and availability
    bar = pyprind.ProgPercent(len(comm_set), track_time=True, title='Downloading Tweets')
    while comm_set:
        user = comm_set.pop()
        bar.update(item_id=str(user) + '\t')

        if str(user) in inactive_users or str(user) in active_users:
            continue

        # skip user if they don't exist or are inactive
        status_count = user_status_count(user, twpy_api)
        if status_count <= 10:
            inactive_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        # skip user if their tweets have already been downloaded
        if os.path.exists(os.path.join(tweets_dir, str(user))):
            active_users[str(user)] = status_count
            write_json(tweets_dir, active_users, inactive_users)
            continue

        tweets = get_tweets(user, twpy_api, args.num_tweets)

        if tweets:
            tweet_filename = os.path.join(tweets_dir, str(user))
            write_tweets(tweets, tweet_filename)
            active_users[str(user)] = status_count
        else:
            inactive_users[str(user)] = 0

        write_json(tweets_dir, active_users, inactive_users)
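# NOTE: user_status_count and get_tweets are helpers defined elsewhere. A rough sketch of how
# they might look with the Tweepy v3 API; treating a protected, suspended, or deleted account
# as having zero statuses matches how the loop above classifies users as inactive.
import tweepy

def user_status_count(user, twpy_api):
    # number of statuses the account reports, or 0 if the account is unavailable
    try:
        return twpy_api.get_user(user_id=user).statuses_count
    except tweepy.TweepError:
        return 0

def get_tweets(user, twpy_api, num_tweets=3200):
    # download up to num_tweets of the user's most recent tweets as raw json dicts
    try:
        return [status._json for status in
                tweepy.Cursor(twpy_api.user_timeline, user_id=user, count=200).items(num_tweets)]
    except tweepy.TweepError:
        return []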