import datetime as dt
import multiprocessing
import random
import sys
from statistics import mean, stdev

import networkx as nx
import tweepy


def stream_user_tweets():
    users = compile_users_n_others()
    api = authenticate_twitter()
    stream_listener = StreamListener(num_to_grab=-1, pickle=False)
    stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
    for i, representative in enumerate(users):
        print(f"Streaming from user group {i} (representative user {representative})")
        user_set = users[representative]
        user_set.add(representative)

        def get_tweets():
            # stream.filter() expects a list of user id strings for follow=.
            stream.filter(follow=[str(u) for u in user_set], stall_warnings=True)

        # Run the stream in a separate process so it can be cut off after
        # 150 seconds; terminate() is a no-op if it already exited.
        p = multiprocessing.Process(target=get_tweets)
        p.start()
        p.join(150)
        p.terminate()
        p.join()

        # Checkpoint every 50 user groups so a crash loses at most one batch.
        if (i + 1) % 50 == 0:
            if not json_it(TWEET_DICT, TWEETS_FNAME):
                sys.stderr.write("ERROR: Failed to write tweet JSON checkpoint, aborting!\n")
                sys.exit(FILE_NOT_FOUND_EXIT_CODE)
            if not json_it(USER_DICT, USER_DICT_FNAME):
                sys.stderr.write("ERROR: Failed to write user JSON checkpoint, aborting!\n")
                sys.exit(FILE_NOT_FOUND_EXIT_CODE)
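# json_it() is used throughout this module but defined elsewhere in the repo;
# below is a minimal sketch inferred from the call sites (object, filename,
# optional transform, boolean success return). Treat the signature and error
# handling as assumptions, not the repo's actual helper.
import json


def json_it(obj, fname, transform=None):
    """Serialize obj (optionally transformed first) to fname as JSON."""
    try:
        with open(fname, "w") as f:
            json.dump(transform(obj) if transform else obj, f)
        return True
    except (OSError, TypeError):
        return False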
def main():
    graph = build_graph(pickle=False, from_scratch=True)
    small_graph = trim_graph(graph, pickle=False, from_scratch=True)
    small_graph.name = "Twitter User Graph"
    print(f"full graph: {len(graph)} nodes")
    print(f"trim graph: {len(small_graph)} nodes")
    print("Generating JSON")
    json_it(small_graph, USER_GRAPH_FNAME, nx.node_link_data)
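# main() serializes the trimmed graph with nx.node_link_data; for reference,
# this illustrative demo (names made up) shows the round trip USER_GRAPH_FNAME
# goes through: node_link_data -> JSON-safe dict -> node_link_graph.
def _node_link_roundtrip_demo():
    g = nx.DiGraph(name="demo")
    g.add_edge("alice", "bob")
    data = nx.node_link_data(g)    # plain dict, safe for json.dump()
    g2 = nx.node_link_graph(data)  # directedness is recorded in the dict
    assert set(g2.edges) == {("alice", "bob")}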
def trim_graph(graph, reduce_sample=True, pickle=True, from_scratch=True):
    if not graph and not from_scratch:
        graph = reload_json(USER_GRAPH_FNAME, transform=nx.node_link_graph)
        return graph
    # Restore the saved RNG state so repeated trims sample reproducibly.
    rng_state = reload_object(RNG_FNAME, random.getstate)
    random.setstate(rng_state)
    print("Trimming graph...")
    # Collect users whose in- or out-degree lies more than STDEV_MOD
    # standard deviations from the mean degree for that direction.
    significant_id_set = set()
    for direct in (Direct.IN, Direct.OUT):
        sample = []
        ids = []
        for user_id in graph:
            ids.append(user_id)
            num_neighb = direct.deg_view(graph)[user_id]
            sample.append(num_neighb)
        sample_mean = mean(sample)
        samp_stdev = stdev(sample)  # statistics.stdev is the sample stdev
        for i, degree in enumerate(sample):
            if abs(degree - sample_mean) > STDEV_MOD * samp_stdev:
                significant_id_set.add((ids[i], degree))
    by_asc_degree = sorted(significant_id_set, key=lambda x: x[1])
    significant_ids = [i[0] for i in by_asc_degree]
    # Build the node set for the subgraph: each significant user plus a
    # random sample of its neighbors.
    to_subgraph = set()
    for user_id in significant_ids:
        try:
            others = set(graph.neighbors(user_id))
        except nx.NetworkXError:
            continue
        if reduce_sample and len(others) != 0:
            # random.sample() needs a sequence, not a set, on Python >= 3.11;
            # sorting first keeps the draw deterministic for a given RNG state.
            others = random.sample(sorted(others), int(len(others) * OTHERS_MOD))
        if len(others) == 0:
            continue
        to_subgraph.add(user_id)
        to_subgraph.update(others)
    # Re-save the pre-sampling RNG state so the next run replays the same draw.
    pickle_it(rng_state, RNG_FNAME)
    user_graph = graph.subgraph(to_subgraph)
    if pickle:
        json_it(user_graph, USER_GRAPH_FNAME, nx.node_link_data)
    return user_graph
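# trim_graph() relies on a Direct enum with a deg_view() accessor; its real
# definition lives elsewhere in the repo. A minimal sketch, assuming it just
# maps each direction onto the matching networkx degree view:
import enum


class Direct(enum.Enum):
    IN = "in"
    OUT = "out"

    def deg_view(self, graph):
        # graph.in_degree / graph.out_degree are views indexable by node id.
        return graph.in_degree if self is Direct.IN else graph.out_degree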
def construct_graph_data():
    global GRAPH_DATA
    GRAPH_DATA = reload_json("graph_data", lambda: None)
    if GRAPH_DATA:
        return
    GRAPH_DATA = {}
    GRAPH_DATA["raw_tweets"] = run_tweets()
    for _name, d_source in DataSource.__members__.items():
        sizes, colors = run_data(d_source)
        GRAPH_DATA[str(d_source)] = (sizes, colors)
    x_range, y_range = get_square_bounds()
    GRAPH_DATA["range"] = (x_range, y_range)
    json_it(GRAPH_DATA, "graph_data")
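# reload_json() is the read-side counterpart to json_it(); it is also defined
# elsewhere, so this is a hedged sketch inferred from the two call sites above
# (a positional fallback factory and an optional transform on the loaded data).
def reload_json(fname, fallback=lambda: None, transform=None):
    try:
        with open(fname) as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError):
        return fallback()
    return transform(data) if transform else data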
def expand_user_list(user_id, api_obj, count_key):
    """
    Given a user, a Twitter API object, and a dictionary key, scrape Twitter
    for that user's friends or followers (depending on the key) and add the
    results back into the user dictionary.
    """
    now = dt.datetime.now()
    print(f"({now.strftime('%a, %b %d %I:%M %p')}) (id={user_id}) Expanding {count_key}")
    user_cursor = tweepy.Cursor(api_obj, user_id=user_id)
    try:
        pages = list(user_cursor.pages())
    except tweepy.error.TweepError:
        print("This user has protected tweets, skipping")
        return
    for user_id_page in pages:
        USER_DICT[user_id][count_key] += list(user_id_page)
    if not json_it(USER_DICT, USER_DICT_FNAME):
        sys.stderr.write(f"ERROR: failed to write JSON after processing user {user_id}\n")
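# main() below calls expand_neighbors(api), whose body is not shown in this
# section. Here is a hedged sketch of how it plausibly drives
# expand_user_list() with the old tweepy (<4.0) cursored id endpoints; the
# "followers"/"friends" keys match the fields initialized in on_status().
def expand_neighbors(api):
    for user_id in list(USER_DICT):
        expand_user_list(user_id, api.followers_ids, "followers")
        expand_user_list(user_id, api.friends_ids, "friends")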
def on_status(self, tweet):
    user_id = tweet.user.id_str
    tweet_json = tweet._json
    if user_id not in TWEET_DICT:
        self.new_users += 1
    init_len = 0
    try:
        init_len = len(TWEET_DICT[user_id])
        TWEET_DICT[user_id].append(tweet_json)
    except KeyError:
        TWEET_DICT[user_id] = [tweet_json]
    if user_id not in USER_DICT:
        USER_DICT[user_id] = tweet.user._json
        USER_DICT[user_id]["followers"] = []
        USER_DICT[user_id]["friends"] = []
    # Update the new-tweet count only if this status actually got stored.
    after_len = len(TWEET_DICT[user_id])
    if init_len < after_len:
        self.new_tweets += 1
    # Once the grab quota is hit, persist both dicts and stop the stream by
    # returning False (num_to_grab <= 0 means "no limit").
    if self.num_to_grab > 0 and self.new_tweets >= self.num_to_grab:
        if self.pickle and not json_it(TWEET_DICT, TWEETS_FNAME):
            sys.stderr.write("ERROR: Failed final JSON dump, aborting!\n")
            sys.exit(FILE_NOT_FOUND_EXIT_CODE)
        if self.pickle and not json_it(USER_DICT, USER_DICT_FNAME):
            sys.stderr.write("ERROR: Failed final JSON dump, aborting!\n")
            sys.exit(FILE_NOT_FOUND_EXIT_CODE)
        self.reset_state()
        return False
    if self.new_tweets % 100 == 0:
        print(f"currently scraped {self.new_tweets} new tweets")
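# on_status() above is a method of the StreamListener used by both mains; the
# class itself falls outside this section, so this skeleton is an inference
# from usage (constructor kwargs, counters, reset_state()). The defaults and
# the on_error() handler are assumptions.
class StreamListener(tweepy.StreamListener):
    def __init__(self, num_to_grab=-1, pickle=True):
        super().__init__()
        self.num_to_grab = num_to_grab  # -1: stream until killed externally
        self.pickle = pickle            # persist dicts when the quota is hit
        self.reset_state()

    def reset_state(self):
        self.new_users = 0
        self.new_tweets = 0

    def on_error(self, status_code):
        # Disconnect on rate limiting (HTTP 420) rather than hammering the API.
        return status_code != 420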
def main():
    global GRAB_NEW
    api = authenticate_twitter()
    if GRAB_NEW:
        # Seed the corpus with a fresh keyword-filtered stream.
        stream_listener = StreamListener()
        stream = tweepy.Stream(auth=api.auth, listener=stream_listener)
        stream.filter(track=KEYWORDS, stall_warnings=True)
    expand_neighbors(api)
    stream_user_tweets()


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("Received SIGINT, dumping objects to JSON and exiting")
        json_it(TWEET_DICT, TWEETS_FNAME)
        json_it(USER_DICT, USER_DICT_FNAME)
        sys.exit(1)