def perform(self):
    self.mgr.load_model_state()

    print("----------------")
    print("FETCHING TEXTS...")
    print("SCORING TEXTS IN BATCHES...")

    batch = []
    counter = 0
    for row in self.fetch_texts():
        batch.append(row)

        if len(batch) >= self.batch_size:
            counter += len(batch)
            print(" ", generate_timestamp(), "|", fmt_n(counter))
            self.process_batch(batch)
            batch = []

    # process final (potentially incomplete) batch
    if any(batch):
        counter += len(batch)
        print(" ", generate_timestamp(), "|", fmt_n(counter))
        self.process_batch(batch)
        batch = []
def perform(self):
    self.write_metadata_to_file()
    self.upload_metadata()

    self.start()
    self.graph = DiGraph()
    self.running_results = []

    users = list(self.bq_service.fetch_random_users(limit=self.users_limit, topic=self.topic,
                                                    start_at=self.convo_start_at, end_at=self.convo_end_at))
    print("FETCHED", len(users), "USERS")
    screen_names = sorted([row["user_screen_name"] for row in users])

    for row in self.bq_service.fetch_specific_user_friends(screen_names=screen_names):
        self.counter += 1

        if not self.dry_run:
            self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

        if self.counter % self.batch_size == 0:
            rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
            self.running_results.append(rr)

    self.end()
    self.report()

    self.write_results_to_file()
    self.upload_results()

    self.write_graph_to_file()
    self.upload_graph()
def perform(self):
    self.start()
    self.write_metadata_to_file()
    self.upload_metadata()

    print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
    self.graph = DiGraph()
    self.running_results = []

    self.cursor.execute(self.sql)
    while True:
        batch = self.cursor.fetchmany(size=self.batch_size)
        if not batch:
            break

        self.counter += len(batch)

        if not self.dry_run:
            for row in batch:
                self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

        rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
        print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
        self.running_results.append(rr)

    self.cursor.close()
    self.connection.close()
    print(logstamp(), "GRAPH CONSTRUCTED!")

    self.report()
    self.write_results_to_file()
    self.upload_results()
    self.write_graph_to_file()
    self.upload_graph()
    self.end()
def running_results(self):
    rr = {
        "ts": logstamp(),
        "counter": self.counter,
        "nodes": self.graph.number_of_nodes(),
        "edges": self.graph.number_of_edges()
    }
    print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
    return rr
def similarity_graph_report(self):
    if not self.similarity_graph:
        self.similarity_graph = self.load_similarity_graph()

    print("-------------------")
    print("SIMILARITY GRAPH", type(self.similarity_graph))
    print(" NODES:", fmt_n(self.similarity_graph.number_of_nodes()))
    print(" EDGES:", fmt_n(self.similarity_graph.number_of_edges()))
    print("-------------------")
def report(self):
    if not self.graph:
        self.graph = self.load_graph()

    print("-------------------")
    print(type(self.graph))
    print(" NODES:", fmt_n(self.node_count))
    print(" EDGES:", fmt_n(self.edge_count))
    print("-------------------")
def report(self, graph):
    """Params: graph (DiGraph)"""
    print("-------------------")
    print(type(graph))
    print(" NODES:", fmt_n(graph.number_of_nodes()))
    print(" EDGES:", fmt_n(graph.number_of_edges()))
    print("-------------------")
def perform(self):
    """
    Given:
        bot_ids (list) a unique list of bot ids, which should all be included as nodes in the bot retweet graph.
            The retweet graph will also contain retweeted users, which is why we need a separate list.
            The bot ids will be used as nodes in the similarity graph.

        bot_retweet_graph (networkx.DiGraph) a retweet graph generated from the bot list

    Returns: a similarity graph (networkx.Graph), where the similarity is based on the Jaccard index.
        For each pair of bots we calculate the Jaccard index based on the sets of people they retweet.
        If two bots retweet exactly the same users, their Jaccard index is one.
        If they don't retweet anyone in common, their Jaccard index is zero.
    """
    self.retweet_graph_report()

    bot_ids = [row.user_id for row in self.bq_service.fetch_bot_ids(bot_min=self.bot_min)]
    print("FETCHED", fmt_n(len(bot_ids)), "BOT IDS")

    node_pairs = []
    for i, bot_id in enumerate(bot_ids):
        for other_bot_id in bot_ids[i + 1:]:
            if self.retweet_graph.has_node(other_bot_id) and self.retweet_graph.has_node(bot_id):
                node_pairs.append((bot_id, other_bot_id))
    # could maybe just take the combinations between all nodes in the bot graph,
    # because we can assume they were assembled using the same bot ids as the ones here,
    # but the point is to be methodologically sound and it doesn't take that long
    print("NODE PAIRS:", fmt_n(len(node_pairs)))

    results = jaccard_coefficient(self.retweet_graph.to_undirected(), node_pairs)
    #> returns an iterator of 3-tuples in the form (u, v, p),
    #> where (u, v) is a pair of nodes and p is their Jaccard coefficient
    print("JACCARD COEFFICIENTS BETWEEN EACH NODE PAIR - COMPLETE!")  #, fmt_n(len(list(results)))

    print("CONSTRUCTING SIMILARITY GRAPH...")
    self.similarity_graph = Graph()
    edge_count = 0
    #positive_results = [r for r in results if r[2] > 0]  # this takes a while, maybe let's just stick with the original iterator approach
    for bot_id, other_bot_id, similarity_score in results:
        if similarity_score > 0:
            self.similarity_graph.add_edge(bot_id, other_bot_id, weight=similarity_score)
            edge_count += 1

        self.counter += 1
        if self.counter % self.batch_size == 0:
            print(logstamp(), "|", fmt_n(self.counter), "|", fmt_n(edge_count), "EDGES")
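# Illustrative sketch (separate from the class above; graph and names are made up): how networkx's
# jaccard_coefficient behaves on a tiny undirected graph, matching the docstring's claim that
# identical retweet sets score 1.0 and disjoint sets score 0.0.
import networkx as nx

toy = nx.Graph()
toy.add_edges_from([
    ("bot_a", "celeb_1"), ("bot_a", "celeb_2"),  # bot_a retweets celeb_1 and celeb_2
    ("bot_b", "celeb_1"), ("bot_b", "celeb_2"),  # bot_b retweets the same two users
    ("bot_c", "celeb_3"),                        # bot_c retweets someone else entirely
])

for u, v, p in nx.jaccard_coefficient(toy, [("bot_a", "bot_b"), ("bot_a", "bot_c")]):
    print(u, v, p)
#> bot_a bot_b 1.0 (identical neighbor sets)
#> bot_a bot_c 0.0 (no neighbors in common)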
def __init__(self, model_name=MODEL_NAME, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None):
    # the model name is used in queries, so be super safe about SQL injection,
    # although it's not a concern right now
    self.model_name = model_name.lower().replace(";", "")
    self.limit = limit
    self.batch_size = batch_size
    self.bq_service = bq_service or BigQueryService()

    print("----------------")
    print("TOXICITY SCORER...")
    print(" MODEL:", self.model_name.upper())
    print(" LIMIT:", fmt_n(self.limit))
    print(" BATCH SIZE:", fmt_n(self.batch_size))
def compile_energy_graph(self):
    print("COMPILING ENERGY GRAPH...")
    self.energy_graph, self.bot_ids, self.user_data = compute_energy_graph(
        self.rt_graph, self.prior_probabilities, self.link_energies, self.out_degrees, self.in_degrees)
    #self.human_names = list(set(self.rt_graph.nodes()) - set(self.bot_ids))

    print("-----------------")
    print("ENERGY GRAPH:", type(self.energy_graph))
    print("NODE COUNT:", fmt_n(self.energy_graph.number_of_nodes()))
    print(f"BOT COUNT: {fmt_n(len(self.bot_ids))} ({fmt_pct(len(self.bot_ids) / self.energy_graph.number_of_nodes())})")
    print("USER DATA:", fmt_n(len(self.user_data.keys())))
def alpha(self):
    """Params for the link_energy function"""
    in_degrees_list = [v for _, v in self.in_degrees]
    out_degrees_list = [v for _, v in self.out_degrees]
    print("MAX IN:", fmt_n(max(in_degrees_list)))  #> 76,617
    print("MAX OUT:", fmt_n(max(out_degrees_list)))  #> 5,608

    alpha_in = np.quantile(in_degrees_list, self.alpha_percentile)
    alpha_out = np.quantile(out_degrees_list, self.alpha_percentile)
    print("ALPHA IN:", fmt_n(alpha_in))  #> 2,252
    print("ALPHA OUT:", fmt_n(alpha_out))  #> 1,339

    return [self.mu, alpha_out, alpha_in]
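# Illustrative sketch (hypothetical numbers, not project data): np.quantile picks the degree value
# below which alpha_percentile of the observations fall, so a high percentile yields a cutoff near
# the top of a long-tailed degree distribution rather than the single extreme max printed above.
import numpy as np

toy_degrees = [1] * 900 + [10] * 90 + [5000, 76617]  # made-up long-tailed distribution
print("MAX:", max(toy_degrees))                               #> 76617
print("99.9th PERCENTILE:", np.quantile(toy_degrees, 0.999))  # lands near the tail, well below the extreme max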
def memory_report(self):
    if not self.graph:
        self.graph = self.load_graph()

    #memory_load = memory_usage(self.read_graph_from_file, interval=.2, timeout=1)
    file_size = os.path.getsize(self.local_graph_filepath)  # in bytes

    print("-------------------")
    print(type(self.graph))
    print(" NODES:", fmt_n(self.node_count))
    print(" EDGES:", fmt_n(self.edge_count))
    print(" FILE SIZE:", fmt_n(file_size))
    print("-------------------")

    return {"nodes": self.node_count, "edges": self.edge_count, "file_size": file_size}
def perform(self):
    self.graph = DiGraph()
    self.running_results = []

    for row in self.bq_service.fetch_user_friends_in_batches():
        self.counter += 1

        if not self.dry_run:
            self.graph.add_edges_from([(row["screen_name"], friend) for friend in row["friend_names"]])

        if self.counter % self.batch_size == 0:
            rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
            self.running_results.append(rr)
def download_user_friends(self):
    self.start_at = time.perf_counter()
    self.batch = []
    self.counter = 0

    if self.pg_destructive and UserFriend.__table__.exists():
        print("DROPPING THE USER FRIENDS TABLE!")
        UserFriend.__table__.drop(self.pg_engine)
        self.pg_session.commit()

    if not UserFriend.__table__.exists():
        print("CREATING THE USER FRIENDS TABLE!")
        UserFriend.__table__.create(self.pg_engine)
        self.pg_session.commit()

    print(logstamp(), "DATA FLOWING...")
    for row in self.bq_service.fetch_user_friends_in_batches(limit=self.users_limit):
        self.batch.append({
            "user_id": row["user_id"],
            "screen_name": row["screen_name"],
            "friend_count": row["friend_count"],
            "friend_names": row["friend_names"]
        })
        self.counter += 1

        if len(self.batch) >= self.batch_size:
            print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
            self.pg_session.bulk_insert_mappings(UserFriend, self.batch)
            self.pg_session.commit()
            self.batch = []

    # save the final (potentially incomplete) batch
    if self.batch:
        print(logstamp(), fmt_n(self.counter), "SAVING FINAL BATCH...")
        self.pg_session.bulk_insert_mappings(UserFriend, self.batch)
        self.pg_session.commit()
        self.batch = []

    print("ETL COMPLETE!")
    self.end_at = time.perf_counter()
    self.pg_session.close()
def download_retweeter_details(self):
    self.start_job()
    self.destructively_migrate(RetweeterDetail)

    print(logstamp(), "DATA FLOWING...")
    for row in self.bq_service.fetch_retweeter_details_in_batches(limit=self.users_limit):
        item = {
            "user_id": row["user_id"],
            "verified": row["verified"],
            "created_at": row["created_at"],
            "screen_name_count": row["screen_name_count"],
            "name_count": row["name_count"],
            "retweet_count": row["retweet_count"],
        }
        self.batch.append(item)
        self.counter += 1

        # temporarily testing individual inserts...
        #record = RetweeterDetail(**item)
        #self.pg_session.add(record)
        #self.pg_session.commit()

        if len(self.batch) >= self.batch_size:
            print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
            self.pg_session.bulk_insert_mappings(RetweeterDetail, self.batch)
            self.pg_session.commit()
            self.batch = []

    # save the final (potentially incomplete) batch
    if self.batch:
        print(logstamp(), fmt_n(self.counter), "SAVING FINAL BATCH...")
        self.pg_session.bulk_insert_mappings(RetweeterDetail, self.batch)
        self.pg_session.commit()
        self.batch = []

    self.end_job()
def bot_probabilities_df(self):
    df = DataFrame(list(self.bot_probabilities.items()), columns=["user_id", "bot_probability"])
    df.index.name = "row_id"
    df.index = df.index + 1

    print("--------------------------")
    print("CLASSIFICATION COMPLETE!")
    print(df.head())
    print("... < 50% (NOT BOTS):", fmt_n(len(df[df["bot_probability"] < 0.5])))
    print("... = 50% (NOT BOTS):", fmt_n(len(df[df["bot_probability"] == 0.5])))
    print("... > 50% (MAYBE BOTS):", fmt_n(len(df[df["bot_probability"] > 0.5])))
    print("... > 90% (LIKELY BOTS):", fmt_n(len(df[df["bot_probability"] > 0.9])))
    return df
def perform(self):
    self.start()
    self.write_metadata_to_file()
    self.upload_metadata()

    self.edges = []
    self.running_results = []
    for row in self.bq_service.fetch_user_friends_in_batches(limit=self.users_limit):
        self.counter += 1

        if not self.dry_run:
            self.edges += [(row["screen_name"], friend) for friend in row["friend_names"]]

        if self.counter % self.batch_size == 0:
            rr = {"ts": logstamp(), "counter": self.counter, "edges": len(self.edges)}
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["edges"]))
            self.running_results.append(rr)

    self.write_results_to_file()
    self.upload_results()

    self.write_edges_to_file()
    self.upload_edges()

    print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
    self.graph = DiGraph(self.edges)
    print(logstamp(), "GRAPH CONSTRUCTED!")
    self.report()

    del self.running_results  # remove in hopes of freeing up some memory
    del self.edges  # remove in hopes of freeing up some memory

    self.write_graph_to_file()
    #del self.graph  # remove in hopes of freeing up some memory
    self.upload_graph()
    self.end()
def download_retweeter_details(self):
    self.start_at = time.perf_counter()
    self.batch = []
    self.counter = 0

    if self.pg_destructive and RetweeterDetail.__table__.exists():
        print("DROPPING THE RETWEETER DETAILS TABLE!")
        RetweeterDetail.__table__.drop(self.pg_engine)
        self.pg_session.commit()

    if not RetweeterDetail.__table__.exists():
        print("CREATING THE RETWEETER DETAILS TABLE!")
        RetweeterDetail.__table__.create(self.pg_engine)
        self.pg_session.commit()

    print(logstamp(), "DATA FLOWING LIKE WATER...")
    for row in self.bq_service.fetch_retweeter_details_in_batches(limit=self.users_limit):
        item = {
            "user_id": row["user_id"],
            "verified": row["verified"],
            "created_at": row["created_at"],
            "screen_name_count": row["screen_name_count"],
            "name_count": row["name_count"],
            "retweet_count": row["retweet_count"],

            # todo: these topics are specific to the impeachment dataset, so will need to
            # generalize if/when working with another topic (leave for future concern)
            # "ig_report": row["ig_report"],
            # "ig_hearing": row["ig_hearing"],
            # "senate_hearing": row["senate_hearing"],
            # "not_above_the_law": row["not_above_the_law"],
            # "impeach_and_convict": row["impeach_and_convict"],
            # "impeach_and_remove": row["impeach_and_remove"],
            # "facts_matter": row["facts_matter"],
            # "sham_trial": row["sham_trial"],
            # "maga": row["maga"],
            # "acquitted_forever": row["acquitted_forever"],
            # "country_over_party": row["country_over_party"],
        }
        self.batch.append(item)
        self.counter += 1

        # temporarily testing individual inserts...
        #record = RetweeterDetail(**item)
        #self.pg_session.add(record)
        #self.pg_session.commit()

        if len(self.batch) >= self.batch_size:
            print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
            self.pg_session.bulk_insert_mappings(RetweeterDetail, self.batch)
            self.pg_session.commit()
            self.batch = []

    # save the final (potentially incomplete) batch
    if self.batch:
        print(logstamp(), fmt_n(self.counter), "SAVING FINAL BATCH...")
        self.pg_session.bulk_insert_mappings(RetweeterDetail, self.batch)
        self.pg_session.commit()
        self.batch = []

    print("ETL COMPLETE!")
    self.end_at = time.perf_counter()
    self.pg_session.close()
def load_data(csv_filepath):
    if os.path.isfile(csv_filepath) and not DESTRUCTIVE:
        print("LOADING CSV...")
        df = read_csv(csv_filepath)
    else:
        print("DOWNLOADING CSV...")
        df = download_data()
        df.to_csv(csv_filepath, index=False)

    print(fmt_n(len(df)))
    print(df.head())
    return df
def classify_bot_probabilities(rt_graph, weight_attr="rt_count"):
    """
    Given a retweet graph, computes bot probabilities, in a single function!

    Params:
        rt_graph (networkx.DiGraph) representing a retweet graph, with weights stored in the weight_attr param

        weight_attr (str) the attribute in the edge data where the weights are.
            in the rt graph, this represents the number of times user a has retweeted user b
    """
    in_degrees = dict(rt_graph.in_degree(weight=weight_attr))  # users receiving retweets
    out_degrees = dict(rt_graph.out_degree(weight=weight_attr))  # users doing the retweeting
    print("IN-DEGREES:", fmt_n(len(in_degrees)))
    print("OUT-DEGREES:", fmt_n(len(out_degrees)))

    links = parse_bidirectional_links(rt_graph)
    energies = [(link[0], link[1], compute_link_energy(link[0], link[1], link[4], in_degrees, out_degrees)) for link in links]
    print("ENERGIES:", fmt_n(len(energies)))

    positive_energies = [e for e in energies if sum(e[2]) > 0]
    print("POSITIVE ENERGIES:", fmt_n(len(positive_energies)))

    prior_probabilities = dict.fromkeys(list(rt_graph.nodes), 0.5)

    # this is the step that takes the longest
    energy_graph, pl, user_data = compile_energy_graph(rt_graph, prior_probabilities, positive_energies, out_degrees, in_degrees)
    print("ENERGIES GRAPHED...")

    bot_probabilities = dict.fromkeys(list(user_data.keys()), 0)  # start with defaults of 0 for each user
    for user in pl:
        user_data[user]["clustering"] = 1
        bot_probabilities[user] = 1

    return bot_probabilities, user_data
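# Illustrative sketch (toy graph, made-up names): how the weighted in/out degrees used above fall
# out of a retweet graph. An edge (a, b) with rt_count=3 means user a retweeted user b three times,
# so b's weighted in-degree counts retweets received and a's weighted out-degree counts retweets given.
import networkx as nx

toy_rt_graph = nx.DiGraph()
toy_rt_graph.add_edge("alice", "bob", rt_count=3)  # alice retweeted bob 3 times
toy_rt_graph.add_edge("carol", "bob", rt_count=2)  # carol retweeted bob 2 times

print(dict(toy_rt_graph.in_degree(weight="rt_count")))   #> {'alice': 0, 'bob': 5, 'carol': 0}
print(dict(toy_rt_graph.out_degree(weight="rt_count")))  #> {'alice': 3, 'bob': 0, 'carol': 2}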
def __init__(self, limit=LIMIT, batch_size=BATCH_SIZE, bq_service=None, model_manager=None):
    self.limit = limit
    self.batch_size = batch_size
    self.bq_service = bq_service or BigQueryService()
    self.mgr = model_manager or ModelManager()

    print("----------------")
    print("TOXICITY SCORER...")
    print(" MODEL CHECKPOINT:", self.mgr.checkpoint_name.upper(), self.mgr.checkpoint_url)
    print(" SCORES TABLE NAME:", self.scores_table_name)
    print(" LIMIT:", fmt_n(self.limit))
    print(" BATCH SIZE:", fmt_n(self.batch_size))

    self.predict = self.mgr.predict_scores  # method alias

    seek_confirmation()
def perform(self):
    self.edges = []
    self.running_results = []
    self.start()

    self.cursor.execute(self.sql)
    while True:
        batch = self.cursor.fetchmany(size=self.batch_size)
        if not batch:
            break

        self.counter += len(batch)

        if not self.dry_run:
            for row in batch:
                self.edges += [(row["screen_name"], friend) for friend in row["friend_names"]]

        rr = {"ts": logstamp(), "counter": self.counter, "edges": len(self.edges)}
        print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["edges"]))
        self.running_results.append(rr)

    self.write_results_to_file()
    self.upload_results()

    self.write_edges_to_file()
    self.upload_edges()

    print(logstamp(), "CONSTRUCTING GRAPH OBJECT...")
    self.graph = DiGraph(self.edges)
    print(logstamp(), "GRAPH CONSTRUCTED!")
    del self.edges  # try to free up some memory before writing the graph to file

    self.report()
    self.write_graph_to_file()
    self.upload_graph()
    self.end()
def perform(self):
    self.graph = DiGraph()

    print("FETCHING BOT FOLLOWERS...")
    for row in self.bq_service.fetch_bot_follower_lists(bot_min=self.bot_min):
        bot_id = row["bot_id"]
        self.graph.add_edges_from([(follower_id, bot_id) for follower_id in row["follower_ids"]])

        self.counter += 1
        if self.counter % self.batch_size == 0:
            print(" ", logstamp(), "| BOTS:", fmt_n(self.counter))
def load_retweets(self):
    """Loads or downloads bot community tweets to/from CSV."""
    if os.path.isfile(self.retweets_filepath):
        print("READING BOT COMMUNITY RETWEETS FROM CSV...")
        # DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False
        self.retweets_df = read_csv(self.retweets_filepath)
        #if ROWS_LIMIT:
        #    self.retweets_df = read_csv(local_csv_filepath, nrows=int(ROWS_LIMIT))
        #else:
        #    self.retweets_df = read_csv(local_csv_filepath)
    else:
        print("DOWNLOADING BOT COMMUNITY RETWEETS...")
        counter = 0
        records = []
        for row in self.bq_service.download_n_bot_community_retweets_in_batches(self.n_clusters):
            records.append({
                "community_id": row.community_id,
                "user_id": row.user_id,
                "user_screen_name_count": row.user_screen_name_count,
                "user_screen_names": row.user_screen_names,
                "user_created_at": dt_to_s(row.user_created_at),
                "retweeted_user_id": row.retweeted_user_id,
                "retweeted_user_screen_name": row.retweeted_user_screen_name,
                "status_id": row.status_id,
                "status_text": row.status_text,
                "status_created_at": dt_to_s(row.status_created_at)
            })
            counter += 1
            if counter % BATCH_SIZE == 0:
                print(logstamp(), fmt_n(counter))

        self.retweets_df = DataFrame(records)
        self.retweets_df.index.name = "row_id"
        self.retweets_df.index += 1

        print("WRITING TO FILE...")
        self.retweets_df.to_csv(self.retweets_filepath)
def vectorize(self):
    print("--------------------------")
    print("VECTORIZING...")

    self.tv = TfidfVectorizer()
    self.tv.fit(self.x_train)
    print("FEATURES / TOKENS:", fmt_n(len(self.tv.get_feature_names())))

    self.matrix_train = self.tv.transform(self.x_train)
    print("FEATURE MATRIX (TRAIN):", type(self.matrix_train), self.matrix_train.shape)

    self.matrix_test = self.tv.transform(self.x_test)
    print("FEATURE MATRIX (TEST):", type(self.matrix_test), self.matrix_test.shape)
def perform(self):
    self.write_metadata_to_file()
    self.upload_metadata()

    self.start()
    self.graph = DiGraph()
    self.running_results = []

    for row in self.bq_service.fetch_retweet_counts_in_batches(topic=self.topic, start_at=self.convo_start_at, end_at=self.convo_end_at):
        # see: https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.DiGraph.add_edge.html#networkx.DiGraph.add_edge
        self.graph.add_edge(row["user_screen_name"], row["retweet_user_screen_name"], rt_count=row["retweet_count"])

        self.counter += 1
        if self.counter % self.batch_size == 0:
            rr = {"ts": logstamp(), "counter": self.counter, "nodes": len(self.graph.nodes), "edges": len(self.graph.edges)}
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
            self.running_results.append(rr)

    self.end()
    self.report()

    self.write_results_to_file()
    self.upload_results()

    self.write_graph_to_file()
    self.upload_graph()
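# Illustrative sketch (made-up names and counts): DiGraph.add_edge, as used above, stores the
# retweet count as edge data, and adding the same (retweeter, retweeted) pair again overwrites
# that attribute rather than creating a duplicate edge.
import networkx as nx

g = nx.DiGraph()
g.add_edge("alice", "bob", rt_count=2)
g.add_edge("alice", "bob", rt_count=7)  # same pair again: attribute is replaced, not summed

print(g.number_of_edges())  #> 1
print(g["alice"]["bob"])    #> {'rt_count': 7}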
def perform(self):
    self.save_metadata()
    self.start()

    self.results = []
    self.graph = DiGraph()

    for row in self.bq_service.fetch_retweet_counts_in_batches(start_at=dt_to_s(self.tweets_start_at), end_at=dt_to_s(self.tweets_end_at)):
        self.graph.add_edge(
            row["user_screen_name"],  # todo: user_id
            row["retweet_user_screen_name"],  # todo: retweet_user_id
            weight=row["retweet_count"]
        )

        self.counter += 1
        if self.counter % self.batch_size == 0:
            rr = {
                "ts": logstamp(),
                "counter": self.counter,
                "nodes": self.graph.number_of_nodes(),
                "edges": self.graph.number_of_edges()
            }
            print(rr["ts"], "|", fmt_n(rr["counter"]), "|", fmt_n(rr["nodes"]), "|", fmt_n(rr["edges"]))
            self.results.append(rr)

            # gets us an approximate users limit, but the check is reached only a fraction of the time
            # (perhaps more performant when there are millions of rows)
            if self.users_limit and self.counter >= self.users_limit:
                break

    self.end()
    self.report()
    self.save_results()
    self.save_graph()
def get_tweets():
    bq_service = BigQueryService()
    print("LIMIT:", LIMIT)

    job = Job()
    tweets = []

    job.start()
    for row in bq_service.fetch_labeled_tweets_in_batches(limit=LIMIT):
        tweets.append(dict(row))
        job.counter += 1
        if job.counter % BATCH_SIZE == 0:
            job.progress_report()
    job.end()

    print("FETCHED TWEETS:", fmt_n(len(tweets)))
    return DataFrame(tweets)
def perform_better(self):
    print("----------------")
    print("FETCHING TEXTS...")
    print("SCORING TEXTS IN BATCHES...")

    batch = []
    counter = 0
    for row in self.fetch_texts():
        batch.append(row)

        if len(batch) >= self.batch_size:
            counter += len(batch)
            print(" ", generate_timestamp(), "|", fmt_n(counter))
            self.process_batch(batch)
            batch = []

    # process final (potentially incomplete) batch
    if batch:
        counter += len(batch)
        print(" ", generate_timestamp(), "|", fmt_n(counter))
        self.process_batch(batch)
        batch = []
def download_user_details(self):
    self.start_job()
    self.destructively_migrate(UserDetail)

    print(logstamp(), "DATA FLOWING...")
    for row in self.bq_service.fetch_user_details_in_batches(limit=self.users_limit):
        item = {
            "user_id": row["user_id"],
            "screen_name": clean_string(row["screen_name"]),
            "name": clean_string(row["name"]),
            "description": clean_string(row["description"]),
            "location": clean_string(row["location"]),
            "verified": row["verified"],
            "created_at": row["created_at"],  #.strftime("%Y-%m-%d %H:%M:%S"),
            "screen_name_count": row["screen_name_count"],
            "name_count": row["name_count"],
            "description_count": row["description_count"],
            "location_count": row["location_count"],
            "verified_count": row["verified_count"],
            "created_count": row["created_at_count"],
            "screen_names": [clean_string(s) for s in row["screen_names"]],
            "names": [clean_string(s) for s in row["names"]],
            "descriptions": [clean_string(s) for s in row["descriptions"]],
            "locations": [clean_string(s) for s in row["locations"]],
            "verifieds": row["verifieds"],
            "created_ats": row["created_ats"],  #[dt.strftime("%Y-%m-%d %H:%M:%S") for dt in row['_created_ats']]
            "friend_count": row["friend_count"],
            "status_count": row["status_count"],
            "retweet_count": row["retweet_count"],
        }
        self.batch.append(item)
        self.counter += 1

        # temporarily testing individual inserts...
        #record = UserDetail(**item)
        #self.pg_session.add(record)
        #self.pg_session.commit()

        if len(self.batch) >= self.batch_size:
            print(logstamp(), fmt_n(self.counter), "SAVING BATCH...")
            self.pg_session.bulk_insert_mappings(UserDetail, self.batch)
            self.pg_session.commit()
            self.batch = []

    # save the final (potentially incomplete) batch
    if self.batch:
        print(logstamp(), fmt_n(self.counter), "SAVING FINAL BATCH...")
        self.pg_session.bulk_insert_mappings(UserDetail, self.batch)
        self.pg_session.commit()
        self.batch = []

    self.end_job()