def __init__(self):
    """Initialize the tweet getter: authenticate with Twitter and prepare empty request state."""
    # Authenticate up front; everything else starts empty and is filled in later.
    self.twitter_api = self.connect_twitter_api()
    self.api = None
    self.api_params = {}
    self.api_limit = API_LIMIT
    self.outpath = ""
    self.aws_writer = AWSWriter()
class TweetGetter:
    """Responsible for connecting to the Twitter Python API and writing data to json storage in S3.

    Base helper: ``self.api`` / ``self.api_params`` / ``self.outpath`` start
    empty here; presumably subclasses fill them in before calling
    ``_get_data`` / ``_write_*`` -- confirm against the subclasses.
    """

    def __init__(self):
        """Constructor for the TweetGetter class"""
        self.twitter_api = self.connect_twitter_api()
        # request state configured later (see class docstring)
        self.api = None
        self.api_params = dict()
        self.api_limit = API_LIMIT
        self.outpath = str()
        self.aws_writer = AWSWriter()

    @staticmethod
    def connect_twitter_api():
        """Spin up a Twitter API object authenticated with the module-level OAuth credentials."""
        auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
        return tweepy.API(auth)

    def _get_data(self):
        """Instantiate a method for getting data from tweepy depending on the task for subclasses.

        Pages through ``self.api`` with a tweepy Cursor, collecting up to
        ``self.api_limit`` statuses, and returns them serialized as one JSON
        string. Returns None if the API raises at any point.
        """
        results = list()
        # NOTE(review): wait_on_rate_limit / wait_on_rate_limit_notify are
        # forwarded through Cursor into the API call -- matches tweepy 3.x;
        # confirm against the pinned tweepy version.
        cursor = tweepy.Cursor(self.api,
                               **self.api_params,
                               tweet_mode="extended",
                               wait_on_rate_limit=True,
                               wait_on_rate_limit_notify=True,
                               count=200).items(self.api_limit)
        # Rewrite cursor pull to skip users that are private (401)
        while True:
            try:
                status = cursor.next()
                # ._json is tweepy's undocumented raw-payload attribute
                results.append(status._json)
            except tweepy.TweepError:
                # any API error (e.g. a protected account) aborts the whole
                # pull and discards partial results -- see TODO comment above
                return None
            except StopIteration:
                break
        return json.dumps(results, ensure_ascii=False)

    def _write_data(self, data):
        """Writes Twitter data to json files according to the specified outpath.

        Filename is ``outpath`` + today's date (YYYY_MM_DD) + ``.json``.
        No-op when ``data`` is falsy (e.g. _get_data returned None).
        """
        if data:
            filename = self.outpath + str(date.today()).replace("-",
                                                                "_") + ".json"
            self.aws_writer.write_json(data, filename)

    def _write_user_timeline(self, data):
        """Writes user timeline data to ``outpath`` + ``.json``; no-op when ``data`` is falsy."""
        if data:
            filename = self.outpath + ".json"
            self.aws_writer.write_json(data, filename)
def _write_model_file(self):
    """Serialize the community graph to GML text and persist it via AWSWriter."""
    # Render the graph line by line, then stitch into one GML document.
    gml_lines = nx.generate_gml(
        self.graph, stringizer=nx.readwrite.gml.literal_stringizer)
    gml_document = "\n".join(gml_lines)
    target_path = "models/network/" + self.topic + "_communities.gml"
    AWSWriter().write_model(gml_document, target_path)
def prepare_model_for_db_merge(self):
    """Exposed function for transforming internal model to nested list for neo4j merge"""
    # TODO: Update to include a persona param and write to a new folder
    merge_rows = []
    leaders = []
    for node_id, attrs in self.graph.nodes(data=True):
        # Only nodes with a truthy id carrying BOTH attributes participate.
        if "community" not in attrs or "leader" not in attrs or not node_id:
            continue
        if attrs["leader"]:
            leaders.append(node_id)
        # One row per qualifying node: [id, community label, leader flag].
        merge_rows.append([node_id, attrs["community"], attrs["leader"]])
    AWSWriter().write_model(str(leaders),
                            "models/network/" + self.topic + "_leaders.txt")
    return merge_rows
def prepare_model_for_db_merge2(self, persona):
    """Exposed function for transforming internal model to nested list for neo4j merge"""
    merge_rows = []
    flagged_nodes = []
    for node_id, attrs in self.graph.nodes(data=True):
        # Skip nodes missing either attribute, and falsy ids.
        if "community" not in attrs or persona not in attrs or not node_id:
            continue
        if attrs[persona]:
            flagged_nodes.append(node_id)
        # Third column carries the persona name only when the flag is truthy.
        merge_rows.append([
            node_id, attrs["community"],
            persona if attrs[persona] else "None"
        ])
    AWSWriter().write_model(
        str(flagged_nodes),
        "models/network/" + self.topic + "_" + persona + ".txt")
    return merge_rows
def __init__(self, dev_bolt_uri=DEV_BOLT_URI, prod_bolt_uri=PROD_BOLT_URI):
    """Open one Neo4j driver per environment plus the S3 writer used for json blobs."""
    # Same factory builds both drivers, dev first then prod.
    self.dev_driver, self.prod_driver = (self.connect_database(uri)
                                         for uri in (dev_bolt_uri,
                                                     prod_bolt_uri))
    self.batch_size = BATCH_SIZE
    self.aws_writer = AWSWriter()
class DatabaseManager:
    """Responsible for merging twitter json data into the development Neo4j database and for exporting data from dev DB to models"""

    def __init__(self, dev_bolt_uri=DEV_BOLT_URI, prod_bolt_uri=PROD_BOLT_URI):
        """Constructor for DatabaseManager class.

        Opens one driver per environment (dev and prod) and an AWS writer
        for the S3-backed json storage.
        """
        self.dev_driver = self.connect_database(dev_bolt_uri)
        self.prod_driver = self.connect_database(prod_bolt_uri)
        # chunk size for the UNWIND merges in merge_model_results
        self.batch_size = BATCH_SIZE
        self.aws_writer = AWSWriter()

    @staticmethod
    def run_script(driver,
                   script_path,
                   args=None,
                   verbose=False,
                   get_values=False):
        """Helper function to run cypher queries.

        Reads the query text from ``script_path``, executes it with ``args``
        as parameters in a fresh session on ``driver``. ``verbose`` prints
        the write counters; ``get_values`` returns the raw result object.

        NOTE(review): the result is returned while the session context is
        exiting, yet callers consume it afterwards via .values()/.data();
        this relies on the driver buffering results before the session
        closes -- confirm against the installed neo4j driver version.
        """
        with open(os.path.join(script_path)) as f:
            script = f.read()
        with driver.session() as session:
            res = session.run(script, args)
            if verbose:
                summary = res.summary().counters
                print(script_path)
                print(f"Nodes created: {summary.nodes_created}")
                print(
                    f"Relationships created: {summary.relationships_created}")
                print(f"Properties set: {summary.properties_set}")
            if get_values:
                return res

    @staticmethod
    def connect_database(bolt_uri):
        """Spin up a neo4j driver for the given bolt URI using the module-level credentials."""
        dev_driver = GraphDatabase.driver(bolt_uri,
                                          auth=(NEO_USERNAME, NEO_PASSWORD))
        return dev_driver

    @staticmethod
    def format_date(dateobject):
        """Formats a dateobject into MM_DD_YYYY str (ISO date with '-' swapped for '_')."""
        return str(dateobject).replace("-", "_")

    def destroy_production_database(self):
        """Remove all nodes and relationships in production database to prepare for rebuild.

        Runs the destroy script repeatedly until a pass reports zero
        deletions (the script presumably deletes in batches -- confirm).
        """
        deleted_nodes = -1  # sentinel so the loop runs at least once
        while deleted_nodes != 0:
            results = self.run_script(self.prod_driver,
                                      os.path.join(ETL_QUERIES_PATH,
                                                   "destroy_database.cypher"),
                                      verbose=True,
                                      get_values=True)
            # first column of the first record = number of nodes deleted
            deleted_nodes = [x[0] for x in results][0]

    def load_data_into_model(self, topic, persona_query, full_graph=False):
        """Exposed function to execute a match query returning edge relationships.

        With ``full_graph`` the fixed full-network export is used; otherwise
        ``persona_query`` names the cypher file to run. Returns the records
        as a list of tuples (edge pairs).
        """
        args = {"search_term": topic}
        if full_graph:
            results = self.run_script(self.dev_driver,
                                      os.path.join(
                                          ETL_QUERIES_PATH,
                                          "export_full_network.cypher"),
                                      args=args,
                                      verbose=True,
                                      get_values=True)
        else:
            results = self.run_script(self.dev_driver,
                                      os.path.join(ETL_QUERIES_PATH,
                                                   persona_query),
                                      args=args,
                                      verbose=True,
                                      get_values=True)
        return list(map(tuple, results))

    def merge_model_results(self, model_results, output_query):
        """Merge model results back into Neo4j development database.

        Splits ``model_results`` into ``self.batch_size`` chunks and runs
        one UNWIND query per chunk.

        NOTE(review): the query is assembled by string concatenation of a
        Python list repr -- injection-prone if values ever contain quotes;
        also there is no space in "UNWIND"+...+"AS" (Cypher's brackets act
        as delimiters, but confirm it parses on the deployed server).
        """
        # ceiling division: number of chunks needed
        chunks = (len(model_results) - 1) // self.batch_size + 1
        for i in range(chunks):
            batch = model_results[i * self.batch_size:(i + 1) *
                                  self.batch_size]
            query = "UNWIND" + str(batch) + "AS line " + QUERIES[output_query]
            with self.dev_driver.session() as session:
                session.run(query)

    def _write_user_properties(self, user_properties):
        """Write user profile properties to Neo4j as (:User) SET properties"""
        args = {"properties": user_properties}
        self.run_script(self.prod_driver,
                        os.path.join(ETL_QUERIES_PATH,
                                     "import_individual_properties.cypher"),
                        args=args,
                        verbose=True)

    @staticmethod
    def _extract_tweet_properties(tweet, topic):
        """Extract tweet properties from a tweet json and return as a dict.

        Copies a fixed set of fields, flattens the author to its id under
        "user", and tags the record with ``topic``.
        """
        tweet_properties = dict()
        property_names = [
            "id", "retweet_count", "favorite_count", "full_text", "created_at"
        ]
        for name in property_names:
            tweet_properties[name] = tweet[name]
        tweet_properties["user"] = tweet["user"]["id"]
        tweet_properties["topic"] = topic
        return tweet_properties

    def _write_popular_tweets(self, popular_tweets, topic):
        """Write popular tweets to Neo4j as (:User)-[:Tweeted]-(:Tweet)"""
        for tweet in popular_tweets:
            tweet_properties = self._extract_tweet_properties(tweet, topic)
            args = {"properties": tweet_properties}
            self.run_script(self.prod_driver,
                            os.path.join(ETL_QUERIES_PATH,
                                         "import_popular_tweets.cypher"),
                            args=args,
                            verbose=True)

    def _write_common_hashtags(self, user, common_hashtags, topic):
        """Write common hashtags as (:User)-[:COMMON_HASHTAG]-(:Hashtag).

        ``common_hashtags`` is iterated as (hashtag, count) pairs; one query
        per pair.
        """
        for hashtag, count in common_hashtags:
            args = {
                "properties": {
                    "name": hashtag,
                    "count": count,
                    "user": user,
                    "topic": topic
                }
            }
            self.run_script(self.prod_driver,
                            os.path.join(ETL_QUERIES_PATH,
                                         "import_common_hashtags.cypher"),
                            args=args,
                            verbose=True)

    def merge_user_profile(self, user_properties, popular_tweets,
                           common_hashtags, topic):
        """Merge user profile properties as User and Tweet nodes into prod database"""
        self._write_user_properties(user_properties)
        self._write_popular_tweets(popular_tweets, topic)
        self._write_common_hashtags(user_properties["user"], common_hashtags,
                                    topic)

    def _load_tweets_json(self, inpath, search_term, recent=True):
        """Load json data into Neo4j development database.

        ``recent`` loads only today's and yesterday's files from S3;
        otherwise every stored file for the topic is imported, with the
        date parsed out of each filename.
        """
        if recent:
            # default is to load in the last 24 hours worth of json data
            today = self.format_date(date.today())
            yesterday = self.format_date(date.today() - timedelta(1))
            today_json = self.aws_writer.read_json(inpath + today + ".json")
            yesterday_json = self.aws_writer.read_json(inpath + yesterday +
                                                       ".json")
            if today_json:
                data = json.loads(today_json)
                self.run_script(self.dev_driver,
                                os.path.join(ETL_QUERIES_PATH,
                                             "import_search_api_json.cypher"),
                                args={
                                    "json": data,
                                    "json_file": today,
                                    "search_term": search_term
                                },
                                verbose=True)
            if yesterday_json:
                data = json.loads(yesterday_json)
                self.run_script(self.dev_driver,
                                os.path.join(ETL_QUERIES_PATH,
                                             "import_search_api_json.cypher"),
                                args={
                                    "json": data,
                                    "json_file": yesterday,
                                    "search_term": search_term
                                },
                                verbose=True)
        else:
            # pull the entire tweet datastorage for a topic and load them all in
            tweet_filenames = self.aws_writer.get_all_filenames(
                TWEET_FILEPATH + search_term)
            for filename in tweet_filenames:
                data = json.loads(self.aws_writer.read_json(filename))
                # filenames embed their date as YYYY_MM_DD; .group() raises
                # AttributeError if a filename ever lacks one -- intentional?
                filedate = re.search(r'\d{4}_\d{2}_\d{2}', filename).group()
                self.run_script(self.dev_driver,
                                os.path.join(ETL_QUERIES_PATH,
                                             "import_search_api_json.cypher"),
                                args={
                                    "json": data,
                                    "json_file": filedate,
                                    "search_term": search_term
                                },
                                verbose=True)

    def load_new_tweets(self, search_term, recent=True):
        """Exposed function to load in new tweets by search term into the development database"""
        inpath = "data/tweets/" + search_term + "_tweets_"
        self._load_tweets_json(inpath, search_term, recent)

    def load_nosentiment_tweets(self):
        """Exposed function to load in all tweets that do not have sentiment labels"""
        results = self.run_script(self.dev_driver,
                                  os.path.join(
                                      ETL_QUERIES_PATH,
                                      "export_nosentiment_tweets.cypher"),
                                  get_values=True)
        return [record for record in results.values()]

    def load_production_tweets(self):
        """Exposed function to load in all tweets from the production database."""
        results = self.run_script(self.prod_driver,
                                  os.path.join(
                                      ETL_QUERIES_PATH,
                                      "export_production_tweets.cypher"),
                                  get_values=True)
        return [record for record in results.values()]

    def migrate_dev_to_prod(self, export_users):
        """Migrates the dev database to production.

        Exports the given users (with their topic names) from dev, attaches
        each topic name to the user's property dict, then imports the
        combined records into prod.
        """
        # pull user nodes from dev database
        export_args = {"users_to_migrate": export_users}
        results = self.run_script(
            self.dev_driver,
            os.path.join(ETL_QUERIES_PATH,
                         "export_migration_users_and_topics.cypher"),
            args=export_args,
            get_values=True)
        # (node-properties dict, topic name) pairs; ._properties is a
        # private neo4j Node attribute
        import_users = [({k: v
                          for k, v in x['u']._properties.items()}, x['t.name'])
                        for x in results.data()]
        # side-effecting comprehension: folds the topic name into each dict
        [x[0].update({"topic_name": x[1]}) for x in import_users]
        import_users = [x[0] for x in import_users]
        # write user nodes to prod database
        import_args = {"users_to_migrate": import_users}
        self.run_script(self.prod_driver,
                        os.path.join(
                            ETL_QUERIES_PATH,
                            "import_migration_users_and_topics.cypher"),
                        args=import_args,
                        verbose=True)

    def ensure_topic_node(self, topic):
        """Merges the topic node into the production database"""
        args = {"topic": topic}
        self.run_script(self.prod_driver,
                        os.path.join(ETL_QUERIES_PATH,
                                     "import_topic_node.cypher"),
                        args=args,
                        verbose=True)
def __init__(self):
    """Wire up the database manager, the S3 writer, and the per-run timeline cap."""
    self.dbm = DatabaseManager()   # dev + prod Neo4j access
    self.aws_writer = AWSWriter()  # S3 json storage
    self.user_limit = 200          # max user timelines pulled per run
class Capstone:
    """Capstone is responsible for each step in the end-to-end construction and maintainence of our database"""

    def __init__(self):
        """Constructor for Capstone object"""
        # connect to AWS and database
        self.dbm = DatabaseManager()
        self.aws_writer = AWSWriter()
        # cap on how many fresh user timelines a single run may pull
        self.user_limit = 200

    def destroy_database(self):
        """Tear down whatever is currently in production database"""
        self.dbm.destroy_production_database()

    def load_new_tweets(self):
        """Load new tweets from AWS into the development database.

        Loads the full stored history (recent=False) for every configured
        topic and ensures each topic node exists in prod.
        """
        for topic in SEARCH_TOPICS:
            print(f"Loading tweets for {topic} into dev database")
            self.dbm.load_new_tweets(topic, recent=False)
            self.dbm.ensure_topic_node(topic)

    def run_info_map(self):
        """Apply info map on the development dataset to identify communities"""
        print("Applying InfoMap to entire graph")
        community_model = CommunityFinder(topic="full_graph")
        center_model = CenterFinder()
        # full_graph=True makes topic/persona_query irrelevant, hence the Nones
        topic_data = self.dbm.load_data_into_model(topic=None,
                                                   persona_query=None,
                                                   full_graph=True)
        community_model.fit(topic_data)
        center_model.fit(community_model.graph,
                         community_model.community_labels)
        community_results = community_model.prepare_model_for_db_merge3()
        self.dbm.merge_model_results(community_results, "output_network")

    def migrate_dev_to_prod(self):
        """Migrate community center users to the production database.

        Pulls their timelines first, migrates the user nodes, then assigns
        persona labels.
        """
        users_to_migrate = self.dbm.run_script(
            self.dbm.prod_driver,
            "./etl_queries/export_prod_users_without_tweets.cypher",
            get_values=True)
        users_to_migrate = [x["u.id"] for x in users_to_migrate]
        self.pull_user_timelines(users_to_migrate)
        self.dbm.migrate_dev_to_prod([int(user) for user in users_to_migrate])
        self.evaluate_user_personas(users_to_migrate)

    def pull_user_timelines(self, users_to_migrate):
        """Pull user timelines for user profile generation.

        Skips users whose timeline json already exists in S3 and stops after
        ``self.user_limit`` fresh pulls.

        Returns:
            list: the users actually pulled this run (previously a dead
            local; returning it is backward-compatible).
        """
        users_pulled = list()
        user_limit = self.user_limit
        for user in users_to_migrate:
            if user_limit <= 0:
                break
            # already cached in S3 -> nothing to do for this user
            if self.aws_writer.read_json(f"{user}.json"):
                continue
            UserTimelineGetter(user).get_user_timeline()
            user_limit -= 1
            users_pulled.append(user)
        return users_pulled

    def get_user_behavior(self, user):
        """Pull a user's first degree connections and compute the count of relationship types.

        Returns:
            dict: topic -> {relationship pattern -> count} for each topic
            the user participates in (topics with no connections omitted).
        """
        export_args = {"users_to_migrate": [user]}
        results = self.dbm.run_script(
            self.dbm.dev_driver,
            "./etl_queries/export_migration_users_and_topics.cypher",
            args=export_args,
            get_values=True)
        user_topics = list(set([x['t.name'] for x in results.data()]))
        user_topic_stats = dict()
        for topic in user_topics:
            results = self.dbm.run_script(
                self.dbm.dev_driver,
                "./etl_queries/export_first_degree_connections.cypher",
                args={
                    "user_id": user,
                    "search_term": topic
                },
                get_values=True,
                verbose=False)
            results = [record for record in results.values()]
            if results:
                relationships = [
                    str(line[0]) + "->" + str(line[1]) for line in results
                ]
                rel_counts = {
                    "TWEETED->RETWEETED": 0,
                    "TWEETED->None": 0,
                    "RETWEETED->TWEETED": 0,
                    "TWEETED->MENTIONS": 0,
                }
                for rel in relationships:
                    # fold both retweet-of-retweet patterns into one bucket
                    if rel == "RETWEETED->RETWEETED" or rel == "RETWEETED->MENTIONS":
                        rel = "RETWEETED->TWEETED"
                    # any pattern outside the four tracked buckets is ignored
                    if rel not in rel_counts:
                        continue
                    rel_counts[rel] += 1
                user_topic_stats[topic] = rel_counts
        return user_topic_stats

    def evaluate_user_personas(self, users_to_migrate):
        """Generate persona roles for user nodes based on their twitter behavior.

        Per topic: tags the top 25% by mention share as watchdogs, then the
        top 25% by retweet share as amplifiers, then the top 25% by original
        tweet share as content creators, removing each tier before ranking
        the next. Tier size is 25% of the ORIGINAL pool each time.
        """
        user_rel_stats = dict()
        for user in users_to_migrate:
            user_rel_stats[user] = self.get_user_behavior(user)
        user_personas = list()
        for topic in SEARCH_TOPICS:
            # first, normalize all stats as percentage of relationships
            topic_subset = {
                user_id: topic_counts[topic]
                for user_id, topic_counts in user_rel_stats.items()
                if topic in topic_counts
            }
            for user_id, rel_counts in topic_subset.items():
                sum_rel = sum(rel_counts.values())
                if sum_rel == 0:
                    # all-zero rows stay unnormalized; they rank last anyway
                    continue
                topic_subset[user_id] = {
                    rel: count / sum_rel
                    for rel, count in rel_counts.items()
                }
            num_users = len(topic_subset)
            tier_size = int(num_users * 0.25)
            # first, get the watchdogs as the top 25% of mentions and remove them from consideration
            topic_mentions = {
                user_id: topic_subset[user_id]["TWEETED->MENTIONS"]
                for user_id in topic_subset
            }
            watchdogs = sorted(topic_mentions.items(),
                               key=operator.itemgetter(1),
                               reverse=True)[:tier_size]
            for user_id, score in watchdogs:
                user_personas.append({
                    "user_id": user_id,
                    "persona": "watchdog",
                    "search_term": topic
                })
            # remove watchdogs from remaining group
            # BUG FIX: membership must be tested against user ids, not the
            # (user_id, score) tuples -- the old check never removed anyone.
            watchdog_ids = {user_id for user_id, _ in watchdogs}
            topic_subset = {
                k: v
                for k, v in topic_subset.items() if k not in watchdog_ids
            }
            # second, get the amplifiers as the top 25% of retweets and remove them from consideration
            topic_retweets = {
                user_id: topic_subset[user_id]["RETWEETED->TWEETED"]
                for user_id in topic_subset
            }
            amplifiers = sorted(topic_retweets.items(),
                                key=operator.itemgetter(1),
                                reverse=True)[:tier_size]
            for user_id, score in amplifiers:
                user_personas.append({
                    "user_id": user_id,
                    "persona": "amplifier",
                    "search_term": topic
                })
            # remove amplifiers from remaining group (same id-set fix)
            amplifier_ids = {user_id for user_id, _ in amplifiers}
            topic_subset = {
                k: v
                for k, v in topic_subset.items() if k not in amplifier_ids
            }
            # third, get the content creators as the top 25% of tweets and remove them from consideration
            topic_content = {
                user_id: topic_subset[user_id]["TWEETED->None"]
                for user_id in topic_subset
            }
            content_creators = sorted(topic_content.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)[:tier_size]
            for user_id, score in content_creators:
                user_personas.append({
                    "user_id": user_id,
                    "persona": "content_creator",
                    "search_term": topic
                })
        self.dbm.run_script(self.dbm.prod_driver,
                            "./etl_queries/import_user_personas.cypher",
                            args={"user_persona_list": user_personas},
                            verbose=True)

    def build_user_profiles(self):
        """Build user profiles using user tweet histories.

        Builds profiles for every (topic, persona) pair, then recomputes
        the topic's relative statistics once per topic.
        """
        for topic in SEARCH_TOPICS:
            print(
                f"Generating user profiles for users who tweet about {topic}")
            for persona in PERSONAS:
                pb = ProfileBuilder(topic, persona, ignore_sentiment=True)
                # catch any False tag that is thrown
                pb.build_user_profiles2()
                pb.build_user_profiles3()
            # relative stats are keyed by topic only, so run once per topic
            self.dbm.run_script(
                self.dbm.prod_driver,
                "./etl_queries/compute_relative_statistics.cypher",
                args={"search_term": topic},
                verbose=True)

    def fill_truncated_retweets(self):
        """Fetch full tweet text for retweets which come in truncated.

        The Twitter lookup endpoint accepts at most 100 ids per call, so
        larger sets are chunked.
        """

        def divide_chunks(seq, size):
            # yield successive size-sized slices of seq
            for i in range(0, len(seq), size):
                yield seq[i:i + size]

        results = self.dbm.run_script(
            self.dbm.prod_driver,
            "./etl_queries/get_retweet_truncated_text.cypher",
            get_values=True)
        tweet_ids = [record[0] for record in results]
        full_text_map = dict()
        if len(tweet_ids) > 100:
            for batch in divide_chunks(tweet_ids, 100):
                rftg = RetweetFullTextGetter(batch)
                full_text_map.update(rftg.get_tweet_full_text())
        else:
            rftg = RetweetFullTextGetter(tweet_ids)
            full_text_map = rftg.get_tweet_full_text()
        tweets_with_text = [{
            "id": k,
            "text": v
        } for k, v in full_text_map.items()]
        args = {"tweets_with_text": tweets_with_text}
        self.dbm.run_script(self.dbm.prod_driver,
                            "./etl_queries/import_retweet_full_text.cypher",
                            args=args,
                            verbose=True)

    @staticmethod
    def impute_user_locations():
        """Predict a user's location based on their immediate network of followers"""
        location_model = LocationPropagator(topic="homeless")
        location_model.impute_location()

    def clean_up(self):
        """Delete the tweets about relationship where there is no tweets that match"""
        self.dbm.run_script(self.dbm.prod_driver,
                            "./etl_queries/clean_up.cypher",
                            verbose=True)

    def end_to_end(self):
        """Run the full pipeline from teardown through cleanup.

        BUG FIX: the previous version also called pull_user_timelines() and
        get_user_behavior() with no arguments, which raised TypeError at
        runtime; both are already invoked (with arguments) from inside
        migrate_dev_to_prod / evaluate_user_personas.
        """
        self.destroy_database()
        self.load_new_tweets()
        self.run_info_map()
        self.migrate_dev_to_prod()
        self.build_user_profiles()
        self.fill_truncated_retweets()
        self.impute_user_locations()
        self.clean_up()