Example #1
def __init__(self):
    """Constructor for the TweetGetter class"""
    self.twitter_api = self.connect_twitter_api()
    self.api = None
    self.api_params = dict()
    self.api_limit = API_LIMIT
    self.outpath = str()
    self.aws_writer = AWSWriter()
Example #2
class TweetGetter:
    """Responsible for connecting to the Twitter Python API and writing data to json storage in S3"""
    def __init__(self):
        """Constructor for the TweetGetter class"""
        self.twitter_api = self.connect_twitter_api()
        self.api = None
        self.api_params = dict()
        self.api_limit = API_LIMIT
        self.outpath = str()
        self.aws_writer = AWSWriter()

    @staticmethod
    def connect_twitter_api():
        """Spin up a Twitter API object"""
        auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_KEY, ACCESS_SECRET)
        # rate-limit handling belongs on the API object (tweepy 3.x),
        # not on the Cursor call below
        return tweepy.API(auth,
                          wait_on_rate_limit=True,
                          wait_on_rate_limit_notify=True)

    def _get_data(self):
        """Pull data from tweepy using the endpoint and params configured by subclasses"""
        results = list()
        cursor = tweepy.Cursor(self.api,
                               **self.api_params,
                               tweet_mode="extended",
                               count=200).items(self.api_limit)

        # pull from the cursor manually so a private user (401) ends the
        # pull cleanly (returns None) instead of raising out of the loop
        while True:
            try:
                status = cursor.next()
                results.append(status._json)
            except tweepy.TweepError:
                return None
            except StopIteration:
                break

        return json.dumps(results, ensure_ascii=False)

    def _write_data(self, data):
        """Writes Twitter data to json files according to the specified outpath"""
        if data:
            filename = self.outpath + str(date.today()).replace("-",
                                                                "_") + ".json"
            self.aws_writer.write_json(data, filename)

    def _write_user_timeline(self, data):
        """Writes user timeline data"""
        if data:
            filename = self.outpath + ".json"
            self.aws_writer.write_json(data, filename)
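
A minimal usage sketch, not from the source: a hypothetical subclass wires a bound tweepy method into self.api and its keyword arguments into self.api_params, then pulls and writes in one call. SearchTweetGetter and its settings are illustrative assumptions.

class SearchTweetGetter(TweetGetter):
    """Hypothetical subclass: cursor over the search endpoint for one term"""
    def __init__(self, search_term):
        super().__init__()
        self.api = self.twitter_api.search        # bound method handed to tweepy.Cursor
        self.api_params = {"q": search_term}      # kwargs forwarded to the bound method
        self.outpath = "data/tweets/" + search_term + "_tweets_"

    def run(self):
        """Pull up to api_limit tweets and write them to S3 as dated json"""
        self._write_data(self._get_data())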
Example #3
def _write_model_file(self):
    """Write the model as a list of edges and community labels"""
    gml_file = "\n".join(
        nx.generate_gml(self.graph,
                        stringizer=nx.readwrite.gml.literal_stringizer))
    AWSWriter().write_model(
        gml_file, "models/network/" + self.topic + "_communities.gml")
Example #4
def prepare_model_for_db_merge(self):
    """Exposed function for transforming internal model to nested list for neo4j merge"""
    # TODO: Update to include a persona param and write to a new folder
    nodes_list = list(self.graph.nodes(data=True))
    # need to format into nested list for Cypher query to unwind
    model_results = list()
    leader_list = list()
    for node_id, properties in nodes_list:
        if "community" in properties.keys():
            if "leader" in properties.keys():
                if not node_id:
                    continue
                else:
                    if properties["leader"]:
                        leader_list.append(node_id)
                    model_results.append([
                        node_id, properties["community"],
                        properties["leader"]
                    ])
    AWSWriter().write_model(
        str(leader_list), "models/network/" + self.topic + "_leaders.txt")
    return model_results
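
The nested list returned here is shaped for a Cypher UNWIND; a sketch of how it might be consumed (the node label and property names are assumptions, not from the source):

# each row is [node_id, community, leader], matching prepare_model_for_db_merge
model_results = [[1111, 3, True], [2222, 3, False]]
query = ("UNWIND " + str(model_results) + " AS line "
         "MATCH (u:User {id: line[0]}) "
         "SET u.community = line[1], u.leader = line[2]")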
Example #5
def prepare_model_for_db_merge2(self, persona):
    """Exposed function for transforming internal model to nested list for neo4j merge"""
    nodes_list = list(self.graph.nodes(data=True))
    # need to format into nested list for Cypher query to unwind
    model_results = list()
    persona_list = list()
    for node_id, properties in nodes_list:
        if "community" in properties.keys():
            if persona in properties.keys():
                if not node_id:
                    continue
                else:
                    if properties[persona]:
                        persona_list.append(node_id)
                    model_results.append([
                        node_id, properties["community"],
                        persona if properties[persona] else "None"
                    ])
    AWSWriter().write_model(
        str(persona_list),
        "models/network/" + self.topic + "_" + persona + ".txt")
    return model_results
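
Example #5 generalizes Example #4 by parameterizing the flag property, so any persona label can be exported the same way. A hypothetical call, with the persona name assumed:

# "amplifier" is an assumed persona property set on graph nodes
rows = community_model.prepare_model_for_db_merge2("amplifier")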
Example #6
def __init__(self, dev_bolt_uri=DEV_BOLT_URI, prod_bolt_uri=PROD_BOLT_URI):
    """Constructor for DatabaseManager class"""
    self.dev_driver = self.connect_database(dev_bolt_uri)
    self.prod_driver = self.connect_database(prod_bolt_uri)
    self.batch_size = BATCH_SIZE
    self.aws_writer = AWSWriter()
Example #7
class DatabaseManager:
    """Responsible for merging twitter json data into the development Neo4j database and for exporting
    data from dev DB to models"""
    def __init__(self, dev_bolt_uri=DEV_BOLT_URI, prod_bolt_uri=PROD_BOLT_URI):
        """Constructor for DatabaseManager class"""
        self.dev_driver = self.connect_database(dev_bolt_uri)
        self.prod_driver = self.connect_database(prod_bolt_uri)
        self.batch_size = BATCH_SIZE
        self.aws_writer = AWSWriter()

    @staticmethod
    def run_script(driver,
                   script_path,
                   args=None,
                   verbose=False,
                   get_values=False):
        """Helper function to run cypher queries"""
        with open(script_path) as f:
            script = f.read()

        with driver.session() as session:
            res = session.run(script, args)
            if verbose:
                summary = res.summary().counters
                print(script_path)
                print(f"Nodes created: {summary.nodes_created}")
                print(
                    f"Relationships created: {summary.relationships_created}")
                print(f"Properties set: {summary.properties_set}")
            if get_values:
                return res

    @staticmethod
    def connect_database(bolt_uri):
        """Spin up a neo4j driver"""
        driver = GraphDatabase.driver(bolt_uri,
                                      auth=(NEO_USERNAME, NEO_PASSWORD))
        return driver

    @staticmethod
    def format_date(dateobject):
        """Formats a dateobject into MM_DD_YYYY str"""
        return str(dateobject).replace("-", "_")

    def destroy_production_database(self):
        """Remove all nodes and relationships in production database to prepare for rebuild"""
        deleted_nodes = -1
        while deleted_nodes != 0:
            results = self.run_script(self.prod_driver,
                                      os.path.join(ETL_QUERIES_PATH,
                                                   "destroy_database.cypher"),
                                      verbose=True,
                                      get_values=True)
            deleted_nodes = [x[0] for x in results][0]

    def load_data_into_model(self, topic, persona_query, full_graph=False):
        """Exposed function to execute a match query returning edge relationships"""
        args = {"search_term": topic}
        if full_graph:
            results = self.run_script(self.dev_driver,
                                      os.path.join(
                                          ETL_QUERIES_PATH,
                                          "export_full_network.cypher"),
                                      args=args,
                                      verbose=True,
                                      get_values=True)
        else:
            results = self.run_script(self.dev_driver,
                                      os.path.join(ETL_QUERIES_PATH,
                                                   persona_query),
                                      args=args,
                                      verbose=True,
                                      get_values=True)
        return list(map(tuple, results))

    def merge_model_results(self, model_results, output_query):
        """Merge model results back into Neo4j development database"""
        chunks = (len(model_results) - 1) // self.batch_size + 1
        for i in range(chunks):
            batch = model_results[i * self.batch_size:(i + 1) *
                                  self.batch_size]
            query = "UNWIND" + str(batch) + "AS line " + QUERIES[output_query]
            with self.dev_driver.session() as session:
                session.run(query)

    def _write_user_properties(self, user_properties):
        """Write user profile properties to Neo4j as (:User) SET properties"""
        args = {"properties": user_properties}
        self.run_script(self.prod_driver,
                        os.path.join(ETL_QUERIES_PATH,
                                     "import_individual_properties.cypher"),
                        args=args,
                        verbose=True)

    @staticmethod
    def _extract_tweet_properties(tweet, topic):
        """Extract tweet properties from a tweet json and return as a nested list"""
        tweet_properties = dict()
        property_names = [
            "id", "retweet_count", "favorite_count", "full_text", "created_at"
        ]
        for name in property_names:
            tweet_properties[name] = tweet[name]
        tweet_properties["user"] = tweet["user"]["id"]
        tweet_properties["topic"] = topic
        return tweet_properties

    def _write_popular_tweets(self, popular_tweets, topic):
        """Write popular tweets to Neo4j as (:User)-[:Tweeted]-(:Tweet)"""
        for tweet in popular_tweets:
            tweet_properties = self._extract_tweet_properties(tweet, topic)
            args = {"properties": tweet_properties}
            self.run_script(self.prod_driver,
                            os.path.join(ETL_QUERIES_PATH,
                                         "import_popular_tweets.cypher"),
                            args=args,
                            verbose=True)

    def _write_common_hashtags(self, user, common_hashtags, topic):
        """Write common hashtags as (:User)-[:COMMON_HASHTAG]-(:Hashtag)"""
        for hashtag, count in common_hashtags:
            args = {
                "properties": {
                    "name": hashtag,
                    "count": count,
                    "user": user,
                    "topic": topic
                }
            }
            self.run_script(self.prod_driver,
                            os.path.join(ETL_QUERIES_PATH,
                                         "import_common_hashtags.cypher"),
                            args=args,
                            verbose=True)

    def merge_user_profile(self, user_properties, popular_tweets,
                           common_hashtags, topic):
        """Merge user profile properties as User and Tweet nodes into prod database"""
        self._write_user_properties(user_properties)
        self._write_popular_tweets(popular_tweets, topic)
        self._write_common_hashtags(user_properties["user"], common_hashtags,
                                    topic)

    def _load_tweets_json(self, inpath, search_term, recent=True):
        """Load json data into Neo4j development database"""
        if recent:
            # default is to load in the last 24 hours worth of json data
            today = self.format_date(date.today())
            yesterday = self.format_date(date.today() - timedelta(1))
            today_json = self.aws_writer.read_json(inpath + today + ".json")
            yesterday_json = self.aws_writer.read_json(inpath + yesterday +
                                                       ".json")
            if today_json:
                data = json.loads(today_json)
                self.run_script(self.dev_driver,
                                os.path.join(ETL_QUERIES_PATH,
                                             "import_search_api_json.cypher"),
                                args={
                                    "json": data,
                                    "json_file": today,
                                    "search_term": search_term
                                },
                                verbose=True)
            if yesterday_json:
                data = json.loads(yesterday_json)
                self.run_script(self.dev_driver,
                                os.path.join(ETL_QUERIES_PATH,
                                             "import_search_api_json.cypher"),
                                args={
                                    "json": data,
                                    "json_file": yesterday,
                                    "search_term": search_term
                                },
                                verbose=True)
        else:
            # pull the entire tweet datastorage for a topic and load them all in
            tweet_filenames = self.aws_writer.get_all_filenames(
                TWEET_FILEPATH + search_term)
            for filename in tweet_filenames:
                data = json.loads(self.aws_writer.read_json(filename))
                filedate = re.search(r'\d{4}_\d{2}_\d{2}', filename).group()
                self.run_script(self.dev_driver,
                                os.path.join(ETL_QUERIES_PATH,
                                             "import_search_api_json.cypher"),
                                args={
                                    "json": data,
                                    "json_file": filedate,
                                    "search_term": search_term
                                },
                                verbose=True)

    def load_new_tweets(self, search_term, recent=True):
        """Exposed function to load in new tweets by search term into the development database"""
        inpath = "data/tweets/" + search_term + "_tweets_"
        self._load_tweets_json(inpath, search_term, recent)

    def load_nosentiment_tweets(self):
        """Exposed function to load in all tweets that do not have sentiment labels"""
        results = self.run_script(self.dev_driver,
                                  os.path.join(
                                      ETL_QUERIES_PATH,
                                      "export_nosentiment_tweets.cypher"),
                                  get_values=True)
        return [record for record in results.values()]

    def load_production_tweets(self):
        """Exposed function to load in all tweets that do not have sentiment labels"""
        results = self.run_script(self.prod_driver,
                                  os.path.join(
                                      ETL_QUERIES_PATH,
                                      "export_production_tweets.cypher"),
                                  get_values=True)
        return [record for record in results.values()]

    def migrate_dev_to_prod(self, export_users):
        """Migrates the dev database to production"""
        # pull user nodes from dev database
        export_args = {"users_to_migrate": export_users}
        results = self.run_script(
            self.dev_driver,
            os.path.join(ETL_QUERIES_PATH,
                         "export_migration_users_and_topics.cypher"),
            args=export_args,
            get_values=True)
        import_users = [(dict(x['u']._properties), x['t.name'])
                        for x in results.data()]
        for properties, topic_name in import_users:
            properties["topic_name"] = topic_name
        import_users = [x[0] for x in import_users]
        # write user nodes to prod database
        import_args = {"users_to_migrate": import_users}
        self.run_script(self.prod_driver,
                        os.path.join(
                            ETL_QUERIES_PATH,
                            "import_migration_users_and_topics.cypher"),
                        args=import_args,
                        verbose=True)

    def ensure_topic_node(self, topic):
        """Merges the topic node into the production database"""
        args = {"topic": topic}
        self.run_script(self.prod_driver,
                        os.path.join(ETL_QUERIES_PATH,
                                     "import_topic_node.cypher"),
                        args=args,
                        verbose=True)
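
A hedged sketch of typical DatabaseManager usage; the search term appears elsewhere in this project, while the persona query filename is an illustrative assumption:

dbm = DatabaseManager()
dbm.load_new_tweets("homeless", recent=True)  # json from S3 into the dev database
edges = dbm.load_data_into_model(
    topic="homeless",
    persona_query="export_persona_network.cypher")  # assumed filename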
Example #8
def __init__(self):
    """Constructor for Capstone object"""
    # connect to AWS and database
    self.dbm = DatabaseManager()
    self.aws_writer = AWSWriter()
    self.user_limit = 200
Example #9
class Capstone:
    """Capstone is responsible for each step in the end-to-end construction and maintenance of our database"""
    def __init__(self):
        """Constructor for Capstone object"""
        # connect to AWS and database
        self.dbm = DatabaseManager()
        self.aws_writer = AWSWriter()
        self.user_limit = 200

    def destroy_database(self):
        """Tear down whatever is currently in production database"""
        self.dbm.destroy_production_database()

    def load_new_tweets(self):
        """Load new tweets from AWS into the development database"""
        for topic in SEARCH_TOPICS:
            print(f"Loading tweets for {topic} into dev database")
            self.dbm.load_new_tweets(topic, recent=False)
            self.dbm.ensure_topic_node(topic)

    def run_info_map(self):
        """Apply info map on the development dataset to identify communities"""
        print(f"Applying InfoMap to entire graph")
        community_model = CommunityFinder(topic="full_graph")
        center_model = CenterFinder()
        topic_data = self.dbm.load_data_into_model(topic=None,
                                                   persona_query=None,
                                                   full_graph=True)
        community_model.fit(topic_data)
        center_model.fit(community_model.graph,
                         community_model.community_labels)
        community_results = community_model.prepare_model_for_db_merge3()
        self.dbm.merge_model_results(community_results, "output_network")

    def migrate_dev_to_prod(self):
        """Migrate community center users to the production database"""
        users_to_migrate = self.dbm.run_script(
            self.dbm.prod_driver,
            "./etl_queries/export_prod_users_without_tweets.cypher",
            get_values=True)
        users_to_migrate = [x["u.id"] for x in users_to_migrate]
        self.pull_user_timelines(users_to_migrate)
        self.dbm.migrate_dev_to_prod([int(user) for user in users_to_migrate])
        self.evaluate_user_personas(users_to_migrate)

    def pull_user_timelines(self, users_to_migrate):
        """Pull user timelines for user profile generation"""
        users_pulled = list()
        user_limit = self.user_limit
        for user in users_to_migrate:
            if user_limit > 0:
                if self.aws_writer.read_json(f"{user}.json"):
                    continue
                else:
                    UserTimelineGetter(user).get_user_timeline()
                    user_limit -= 1
                    users_pulled.append(user)
            else:
                break
        return users_pulled

    def get_user_behavior(self, user):
        """Pull a user's first degree connections and compute the count of relationship types"""
        export_args = {"users_to_migrate": [user]}
        results = self.dbm.run_script(
            self.dbm.dev_driver,
            "./etl_queries/export_migration_users_and_topics.cypher",
            args=export_args,
            get_values=True)
        user_topics = list(set([x['t.name'] for x in results.data()]))
        user_topic_stats = dict()
        for topic in user_topics:
            results = self.dbm.run_script(
                self.dbm.dev_driver,
                "./etl_queries/export_first_degree_connections.cypher",
                args={
                    "user_id": user,
                    "search_term": topic
                },
                get_values=True,
                verbose=False)
            results = [record for record in results.values()]
            if results:
                relationships = [
                    str(line[0]) + "->" + str(line[1]) for line in results
                ]
                rel_counts = {
                    "TWEETED->RETWEETED": 0,
                    "TWEETED->None": 0,
                    "RETWEETED->TWEETED": 0,
                    "TWEETED->MENTIONS": 0,
                }
                for rel in relationships:
                    if rel == "RETWEETED->RETWEETED" or rel == "RETWEETED->MENTIONS":
                        rel = "RETWEETED->TWEETED"
                    if rel not in rel_counts:
                        continue
                    rel_counts[rel] += 1
                user_topic_stats[topic] = rel_counts
        return user_topic_stats

    def evaluate_user_personas(self, users_to_migrate):
        """Generate persona roles for user nodes based on their twitter behavior"""
        user_rel_stats = dict()
        for user in users_to_migrate:
            user_rel_stats[user] = self.get_user_behavior(user)

        user_personas = list()
        for topic in SEARCH_TOPICS:
            # normalize all stats as a percentage of relationships
            topic_subset = {
                user_id: topic_counts[topic]
                for user_id, topic_counts in user_rel_stats.items()
                if topic in topic_counts
            }

            for user_id, rel_counts in topic_subset.items():
                sum_rel = sum(rel_counts.values())
                if sum_rel == 0:
                    continue
                topic_subset[user_id] = {
                    rel: count / sum_rel
                    for rel, count in rel_counts.items()
                }

            num_users = len(topic_subset)
            # first, get the watchdogs as the top 25% of mentions and remove them from consideration
            topic_mentions = {
                user_id: topic_subset[user_id]["TWEETED->MENTIONS"]
                for user_id in topic_subset
            }
            watchdogs = sorted(topic_mentions.items(),
                               key=operator.itemgetter(1),
                               reverse=True)[:int(num_users * 0.25)]
            for user_id, score in watchdogs:
                user_personas.append({
                    "user_id": user_id,
                    "persona": "watchdog",
                    "search_term": topic
                })
            # remove watchdogs from remaining group (compare ids, not (id, score) tuples)
            watchdog_ids = {user_id for user_id, score in watchdogs}
            topic_subset = {
                k: v
                for k, v in topic_subset.items() if k not in watchdog_ids
            }

            # second, get the amplifiers as the top 25% of retweets and remove them from consideration
            topic_retweets = {
                user_id: topic_subset[user_id]["RETWEETED->TWEETED"]
                for user_id in topic_subset
            }
            amplifiers = sorted(topic_retweets.items(),
                                key=operator.itemgetter(1),
                                reverse=True)[:int(num_users * 0.25)]
            for user_id, score in amplifiers:
                user_personas.append({
                    "user_id": user_id,
                    "persona": "amplifier",
                    "search_term": topic
                })
            # remove amplifiers from remaining group (compare ids, not (id, score) tuples)
            amplifier_ids = {user_id for user_id, score in amplifiers}
            topic_subset = {
                k: v
                for k, v in topic_subset.items() if k not in amplifier_ids
            }

            # third, get the content creators as the top 25% of tweets and remove them from consideration
            topic_content = {
                user_id: topic_subset[user_id]["TWEETED->None"]
                for user_id in topic_subset
            }
            content_creators = sorted(topic_content.items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)[:int(num_users * 0.25)]
            for user_id, score in content_creators:
                user_personas.append({
                    "user_id": user_id,
                    "persona": "content_creator",
                    "search_term": topic
                })

        self.dbm.run_script(self.dbm.prod_driver,
                            "./etl_queries/import_user_personas.cypher",
                            args={"user_persona_list": user_personas},
                            verbose=True)

    def build_user_profiles(self):
        """Build user profiles using user tweet histories"""
        for topic in SEARCH_TOPICS:
            print(
                f"Generating user profiles for users who tweet about {topic}")
            for persona in PERSONAS:
                pb = ProfileBuilder(topic, persona, ignore_sentiment=True)
                # catch any False tag that is thrown
                any_profiles = pb.build_user_profiles2()
                any_profiles = pb.build_user_profiles3()
            self.dbm.run_script(
                self.dbm.prod_driver,
                "./etl_queries/compute_relative_statistics.cypher",
                args={"search_term": topic},
                verbose=True)

    def fill_truncated_retweets(self):
        """Fetch full tweet text for retweets which come in truncated"""
        def divide_chunks(l, n):
            # yield successive n-sized chunks of l
            for i in range(0, len(l), n):
                yield l[i:i + n]

        results = self.dbm.run_script(
            self.dbm.prod_driver,
            "./etl_queries/get_retweet_truncated_text.cypher",
            get_values=True)
        tweet_ids = [record[0] for record in results]
        full_text_map = dict()
        if len(tweet_ids) > 100:
            batch_size = 100
            batch_list = list(divide_chunks(tweet_ids, batch_size))
            for batch in batch_list:
                rftg = RetweetFullTextGetter(batch)
                full_text_map.update(rftg.get_tweet_full_text())
        else:
            rftg = RetweetFullTextGetter(tweet_ids)
            full_text_map = rftg.get_tweet_full_text()

        tweets_with_text = [{
            "id": k,
            "text": v
        } for k, v in full_text_map.items()]
        args = {"tweets_with_text": tweets_with_text}
        self.dbm.run_script(self.dbm.prod_driver,
                            "./etl_queries/import_retweet_full_text.cypher",
                            args=args,
                            verbose=True)

    @staticmethod
    def impute_user_locations():
        """Predict a user's location based on their immediate network of followers"""
        location_model = LocationPropagator(topic="homeless")
        location_model.impute_location()

    def clean_up(self):
        """Delete the tweets about relationship where there is no tweets that match"""
        self.dbm.run_script(self.dbm.prod_driver,
                            "./etl_queries/clean_up.cypher",
                            verbose=True)

    def end_to_end(self):
        """Run the full pipeline from teardown to clean-up"""
        self.destroy_database()
        self.load_new_tweets()
        self.run_info_map()
        # migrate_dev_to_prod pulls user timelines and evaluates user
        # personas internally, so those steps are not repeated here
        self.migrate_dev_to_prod()
        self.build_user_profiles()
        self.fill_truncated_retweets()
        self.impute_user_locations()
        self.clean_up()
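
A minimal sketch of an entry point, assuming this module is executed directly:

if __name__ == "__main__":
    Capstone().end_to_end()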