class DictionaryBuilder:
    """
    Fills a database table with words read from a separated-value source.

    An ``SDParser`` supplies the parsed source lines and a
    ``DatabaseWrapper`` creates the table and stores the rows.
    """

    def __init__(self, **kwargs):
        """
        Create the source parser and the database wrapper.

        All three keyword arguments are required; a missing one raises
        ``KeyError``.

        :param filepath: passed to ``SDParser`` as ``filename``
        :param sep: passed to ``SDParser`` as ``sep``
        :param db_uri: passed to ``DatabaseWrapper``
        """
        source_path = kwargs.pop('filepath')
        separator = kwargs.pop('sep')
        self._source_parser = SDParser(filename=source_path, sep=separator)
        self._wordbase_builder = DatabaseWrapper(kwargs.pop('db_uri'))
        self.primary_col_index = -1

    def __delete__(self, instance):
        """
        Drop the references to the parser and the database wrapper.

        NOTE(review): ``__delete__`` is the descriptor-protocol hook, so it
        only runs when a DictionaryBuilder stored as a class attribute is
        deleted from an owner instance. If a finalizer was intended, that
        would be ``__del__`` -- confirm before changing.
        """
        del self._source_parser
        del self._wordbase_builder

    def build(self, table_name: str, column_infos: dict, language: str,
              start=0, end=0):
        """
        Create ``table_name`` and populate it from the source.

        :param table_name: name of the table to create
        :param column_infos: column description handed to
            ``generate_columns``
        :param language: language whose tag dictionary is used for rows
        :param start: forwarded to the parser as ``start``
        :param end: forwarded to the parser as ``end``
        """
        builder = self._wordbase_builder
        columns = builder.generate_columns(column_infos,
                                           self.primary_col_index)
        builder.create_table(table_name, columns)
        self._parse(table_name, columns, language, start, end)
        print("Database build for \"%s.%s\" finished."
              % (builder.get_dbname(), table_name))

    def resume(self, table_name: str, column_infos: dict, language: str,
               primary_key=0, end=0):
        """
        Continue filling an existing table.

        The parser start position is whatever ``resume_table`` returns for
        ``primary_key``.

        :param table_name: name of the table to continue
        :param column_infos: column description handed to
            ``generate_columns``
        :param language: language whose tag dictionary is used for rows
        :param primary_key: forwarded to ``resume_table``
        :param end: forwarded to the parser as ``end``
        """
        builder = self._wordbase_builder
        columns = builder.generate_columns(column_infos,
                                           self.primary_col_index)
        resume_from = builder.resume_table(table_name, primary_key, columns)
        self._parse(table_name, columns, language, resume_from, end)

    def read(self, table_name, row_num=-1, col_num=-1):
        """Return what the wrapper's ``fetch_row`` yields for the request."""
        return self._wordbase_builder.fetch_row(table_name, row_num, col_num)

    def _variable_row_values(self, language, rows: dict, row_values: str,
                             columns: list, start_index: int) -> None:
        """
        Add the translated tags of ``row_values`` to ``rows`` in place.

        The n-th translated tag is stored under the name of
        ``columns[start_index + n]``.
        """
        tags = self.translate_tag(load_tag(language), row_values)
        for column_index, tag in enumerate(tags, start_index):
            rows[columns[column_index].name] = tag

    def _parse(self, table_name, columns, language, start=0, end=0):
        """Parse the source and insert one row per (word, paradigm) pair."""
        parsed = self._source_parser.parse_lines(0, (2, 3),
                                                 start=start, end=end)
        word_groups = parsed[0].values()
        paradigm_groups = parsed[1].values()
        for words, paradigms in zip(word_groups, paradigm_groups):
            for word, paradigm in zip(words, paradigms):
                row = {'word': word}
                # Tag columns start at index 2 of the column list.
                self._variable_row_values(language, row, paradigm,
                                          columns, 2)
                self._wordbase_builder.insert_values(table_name, row)

    @staticmethod
    def translate_tag(dictionary: dict, row: str):
        """
        Map the space-separated tags of ``row`` to keys of ``dictionary``.

        Each dictionary value is used as a regular-expression fragment and
        matched case-insensitively at the start of a tag, optionally
        preceded by ``<`` and followed by a digit or ``>``.

        :param dictionary: maps result key -> tag pattern fragment
        :param row: space-separated tag string
        :return: list of matching keys in first-match order, each key at
            most once
        """
        translated = []
        for token in row.split(' '):
            for key, fragment in dictionary.items():
                hit = regex.match(r'<?' + fragment + r'(\d?|>?)',
                                  token, regex.I)
                if hit is None or key in translated:
                    continue
                translated.append(key)
        return translated
class DataManager:
    """
    Class for manipulation of data specific to this project's database.
    It essentially wraps around the DatabaseWrapper() class.

    This creates and communicates to the ornus database which has the
    following structure:

    Tables:
        tweets: table with all the tweets from each coin
        twitter_users: table with all the twitter users that were found
            from collecting tweets
        hashtags: table with all the hashtags found in tweets
        tweet_hashtag: many to many relationship between tweets and hashtags
        cryptocurrencies: a table of all the cryptocurrencies
        reddit_comments, reddit_users, reddit_posts, subreddits: tables for
            the reddit data

    Then each cryptocurrency additionally also has its own table storing its
    daily market data. So there is an additional 30 - 100 tables for all the
    cryptocurrencies currently being collected
    """

    def __init__(self, coins):
        """
        :param coins: iterable of coin objects; the methods below use their
            ``name`` attribute and their ``schema()`` and
            ``current_market_data()`` methods
        """
        self.coins = coins
        self._database = DatabaseWrapper()

    @staticmethod
    def _sql_quote(value):
        """
        Return ``value`` as a single-quoted SQL string literal.

        Backslashes and single quotes are doubled so the value cannot end
        the literal early. This assumes MySQL's default backslash-escape
        behaviour.

        NOTE(review): switch to bound parameters if DatabaseWrapper.query()
        supports them.
        """
        escaped = str(value).replace("\\", "\\\\").replace("'", "''")
        return "'" + escaped + "'"

    def insert_hashtag(self, hashtag):
        """Will insert hashtag into the hashtag table"""
        self._database.insert_into_table({"name": hashtag}, "hashtags")

    def insert_twitter_user(self, twitter_user):
        """
        Will insert a twitter user into the 'twitter_users' table, with
        these columns:
            "id": "BIGINT UNSIGNED UNIQUE PRIMARY KEY NOT NULL",
            "date_created": "DATE",
            "followers": "INT UNSIGNED",
            "friends": "INT UNSIGNED",

        :param twitter_user: dict passed to the database as-is
        """
        self._database.insert_into_table(twitter_user, "twitter_users")

    def insert_tweet(self, tweet: dict):
        """
        Will insert a tweet into the 'tweets' table, with these columns:
            "id": "BIGINT UNSIGNED UNIQUE PRIMARY KEY NOT NULL",
            "date": "DATE",
            "content": "VARCHAR(1120) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "coin_id": "INT UNSIGNED NOT NULL",
            "sentiment": "FLOAT",
            "user_id": "BIGINT UNSIGNED NOT NULL",
            "retweets": "INT UNSIGNED",

        Will also add the hashtags to the database, and the twitter user to
        the database.

        The twitter user is always inserted. The tweet and its hashtags are
        skipped when the coin is unknown or when the tweet row cannot be
        written.

        :param tweet: dict with the keys "id", "date", "text", "coin",
            "sentiment", "retweets", "hashtags" and "user" (itself a dict
            with at least "id")
        """
        import logging

        self.insert_twitter_user(tweet["user"])

        formatted_tweet = {
            "id": tweet["id"],
            "date": tweet["date"],
            "content": tweet["text"],
            "coin_id": self.get_coin_id(tweet["coin"]),
            "sentiment": tweet["sentiment"],
            "user_id": tweet["user"]["id"],
            "retweets": tweet["retweets"],
        }

        # A tweet for a coin that is not in the cryptocurrencies table
        # cannot satisfy the NOT NULL coin_id column.
        if formatted_tweet["coin_id"] is None:
            return

        # Best effort: tweets that cannot be stored (e.g. not properly
        # encoded) are skipped, but no longer silently.
        try:
            self._database.insert_into_table(formatted_tweet, "tweets")
        except Exception as exc:
            logging.getLogger(__name__).warning(
                "Skipping tweet %s: %s", tweet["id"], exc)
            return

        # Insert the hashtags into the hashtag table and insert them into
        # the tweet_hashtag table for the many to many relationship between
        # tweets and hashtags
        for hashtag in tweet["hashtags"]:
            self.insert_hashtag(hashtag)
            tweet_hashtag = {
                "tweet_id": tweet["id"],
                "hashtag_id": self.get_hashtag_id(hashtag),
            }
            if None not in tweet_hashtag.values():
                self._database.insert_into_table(tweet_hashtag,
                                                 "tweet_hashtag")

    def get_hashtag_id(self, hashtag: str):
        """
        Returns the id of hashtag in the hashtags table, returns None if the
        hashtag is not in the table or the lookup fails

        :param hashtag: str of the hashtag
        """
        sql = "SELECT id FROM hashtags WHERE name = {0}".format(
            self._sql_quote(hashtag))
        try:
            result = self._database.query(sql)
        except Exception:
            # A failed lookup is treated the same as "not found".
            return None

        # Truthiness also covers drivers that return () or None for
        # "no rows", not only [].
        if not result:
            return None
        return result[0][0]

    def get_coin_id(self, coin: str):
        """
        Returns the id of coin in the cryptocurrency table, returns None if
        coin is not in the table

        :param coin: str of the name of the coin, note: not the ticker
        """
        sql = "SELECT id FROM cryptocurrencies WHERE name = {0}".format(
            self._sql_quote(coin))
        result = self._database.query(sql)

        # Truthiness also covers drivers that return () or None for
        # "no rows", not only [].
        if not result:
            return None
        return result[0][0]

    def fill_cryptocurrency_table(self):
        """
        Will populate the cryptocurrency table in the database with
        everything from coins
        """
        for coin in self.coins:
            self._database.insert_into_table(entry=coin.schema(),
                                             table="cryptocurrencies")

    def fill_market_data_tables(self, sentiment_data: dict, verbose=False):
        """
        Populate each table for each individual cryptocurrency with its
        daily market data

        :param sentiment_data: dict storing the twitter sentiment values for
            each coin, keyed by coin name, so its structure should be:
                {"coin1": {"sum": ..., "length": ...,
                           "pos_sentiment": ..., "neg_sentiment": ...},
                 "coin2": { ... }, ... }
        :param verbose: bool on whether to periodically notify the user how
            much has been completed
        :raises KeyError: if a coin has no entry in sentiment_data
        :raises ZeroDivisionError: if a coin's "length" is 0
        """
        for position, coin in enumerate(self.coins, start=1):
            stats = sentiment_data[coin.name]
            tweet_count = stats["length"]
            average_sentiment = stats["sum"] / tweet_count
            pos_percentage = stats["pos_sentiment"] / tweet_count
            neg_percentage = stats["neg_sentiment"] / tweet_count

            coin_data = coin.current_market_data()
            market_data = {
                "date": coin_data["date"],
                "open": coin_data["open"],
                "high": coin_data["high"],
                "low": coin_data["low"],
                "close": coin_data["close"],
                "volume": coin_data["volume"],
                "num_trades": coin_data["num_trades"],
                "positive_tweet_sentiment": pos_percentage,
                "negative_tweet_sentiment": neg_percentage,
                "average_tweet_sentiment": average_sentiment,
            }
            self._database.insert_into_table(market_data, coin.name)

            if verbose and position % 10 == 0:
                total = len(self.coins)
                print("Processed market data for", position, "of", total,
                      "coins.", end=" ")
                # Report an actual percentage of the coins finished so far.
                print("Percent Complete: {:0.2f}".format(
                    100 * position / total))

    def create_tables(self):
        """
        Creates all the tables Necessary for the data, if the data already
        exists it does nothing
        """
        cryptocurrency_table_schema = {
            "id": "INT UNSIGNED AUTO_INCREMENT PRIMARY KEY NOT NULL",
            "name": "VARCHAR(30) UNIQUE NOT NULL",
            "ticker": "VARCHAR(10) UNIQUE NOT NULL",
        }
        self._database.create_table("cryptocurrencies",
                                    cryptocurrency_table_schema)

        # One market-data table per coin, named after the coin.
        specific_crypto_schema = {
            "date": "DATE UNIQUE PRIMARY KEY NOT NULL",
            "open": "FLOAT",
            "high": "FLOAT",
            "low": "FLOAT",
            "close": "FLOAT",
            "volume": "FLOAT",
            "num_trades": "INT UNSIGNED",
            "positive_tweet_sentiment": "FLOAT",
            "negative_tweet_sentiment": "FLOAT",
            "average_tweet_sentiment": "FLOAT",
        }
        for coin in self.coins:
            self._database.create_table(coin.name, specific_crypto_schema)

        twitter_users_schema = {
            "id": "BIGINT UNSIGNED UNIQUE PRIMARY KEY NOT NULL",
            "date_created": "DATE",
            "followers": "INT UNSIGNED",
            "friends": "INT UNSIGNED",
        }
        self._database.create_table("twitter_users", twitter_users_schema)

        tweets_schema = {
            "id": "BIGINT UNSIGNED UNIQUE PRIMARY KEY NOT NULL",
            "date": "DATE",
            "content": "VARCHAR(1120) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "coin_id": "INT UNSIGNED NOT NULL",
            "sentiment": "FLOAT",
            "user_id": "BIGINT UNSIGNED NOT NULL",
            "retweets": "INT UNSIGNED",
        }
        tweets_foreign_keys = {
            "coin_id": ("cryptocurrencies", "id"),
            "user_id": ("twitter_users", "id"),
        }
        self._database.create_table("tweets", tweets_schema,
                                    tweets_foreign_keys)

        hashtag_schema = {
            "id": "INT UNSIGNED AUTO_INCREMENT PRIMARY KEY NOT NULL",
            "name": "VARCHAR(50) UNIQUE NOT NULL",
        }
        self._database.create_table("hashtags", hashtag_schema)

        # The join table has a composite primary key, so it is created with
        # raw SQL and guarded by an explicit existence check.
        if "tweet_hashtag" not in self._database.show_tables():
            sql_for_tweet_hashtag = """
                CREATE TABLE tweet_hashtag (
                    tweet_id BIGINT UNSIGNED NOT NULL,
                    hashtag_id INTEGER UNSIGNED NOT NULL,
                    FOREIGN KEY (tweet_id) REFERENCES tweets (id) ON DELETE RESTRICT ON UPDATE CASCADE,
                    FOREIGN KEY (hashtag_id) REFERENCES hashtags (id) ON DELETE RESTRICT ON UPDATE CASCADE,
                    PRIMARY KEY (tweet_id, hashtag_id)
                );
            """
            self._database.execute(sql_for_tweet_hashtag)

        reddit_comments_schema = {
            "id": "VARCHAR(20) UNIQUE PRIMARY KEY NOT NULL",
            "date": "DATE",
            "content": "VARCHAR(8000) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "coin_id": "INT UNSIGNED NOT NULL",
            "sentiment": "FLOAT",
            "user_id": "BIGINT UNSIGNED NOT NULL",
            "score": "INT UNSIGNED",
            "parent_id": "BIGINT UNSIGNED",
            "permalink": "VARCHAR(100)",
            "submission_id": "VARCHAR(15)",
        }
        self._database.create_table("reddit_comments",
                                    reddit_comments_schema)

        reddit_user_schema = {
            "id": "VARCHAR(20) UNIQUE PRIMARY KEY NOT NULL",
            # NOTE(review): "******" is not a SQL column type; it looks like
            # a masked value. Confirm the intended type for this column.
            "username": "******",
            "date_created": "DATE",
            "link_karma": "INT UNSIGNED",
            "comment_karma": "INT UNSIGNED",
            "subreddit_1_id": "VARCHAR(15)",
            "subreddit_2_id": "VARCHAR(15)",
            "subreddit_3_id": "VARCHAR(15)",
        }
        self._database.create_table("reddit_users", reddit_user_schema)

        reddit_post_schema = {
            "id": "VARCHAR(20) UNIQUE PRIMARY KEY NOT NULL",
            "date": "DATE",
            "title": "VARCHAR(500) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "content": "VARCHAR(10000) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "coin_id": "INT UNSIGNED NOT NULL",
            "sentiment": "FLOAT",
            "user_id": "VARCHAR(20)",
            "score": "INT UNSIGNED",
            "num_comments": "INT UNSIGNED",
            "upvote_percentage": "FLOAT UNSIGNED",
            "subreddit_id": "BIGINT UNSIGNED NOT NULL",
            "link": "VARCHAR(100)",
        }
        self._database.create_table("reddit_posts", reddit_post_schema)

        subreddit_schema = {
            "id": "INT UNSIGNED AUTO_INCREMENT PRIMARY KEY NOT NULL",
            "name": "VARCHAR(20) UNIQUE NOT NULL",
            "subscribers": "INT UNSIGNED",
            "date_created": "DATE",
        }
        self._database.create_table("subreddits", subreddit_schema)