コード例 #1
0
class DictionaryBuilder:
    """Build a word table in a database from a separated-value source file.

    The class glues together an ``SDParser`` (reads the source file) and a
    ``DatabaseWrapper`` (creates the table and inserts the rows).
    """

    def __init__(self, **kwargs):
        """Create the source parser and the database wrapper.

        Required keyword arguments (a missing one raises ``KeyError``):
            filepath: forwarded to ``SDParser`` as ``filename``.
            sep: forwarded to ``SDParser`` as ``sep``.
            db_uri: forwarded to ``DatabaseWrapper``.
        """
        self._source_parser = SDParser(filename=kwargs.pop('filepath'),
                                       sep=kwargs.pop('sep'))
        self._wordbase_builder = DatabaseWrapper(kwargs.pop('db_uri'))
        # Passed unchanged to ``generate_columns`` by build() and resume().
        self.primary_col_index = -1

    def __delete__(self, instance):
        """Drop the references to the parser and the database wrapper.

        NOTE(review): ``__delete__`` is the descriptor-protocol hook; Python
        does not call it when the object is destroyed (that is ``__del__``).
        It is kept under its original name so existing behavior is unchanged
        -- confirm whether a destructor or an explicit ``close()`` was meant.
        """
        del self._source_parser
        del self._wordbase_builder

    def build(self,
              table_name: str,
              column_infos: dict,
              language: str,
              start=0,
              end=0):
        """Create ``table_name`` and fill it from the source file.

        :param table_name: name of the table to create and populate
        :param column_infos: column description handed to
                             ``DatabaseWrapper.generate_columns``
        :param language: forwarded to ``load_tag`` when rows are translated
        :param start: forwarded to ``SDParser.parse_lines`` as ``start``
        :param end: forwarded to ``SDParser.parse_lines`` as ``end``
        """
        columns = self._wordbase_builder.generate_columns(
            column_infos, self.primary_col_index)
        self._wordbase_builder.create_table(table_name, columns)
        self._parse(table_name, columns, language, start, end)
        print("Database build for \"%s.%s\" finished." %
              (self._wordbase_builder.get_dbname(), table_name))

    def resume(self,
               table_name: str,
               column_infos: dict,
               language: str,
               primary_key=0,
               end=0):
        """Continue filling an existing table.

        The start position is whatever ``DatabaseWrapper.resume_table``
        returns for ``table_name`` / ``primary_key``.

        :param table_name: name of the table to continue populating
        :param column_infos: column description handed to
                             ``DatabaseWrapper.generate_columns``
        :param language: forwarded to ``load_tag`` when rows are translated
        :param primary_key: forwarded to ``DatabaseWrapper.resume_table``
        :param end: forwarded to ``SDParser.parse_lines`` as ``end``
        """
        columns = self._wordbase_builder.generate_columns(
            column_infos, self.primary_col_index)
        start = self._wordbase_builder.resume_table(table_name, primary_key,
                                                    columns)
        self._parse(table_name, columns, language, start, end)

    def read(self, table_name, row_num=-1, col_num=-1):
        """Return the result of ``DatabaseWrapper.fetch_row`` for the table."""
        return self._wordbase_builder.fetch_row(table_name, row_num, col_num)

    def _variable_row_values(self, language, rows: dict, row_values: str,
                             columns: list, start_index: int) -> None:
        """Translate ``row_values`` and store the result in ``rows`` in place.

        The n-th translated value is stored under the name of
        ``columns[start_index + n]``; an ``IndexError`` is raised when there
        are more translated values than remaining columns.
        """
        translated_values = self.translate_tag(load_tag(language), row_values)
        for index, value in enumerate(translated_values, start_index):
            rows[columns[index].name] = value

    def _parse(self, table_name, columns, language, start=0, end=0):
        """Parse the source file and insert one row per (word, paradigm) pair.

        ``parse_lines`` is expected to return something indexable whose
        items 0 and 1 expose ``.values()`` yielding parallel sequences of
        words and paradigm strings -- TODO confirm against ``SDParser``.
        """
        parsed = self._source_parser.parse_lines(0, (2, 3),
                                                 start=start,
                                                 end=end)
        for words, paradigms in zip(parsed[0].values(), parsed[1].values()):
            for word, paradigm in zip(words, paradigms):
                row_values = {'word': word}
                # Translated tags are written starting at column index 2.
                self._variable_row_values(language, row_values, paradigm,
                                          columns, 2)
                self._wordbase_builder.insert_values(table_name, row_values)

    @staticmethod
    def translate_tag(dictionary: dict, row: str):
        """Map the space-separated tags in ``row`` to keys of ``dictionary``.

        Every token of ``row`` is tested against every dictionary value; a
        value matches when the token starts with it (case-insensitive),
        optionally preceded by ``<``. Dictionary values are inserted into the
        pattern unescaped, so they are treated as regular expressions.

        :param dictionary: maps the translated name to the tag to look for
        :param row: space-separated tag string
        :return: list of matched keys, in order of first match, no duplicates
        """
        # Compile each pattern once instead of once per (token, tag) pair.
        patterns = [(key, regex.compile(r'<?' + tag + r'(\d?|>?)', regex.I))
                    for key, tag in dictionary.items()]
        translated_values = []
        seen = set()
        for value in row.split(' '):
            for key, pattern in patterns:
                if key in seen:
                    continue
                if pattern.match(value) is not None:
                    seen.add(key)
                    translated_values.append(key)
        return translated_values
コード例 #2
0
class DataManager:
    """
    Class for manipulation of data specific to this project's database.
    It essentially wraps around the DatabaseWrapper() class.
    This creates and communicates with the ornus database, which has the
    following structure:

    Tables:
        tweets: table with all the tweets from each coin
        twitter_users: table with all the twitter users that were found from
                       collecting tweets
        hashtags: table with all the hashtags found in tweets
        tweet_hashtag: many to many relationship between tweets and hashtags
        cryptocurrencies: a table of all the cryptocurrencies
        reddit_comments, reddit_users, reddit_posts, subreddits: tables for
                       reddit data (created by create_tables())

        Then each cryptocurrency additionally also has its own table storing
        its daily market data. So there is an additional 30 - 100 tables for
        all the cryptocurrencies currently being collected
    """
    def __init__(self, coins):
        """
        :param coins: iterable of coin objects; the methods below use each
                      coin's ``name`` attribute and its ``schema()`` and
                      ``current_market_data()`` methods
        """
        self.coins = coins
        self._database = DatabaseWrapper()

    @staticmethod
    def _quote_literal(value) -> str:
        """
        Returns value as a quoted SQL string literal with backslashes and
        single quotes escaped, so it cannot terminate the literal early.

        NOTE(review): assumes MySQL string-literal rules (the schemas in
        create_tables() use MySQL syntax). If DatabaseWrapper.query() accepts
        bound parameters, prefer those over this helper.
        """
        escaped = str(value).replace("\\", "\\\\").replace("'", "''")
        return "'" + escaped + "'"

    def insert_hashtag(self, hashtag):
        """Will insert hashtag into the 'hashtags' table"""
        self._database.insert_into_table({"name": hashtag}, "hashtags")

    def insert_twitter_user(self, twitter_user):
        """
        Will insert a twitter user into the 'twitter_users' table, with these
        columns:
            "id": "BIGINT UNSIGNED UNIQUE PRIMARY KEY NOT NULL",
            "date_created": "DATE",
            "followers": "INT UNSIGNED",
            "friends": "INT UNSIGNED",
        """
        self._database.insert_into_table(twitter_user, "twitter_users")

    def insert_tweet(self, tweet: dict):
        """
        Will insert a tweet into the 'tweets' table, with these columns:
            "id": "BIGINT UNSIGNED UNIQUE PRIMARY KEY NOT NULL",
            "date": "DATE",
            "content": "VARCHAR(1120) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "coin_id": "INT UNSIGNED NOT NULL",
            "sentiment": "FLOAT",
            "user_id": "BIGINT UNSIGNED NOT NULL",
            "retweets": "INT UNSIGNED",

        Will also add the hashtags to the database, and the twitter user to the database

        :param tweet: dict with the keys "id", "date", "text", "coin",
                      "sentiment", "retweets", "hashtags" (iterable of str)
                      and "user" (a dict that contains at least "id")
        """
        self.insert_twitter_user(tweet["user"])

        formatted_tweet = {
            "id": tweet["id"],
            "date": tweet["date"],
            "content": tweet["text"],
            "coin_id": self.get_coin_id(tweet["coin"]),
            "sentiment": tweet["sentiment"],
            "user_id": tweet["user"]["id"],
            "retweets": tweet["retweets"],
        }
        # NOTE(review): when the coin is unknown the tweet row is skipped but
        # the hashtags below are still processed -- confirm this is intended.
        if formatted_tweet["coin_id"] is not None:
            # Deliberate best-effort: tweets that cannot be stored (e.g. not
            # properly encoded) are ignored together with their hashtags.
            try:
                self._database.insert_into_table(formatted_tweet, "tweets")
            except Exception:
                return

        # Insert the hashtags into the hashtag table and insert them into the
        # tweet_hashtag table for the many to many relationship between tweets
        # and hashtags
        for hashtag in tweet["hashtags"]:
            self.insert_hashtag(hashtag)
            tweet_hashtag = {
                "tweet_id": tweet["id"],
                "hashtag_id": self.get_hashtag_id(hashtag),
            }
            if None not in tweet_hashtag.values():
                self._database.insert_into_table(tweet_hashtag,
                                                 "tweet_hashtag")

    def get_hashtag_id(self, hashtag: str):
        """
        Returns the id of hashtag in the hashtags table,
        returns None if the hashtag is not in the table or the query fails
        :param hashtag: str of the hashtag
        """
        try:
            sql = "SELECT id FROM hashtags WHERE name = {0}".format(
                self._quote_literal(hashtag))
            result = self._database.query(sql)
        except Exception:
            return None
        # "not result" also covers drivers that return an empty tuple.
        if not result:
            return None
        return result[0][0]

    def get_coin_id(self, coin: str):
        """
        Returns the id of coin in the cryptocurrency table,
        returns None if coin is not in the table
        :param coin: str of the name of the coin, note: not the ticker
        """
        sql = "SELECT id FROM cryptocurrencies WHERE name = {0}".format(
            self._quote_literal(coin))
        result = self._database.query(sql)
        # "not result" also covers drivers that return an empty tuple.
        if not result:
            return None
        return result[0][0]

    def fill_cryptocurrency_table(self):
        """
        Will populate the cryptocurrency table in the database
        with everything from coins
        """
        for coin in self.coins:
            self._database.insert_into_table(entry=coin.schema(),
                                             table="cryptocurrencies")

    def fill_market_data_tables(self, sentiment_data: dict, verbose=False):
        """
        Populate each table for each individual cryptocurrency with its daily market data
        :param sentiment_data: dict storing the twitter sentiment values for each coin,
                               keyed by coin name; each value is a dict with the keys
                               "sum", "length", "pos_sentiment" and "neg_sentiment"
        :param verbose: bool on whether to periodically notify the user how much has been completed
        :raises KeyError: if a coin has no entry in sentiment_data
        :raises ZeroDivisionError: if a coin's "length" is 0
        """
        total = len(self.coins)
        for index, coin in enumerate(self.coins):
            stats = sentiment_data[coin.name]
            average_sentiment = stats["sum"] / stats["length"]
            pos_percentage = stats["pos_sentiment"] / stats["length"]
            neg_percentage = stats["neg_sentiment"] / stats["length"]

            coin_data = coin.current_market_data()
            market_data = {
                "date": coin_data["date"],
                "open": coin_data["open"],
                "high": coin_data["high"],
                "low": coin_data["low"],
                "close": coin_data["close"],
                "volume": coin_data["volume"],
                "num_trades": coin_data["num_trades"],
                "positive_tweet_sentiment": pos_percentage,
                "negative_tweet_sentiment": neg_percentage,
                "average_tweet_sentiment": average_sentiment,
            }
            self._database.insert_into_table(market_data, coin.name)

            processed = index + 1
            if processed % 10 == 0 and verbose:
                print("Processed market data for", processed,
                      "of",
                      total,
                      "coins.",
                      end=" ")
                # Report a real percentage of the coins processed so far.
                print("Percent Complete: {:0.2f}".format(100 * processed /
                                                         total))

    def create_tables(self):
        """
        Creates all the tables necessary for the data.
        The tweet_hashtag table is only created when show_tables() does not
        list it; all other tables are handed to DatabaseWrapper.create_table()
        (presumably a no-op for tables that already exist -- TODO confirm).
        """
        cryptocurrency_table_schema = {
            "id": "INT UNSIGNED AUTO_INCREMENT PRIMARY KEY NOT NULL",
            "name": "VARCHAR(30) UNIQUE NOT NULL",
            "ticker": "VARCHAR(10) UNIQUE NOT NULL",
        }
        self._database.create_table("cryptocurrencies",
                                    cryptocurrency_table_schema)

        # One market-data table per coin, named after the coin.
        specific_crypto_schema = {
            "date": "DATE UNIQUE PRIMARY KEY NOT NULL",
            "open": "FLOAT",
            "high": "FLOAT",
            "low": "FLOAT",
            "close": "FLOAT",
            "volume": "FLOAT",
            "num_trades": "INT UNSIGNED",
            "positive_tweet_sentiment": "FLOAT",
            "negative_tweet_sentiment": "FLOAT",
            "average_tweet_sentiment": "FLOAT",
        }
        for coin in self.coins:
            self._database.create_table(coin.name, specific_crypto_schema)

        twitter_users_schema = {
            "id": "BIGINT UNSIGNED UNIQUE PRIMARY KEY NOT NULL",
            "date_created": "DATE",
            "followers": "INT UNSIGNED",
            "friends": "INT UNSIGNED",
        }
        self._database.create_table("twitter_users", twitter_users_schema)

        tweets_schema = {
            "id": "BIGINT UNSIGNED UNIQUE PRIMARY KEY NOT NULL",
            "date": "DATE",
            "content":
            "VARCHAR(1120) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "coin_id": "INT UNSIGNED NOT NULL",
            "sentiment": "FLOAT",
            "user_id": "BIGINT UNSIGNED NOT NULL",
            "retweets": "INT UNSIGNED",
        }
        # column -> (referenced table, referenced column)
        tweets_foreign_keys = {
            "coin_id": ("cryptocurrencies", "id"),
            "user_id": ("twitter_users", "id"),
        }
        self._database.create_table("tweets", tweets_schema,
                                    tweets_foreign_keys)

        hashtag_schema = {
            "id": "INT UNSIGNED AUTO_INCREMENT PRIMARY KEY NOT NULL",
            "name": "VARCHAR(50) UNIQUE NOT NULL",
        }
        self._database.create_table("hashtags", hashtag_schema)

        # The join table has a composite primary key, so it is created with
        # raw SQL instead of create_table().
        if "tweet_hashtag" not in self._database.show_tables():

            sql_for_tweet_hashtag = """
CREATE TABLE tweet_hashtag (
    tweet_id BIGINT UNSIGNED NOT NULL,
    hashtag_id INTEGER UNSIGNED NOT NULL,
    FOREIGN KEY (tweet_id) REFERENCES tweets (id) ON DELETE RESTRICT ON UPDATE CASCADE,
    FOREIGN KEY (hashtag_id) REFERENCES hashtags (id) ON DELETE RESTRICT ON UPDATE CASCADE,
    PRIMARY KEY (tweet_id, hashtag_id)
); """
            self._database.execute(sql_for_tweet_hashtag)

        # NOTE(review): user_id is BIGINT here while reddit_users.id is
        # VARCHAR(20) -- confirm which type is intended.
        reddit_comments_schema = {
            "id": "VARCHAR(20) UNIQUE PRIMARY KEY NOT NULL",
            "date": "DATE",
            "content":
            "VARCHAR(8000) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "coin_id": "INT UNSIGNED NOT NULL",
            "sentiment": "FLOAT",
            "user_id": "BIGINT UNSIGNED NOT NULL",
            "score": "INT UNSIGNED",
            "parent_id": "BIGINT UNSIGNED",
            "permalink": "VARCHAR(100)",
            "submission_id": "VARCHAR(15)",
        }
        self._database.create_table("reddit_comments", reddit_comments_schema)

        # NOTE(review): the "username" column type looks redacted ("******")
        # and is not a valid SQL type -- restore the real definition.
        reddit_user_schema = {
            "id": "VARCHAR(20) UNIQUE PRIMARY KEY NOT NULL",
            "username": "******",
            "date_created": "DATE",
            "link_karma": "INT UNSIGNED",
            "comment_karma": "INT UNSIGNED",
            "subreddit_1_id": "VARCHAR(15)",
            "subreddit_2_id": "VARCHAR(15)",
            "subreddit_3_id": "VARCHAR(15)",
        }
        self._database.create_table("reddit_users", reddit_user_schema)

        # NOTE(review): subreddit_id is BIGINT here while subreddits.id is
        # INT UNSIGNED -- confirm which type is intended.
        reddit_post_schema = {
            "id": "VARCHAR(20) UNIQUE PRIMARY KEY NOT NULL",
            "date": "DATE",
            "title": "VARCHAR(500) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "content":
            "VARCHAR(10000) CHARACTER SET utf8 COLLATE utf8_unicode_ci",
            "coin_id": "INT UNSIGNED NOT NULL",
            "sentiment": "FLOAT",
            "user_id": "VARCHAR(20)",
            "score": "INT UNSIGNED",
            "num_comments": "INT UNSIGNED",
            "upvote_percentage": "FLOAT UNSIGNED",
            "subreddit_id": "BIGINT UNSIGNED NOT NULL",
            "link": "VARCHAR(100)",
        }
        self._database.create_table("reddit_posts", reddit_post_schema)

        subreddit_schema = {
            "id": "INT UNSIGNED AUTO_INCREMENT PRIMARY KEY NOT NULL",
            "name": "VARCHAR(20) UNIQUE NOT NULL",
            "subscribers": "INT UNSIGNED",
            "date_created": "DATE",
        }
        self._database.create_table("subreddits", subreddit_schema)