Example #1
0
def reset_db(db='test', **db_settings):
    if db == 'test':
        db_settings = {'host': 'localhost', 'db': 'tweets_test', 'user': '******',
                       'passwd': environ.get('TWEETS_TESTDB_PASSWORD')}
        redis_settings = {'host': 'localhost', 'db': '0'}
    elif db == 'mbp2':
        db_settings = {'host': 'localhost',
                       'db': 'twitter',
                       'user': '******',
                       'passwd': environ.get('TWEETS_DB_PASSWORD')}
        redis_settings = {'host': 'localhost', 'db': 2}
    else:
        if not db_settings:
            raise ValueError("Must supply valid db name or settings")
    cache = SafeRedis(**redis_settings)
    cache.flushdb()
    print "Flushed Redis cache"
    db = MySQLdb.connect(**db_settings)
    cursor = db.cursor()
    print 'Wiping {db_name}'.format(db_name=db)
    try:
        for table in (
                'cluster', 'hashtag', 'hashtag_to_cluster', 'hashtag_to_tweet', 'tag_word', 'tag_word_to_hashtag', 'url',
                'url_to_tweet', 'user_to_tweet', 'user_to_url', 'word', 'word_to_tweet', 'twitter_user', 'tweet'):
            cursor.execute("""DELETE FROM `{table}`;""".format(table=table))
            print "Wiped {table}".format(table=table)
    finally:
        db.commit()
Example #2
0
def reset_db(db='test', **db_settings):
    if db == 'test':
        db_settings = {
            'host': 'localhost',
            'db': 'tweets_test',
            'user': '******',
            'passwd': environ.get('TWEETS_TESTDB_PASSWORD')
        }
        redis_settings = {'host': 'localhost', 'db': '0'}
    elif db == 'mbp2':
        db_settings = {
            'host': 'localhost',
            'db': 'twitter',
            'user': '******',
            'passwd': environ.get('TWEETS_DB_PASSWORD')
        }
        redis_settings = {'host': 'localhost', 'db': 2}
    else:
        if not db_settings:
            raise ValueError("Must supply valid db name or settings")
    cache = SafeRedis(**redis_settings)
    cache.flushdb()
    print "Flushed Redis cache"
    db = MySQLdb.connect(**db_settings)
    cursor = db.cursor()
    print 'Wiping {db_name}'.format(db_name=db)
    try:
        for table in ('cluster', 'hashtag', 'hashtag_to_cluster',
                      'hashtag_to_tweet', 'tag_word', 'tag_word_to_hashtag',
                      'url', 'url_to_tweet', 'user_to_tweet', 'user_to_url',
                      'word', 'word_to_tweet', 'twitter_user', 'tweet'):
            cursor.execute("""DELETE FROM `{table}`;""".format(table=table))
            print "Wiped {table}".format(table=table)
    finally:
        db.commit()
Example #3
0
 def __init__(self, use_testdb=False, show_sql=True):
     """
     Initialize the Inserter.
     :param test: whether to use the test db/Redis settings
     :type test: bool
     """
     if use_testdb:
         db_settings = {
             'host': 'localhost',
             'db': 'tweets_test',
             'user': '******',
             'passwd': environ.get('TWEETS_TESTDB_PASSWORD')
         }
         redis_settings = {'host': 'localhost', 'db': '0'}
     else:
         db_settings = {
             'host': 'localhost',
             'db': 'twitter',
             'user': '******',
             'passwd': environ.get('TWEETS_DB_PASSWORD')
         }
         redis_settings = {'host': 'localhost', 'db': '2'}
     self.show_sql = show_sql
     self.DB = MySQLdb.connect(**db_settings)
     self.cursor = self.DB.cursor()
     self.cache = SafeRedis(**redis_settings)
     for table in ('cluster', 'hashtag', 'tag_word', 'url', 'word'):
         key = 'pk_{table}'.format(table=table)
         if self.cache.get(key) is None:
             self.cache.set(key, 0)
     self.escape_word = self.DB.escape
Example #4
0
 def __init__(self, use_testdb=False, show_sql=True):
     """
     Initialize the Inserter.
     :param test: whether to use the test db/Redis settings
     :type test: bool
     """
     if use_testdb:
         db_settings = {'host': 'localhost',
                        'db': 'tweets_test',
                        'user': '******',
                        'passwd': environ.get('TWEETS_TESTDB_PASSWORD')}
         redis_settings = {'host': 'localhost',
                           'db': '0'}
     else:
         db_settings = {'host': 'localhost',
                        'db': 'twitter',
                        'user': '******',
                        'passwd': environ.get('TWEETS_DB_PASSWORD')}
         redis_settings = {'host': 'localhost',
                           'db': '2'}
     self.show_sql = show_sql
     self.DB = MySQLdb.connect(**db_settings)
     self.cursor = self.DB.cursor()
     self.cache = SafeRedis(**redis_settings)
     for table in ('cluster', 'hashtag', 'tag_word', 'url', 'word'):
         key = 'pk_{table}'.format(table=table)
         if self.cache.get(key) is None:
             self.cache.set(key, 0)
     self.escape_word = self.DB.escape
Example #5
0
ROUTER_IP = '24.186.113.22'
REDIS_HOST = '24.186.113.22'
REDIS_PORT = '6666'
REDIS_DB = 1

DOMAIN_PAT = re.compile(r'https?://([\w\d\.\-]+\.\w{2,3})')
ERR_PAT = re.compile(r'host=\'([\w\d\.]+)\'')

if os.path.exists(os.path.expanduser("~/PycharmProjects/tweet_stuff")):
    home_dir = os.path.expanduser("~/PycharmProjects/tweet_stuff")
elif os.path.exists(os.path.expanduser("~/tweet_stuff")):
    home_dir = os.path.expanduser("~/tweet_stuff")
IN_DIR = os.path.join(home_dir, "extracted2")
OUT_DIR = os.path.join(home_dir, "fixed")

CONN = SafeRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)


def resolve_redirects(url):
    print "Resolving {}".format(url)
    cached = CONN.get(url)
    if cached:
        return cached
    session = requests.session()
    try:
        with closing(session.head(url, timeout=300)) as req:
            r = req
    except (exceptions.RequestException, socket.error) as e:
        try:
            requests.head('http://{}'.format(ROUTER_IP)).close()
            CONN.set(url, url)
Example #6
0
class Inserter(object):
    """
    Class to handle insertion of twitter data into the database.
    NB: tweet ids and user ids are 18- and 9-digit integers, respectively. These values are represented as strings in the
    data, and treated as such in the code, but are translated into bigints by MySQLdb.
    """

    def __init__(self, use_testdb=False, show_sql=True):
        """
        Initialize the Inserter.
        :param test: whether to use the test db/Redis settings
        :type test: bool
        """
        if use_testdb:
            db_settings = {'host': 'localhost',
                           'db': 'tweets_test',
                           'user': '******',
                           'passwd': environ.get('TWEETS_TESTDB_PASSWORD')}
            redis_settings = {'host': 'localhost',
                              'db': '0'}
        else:
            db_settings = {'host': 'localhost',
                           'db': 'twitter',
                           'user': '******',
                           'passwd': environ.get('TWEETS_DB_PASSWORD')}
            redis_settings = {'host': 'localhost',
                              'db': '2'}
        self.show_sql = show_sql
        self.DB = MySQLdb.connect(**db_settings)
        self.cursor = self.DB.cursor()
        self.cache = SafeRedis(**redis_settings)
        for table in ('cluster', 'hashtag', 'tag_word', 'url', 'word'):
            key = 'pk_{table}'.format(table=table)
            if self.cache.get(key) is None:
                self.cache.set(key, 0)
        self.escape_word = self.DB.escape

    def get_or_set_pk(self, table, val, get_only=False):
        """
        Try to retrieve the primary key of a datum from the Redis cache. If none is found, increment the cached pk
        counter for the table in question and use that.
        :param table: the db table the entry is going to be added to
        :type table: str (one of 'cluster', 'hashtag', 'hashtag_to_cluster', 'tag_word', 'tag_word_to_hashtag', 'url',
            'url_to_tweet', 'user_to_tweet', 'word', 'word_to_tweet')
        :param val: the value in question
        :type val: str, probably
        :param get_only: disable creation of new pks, raising an error if the value doesn't already have a pk in the cache
        :type get_only: bool
        :return: int
        """
        key = '{table}_{val}'.format(table=table, val=val[:355])
        cached_pk = self.cache.get(key)
        if cached_pk is not None:
            return int(cached_pk)  # redis-py returns strings by default
        else:
            if get_only:
                raise ValueError("No primary key found for value {val} for table {table}".format(val=val, table=table))
            new_pk = self.cache.incr('pk_{table}'.format(table=table))  # .incr increments the value by 1 and returns the result as an int
            self.cache.set(key, new_pk)
            return new_pk

    def insert_tweet_data(self, tid, hour, day, is_weekday):
        """
        Insert all tweet-related data. NB: this only inserts data for the 'tweet' column, NOT all the data about a
        tweet.
        :param id: the id of the tweet
        :type id: str
        :param hour: the hour of the day the tweet was tweeted
        :type hour: int (0-23)
        :param day: the day of the week the tweet was tweeted
        :type day: int (0-7)
        :param is_weekday: whether the tweet was tweeted on a weekday
        :type is_weekday: int (0 or 1)
        :return: None
        """
        if hour == "-1":
            hour = "NULL"
        if day == "-1":
            day = "NULL"
        if is_weekday == "-1":
            is_weekday = "NULL"
        try:
            tweet_insert_command = """
                    INSERT INTO `tweet` (tweet_id, hour, day, is_weekday)
                    VALUES({id},{hour},{day},{is_w})
                    ON DUPLICATE KEY UPDATE tweet_id={id}, hour={hour}, day={day}, is_weekday={is_w}
                """.format(id=tid, hour=hour, day=day, is_w=is_weekday)
            if self.show_sql:
                print tweet_insert_command
            self.cursor.execute(tweet_insert_command)
        except MySQLdb.IntegrityError:
            print "Tweet with id {tid} already exists".format(tid=tid)

    def insert_mention(self, tweet_id, user_id):
        """
        Insert mentions. Per this schema, a mention is the same as a user tweeting a tweet, except 'is_mention' is set
        to 1 (i.e., True).
        :param tweet_id: the id of the tweet
        :type tweet_id: str
        :param user_id: the id of the user mentioned in the tweet
        :type user_id: str
        :return: None
        """
        add_user_command = """
            INSERT IGNORE INTO `twitter_user` (user_id) VALUES({user_id})
            """.format(user_id=user_id)
        if self.show_sql:
            print add_user_command
        self.cursor.execute(add_user_command)
        
        add_tweet_command = """
            INSERT IGNORE INTO `tweet` (tweet_id) VALUES({tweet_id})
            """.format(tweet_id=tweet_id)
        if self.show_sql:
            print add_tweet_command
        self.cursor.execute(add_tweet_command)
        
        mention_insert_command = """
                INSERT INTO `user_to_tweet` (tweet_id, user_id, is_mention)
                VALUES({tid},{uid},{is_m});""".format(tid=tweet_id, uid=user_id, is_m=1)
        if self.show_sql:
            print mention_insert_command
        self.cursor.execute(mention_insert_command)
        

    def insert_hashtag(self, hashtag_str, tweet_id):
        """
        Insert hashtag-related data.
        :param hashtag_str: the text of the hashtag
        :type hashtag_str: str
        :param tweet_id: the id of the tweet the hashtag came from
        :type tweet_id: str
        :return: int (primary key of hashtag)
        """
        hashtag_pk = self.get_or_set_pk('hashtag', hashtag_str)
        add_tweet_command = """
            INSERT IGNORE INTO `tweet` (tweet_id) VALUES({tweet_id})
            """.format(tweet_id=tweet_id)
        if self.show_sql:
            print add_tweet_command
        self.cursor.execute(add_tweet_command)
        
        try:
            hashtag_insert_command = """
                    INSERT INTO `hashtag` (id, str)
                    VALUES({id}, '{h_str}');""".format(id=hashtag_pk, h_str=self.escape_word(hashtag_str))
            if self.show_sql:
                print hashtag_insert_command
            self.cursor.execute(hashtag_insert_command)
            
        except MySQLdb.IntegrityError:
            if self.show_sql:
                print "Hashtag {ht} already in DB...skipping...".format(ht=self.escape_word(hashtag_str))
        hashtag_to_tweet_insert_command = """
              INSERT INTO `hashtag_to_tweet` (hashtag_id, tweet_id) VALUES({hid}, {tid})
            """.format(hid=hashtag_pk, tid=tweet_id)
        if self.show_sql:
            print hashtag_to_tweet_insert_command
        self.cursor.execute(hashtag_to_tweet_insert_command)
        
        return hashtag_pk

    def insert_tag_word(self, tag_word, hashtag_id):
        """
        Insert data about a word contained in a hashtag.
        :param tag_word: the word
        :type tag_word: str
        :param hashtag_id: primary key of the hashtag the word came from
        :type hashtag_id: int
        :return: None
        """
        tag_word_pk = self.get_or_set_pk('tag_word', tag_word)
        add_hashtag_command = """
            INSERT IGNORE INTO `hashtag` (id) VALUES({hashtag_id})
            """.format(hashtag_id=hashtag_id)
        if self.show_sql:
            print add_hashtag_command
        self.cursor.execute(add_hashtag_command)
        
        try:
            tag_word_insert_command = """
                    INSERT INTO `tag_word` (id, str)
                    VALUES({id}, '{tw_str}');""".format(id=tag_word_pk, tw_str=tag_word)
            if self.show_sql:
                print tag_word_insert_command
            self.cursor.execute(tag_word_insert_command)
            
        except MySQLdb.IntegrityError:
            if self.show_sql:
                print """Tag word {tw} already in DB...skipping...""".format(tw=tag_word)
        tag_word_to_hashtag_insert_command = """
                INSERT INTO `tag_word_to_hashtag` (tag_word_id, hashtag_id)
                VALUES({wid}, {hid});""".format(wid=tag_word_pk, hid=hashtag_id)
        if self.show_sql:
            print tag_word_to_hashtag_insert_command
        self.cursor.execute(tag_word_to_hashtag_insert_command)
        

    def insert_twitter_user(self, user_id, follow_ratio, followers, following, offset):
        """
        Insert data about a twitter user.
        :param user_id: the user's id
        :type user_id: str
        :param follow_ratio: the ratio of users the user follows / users the user is followed by
        :type follow_ratio: float
        :param followers: the number of users following this user
        :type followers: int
        :param following: the number of users this user follows
        :type following: int
        :param offset: the user's offset from UTC
        :type offset: int or NULL
        :param tweet_id: the id of the tweet the user tweeted
        :type tweet_id: str
        :return: None
        """
        try:
            offset = int(offset)
        except TypeError:
            offset = "NULL"
        try:
            user_insert_command = """
                    INSERT INTO `twitter_user` (user_id, follow_ratio, followers, following, offset)
                    VALUES({id}, {ratio}, {ers}, {ing}, {off})
                    ON DUPLICATE KEY UPDATE user_id={id}, follow_ratio={ratio}, followers={ers}, following={ing},
                    offset={off};
                """.format(id=user_id, ratio=follow_ratio if (isinstance(followers, int) and followers >= 0) else "NULL",
                           ers=followers if (isinstance(followers, int) and followers >= 0) else "NULL",
                           ing=following if (isinstance(following, int) and following >= 0) else "NULL", off=offset)
            if self.show_sql:
                print user_insert_command
            self.cursor.execute(user_insert_command)
            
        except MySQLdb.IntegrityError:
            if self.show_sql:
                print "User with id {uid} already exists in DB...skipping...".format(uid=user_id)

    def insert_url(self, url, domain, tweet_id=None, user_id=None, user_url=False):
        """
        Insert data about a URL. URLs are found in one of two places: the body of the tweet itself, or in the profile of
        the user who tweeted the tweet. This method dispatches the appropriate method in either situation.
        :param url: the url
        :type url: str
        :param domain: the url's domain
        :type domain: str
        :param tweet_id: the id of the tweet the url appears in
        :type tweet_id: str or None
        :param user_id: the id of the user in whose profile the url occurs
        :type user_id: str or None
        :param user_url: whether the url occurs in a user's profile
        :type user_url: bool
        :return: None
        """
        url_pk = self.get_or_set_pk('url', url)
        if "'" in url:
            url_tpl = '"{url_str}"'
        else:
            url_tpl = "'{url_str}'"
        if tweet_id is not None:
            add_tweet_command = """
                INSERT IGNORE INTO `tweet` (tweet_id) VALUES({tweet_id})""".format(tweet_id=tweet_id)
            if self.show_sql:
                print add_tweet_command
            self.cursor.execute(add_tweet_command)
            
        if user_id is not None and user_url:
            add_user_command = """
                INSERT IGNORE INTO `twitter_user` (user_id) VALUE({user_id})""".format(user_id=user_id)
            if self.show_sql:
                print add_user_command
            self.cursor.execute(add_user_command)
            
        try:
            url_insert_command = """
                    INSERT INTO `url` (id, str, domain)
                    VALUES({url_id},{url_tpl},'{domain}');""".format(url_id=url_pk,
                                                                       url_tpl=url_tpl.format(url_str=self.escape_word(url)),
                                                                       domain=self.escape_word(domain))
            if self.show_sql:
                print url_insert_command
            self.cursor.execute(url_insert_command)
            
        except MySQLdb.IntegrityError:
            if self.show_sql:
                print """URL {url} already exists in DB...skipping...""".format(url=self.escape_word(url))
        if user_url:
            self.insert_user_url(user_id, url_pk)
        else:
            self.insert_tweet_url(tweet_id, url_pk)

    def insert_user_url(self, user_id, url_pk):
        """
        Insert user id and url pk into user_to_url table. Don't call this directly.
        :param user_id: the id of the user in whose profile the url occurs
        :type user_id: str
        :param url_pk: the primary key of the url
        :type url_pk: int
        :return: None
        """
        if not user_id:
            raise ValueError('Must specify user_id')
        else:

            user_url_insert_command = """
                INSERT INTO `user_to_url` (user_id, url_id)
                VALUES({user_id},{url_id});""".format(user_id=user_id, url_id=url_pk)
            if self.show_sql:
                print user_url_insert_command
            self.cursor.execute(user_url_insert_command)
            

    def insert_tweet_url(self, tweet_id, url_pk):
        """
        Insert tweet id and url pk into url_to_tweet table. Don't call this directly.
        :param tweet_id:
        :param url_pk:
        :return: None
        """
        if not tweet_id:
            raise ValueError('Must specify tweet_id')
        else:
            tweet_url_insert_command = """
                INSERT INTO `url_to_tweet` (tweet_id, url_id)
                VALUES({tweet_id},{url_id});""".format(tweet_id=tweet_id, url_id=url_pk)
            if self.show_sql:
                print tweet_url_insert_command
            self.cursor.execute(tweet_url_insert_command)
            

    def insert_word(self, word_str, tweet_id):
        """
        Insert data about a word.
        :param word_str: the word
        :type word_str: str
        :param tweet_id: the id of the tweet the word was found in
        :type tweet_id: str
        :return: None
        """
        word_pk = self.get_or_set_pk('word', word_str)
        word_str = word_str.replace("'", "[Q]").replace('"', "[QQ]")
        if "\\" in word_str:
            word_str = word_str.replace("\\", "\\\\")
        add_tweet_command = """
            INSERT IGNORE INTO `tweet` (tweet_id) VALUES({tweet_id})""".format(tweet_id=tweet_id)
        if self.show_sql:
            print add_tweet_command
        self.cursor.execute(add_tweet_command)
        try:
            word_insert_command = """
                    INSERT INTO `word` (id, str)
                    VALUES({w_pk}, '{w_str}');""".format(w_pk=word_pk, w_str=word_str)
            if self.show_sql:
                print word_insert_command
            self.cursor.execute(word_insert_command)
        except MySQLdb.IntegrityError:
            if self.show_sql:
                print "Word {word} already exists in DB...skipping...".format(word=word_str)
        except MySQLdb.ProgrammingError:
            return None
        word_to_tweet_insert_command = """
                INSERT INTO `word_to_tweet` (word_id, tweet_id) VALUES({wid}, {tid})
                        """.format(wid=word_pk, tid=tweet_id)
        if self.show_sql:
            print word_to_tweet_insert_command
        self.cursor.execute(word_to_tweet_insert_command)
        

    def process_json(self, js):
        """
        Process a JSON representation of a tweet and insert all the relevant information into the database. Also inserts
        user_to_tweet data.
        :param js: JSON representation of a tweet
        :type js: list
        :return: None
        """
        tweet_id = js[0]
        user_id = js[1][0]['user']
        self.process_tweet(tweet_id, js[1][0])
        self.process_user(user_id, js[1][1])
        user_to_tweet_insert_command = """
                INSERT INTO `user_to_tweet` (user_id, tweet_id, is_mention)
                VALUES({uid}, {tid}, {is_m});""".format(uid=user_id, tid=tweet_id, is_m=0)
        if self.show_sql:
            print user_to_tweet_insert_command
        self.cursor.execute(user_to_tweet_insert_command)
        

    def process_tweet(self, tweet_id, tweet_data):
        """
        Process the tweet data.
        :param tweet_id: the tweet's id
        :type tweet_id: str
        :param tweet_data: the data extracted from the tweet
        :type tweet_data: dict
        :return: None
        """
        self.insert_tweet_data(tweet_id, tweet_data['hour'], tweet_data['day'], tweet_data['is_weekday'])
        hashtags = tweet_data.get('hashtags')
        split_hashtags = tweet_data.get('split_hashtags')
        for idx, hashtag in enumerate(hashtags):  # hashtags & split_hashtags
            hashtag_id = self.insert_hashtag(hashtag, tweet_id)
            split_hashtag = split_hashtags[idx]
            for tag_word in split_hashtag:
                self.insert_tag_word(tag_word, hashtag_id)
        urls = tweet_data['urls']
        if urls:  # not all tweet have urls
            domains = tweet_data['domains']
            for idx, url in enumerate(urls):  # urls & domains
                self.insert_url(url, domains[idx], tweet_id=tweet_id)
        for word in tweet_data['words']:  # words
            self.insert_word(word, tweet_id)

    def process_user(self, user_id, user_data):
        """
        Process the user data.
        :param user_id: the user's id
        :type user_id: str
        :param user_data: the data extracted from the user profile
        :return: None
        """
        followers = user_data['followers'] if user_data['followers'] != '-1' else "NULL"
        following = user_data['following'] if user_data['following'] != '-1' else "NULL"
        ratio = user_data['follow_ratio'] if followers != 'NULL' else 'NULL'
        self.insert_twitter_user(user_id, ratio, followers, following, user_data.get('offset', 'NULL'))
        user_urls = user_data['urls']
        if user_urls:  # not all user profiles have urls
            user_domains = user_data['domains']
            for idx, user_url in enumerate(user_urls):  # urls & domains
                self.insert_url(user_url, user_domains[idx], user_id=user_id, user_url=True)
Example #7
0
class Inserter(object):
    """
    Class to handle insertion of twitter data into the database.
    NB: tweet ids and user ids are 18- and 9-digit integers, respectively. These values are represented as strings in the
    data, and treated as such in the code, but are translated into bigints by MySQLdb.
    """
    def __init__(self, use_testdb=False, show_sql=True):
        """
        Initialize the Inserter.
        :param test: whether to use the test db/Redis settings
        :type test: bool
        """
        if use_testdb:
            db_settings = {
                'host': 'localhost',
                'db': 'tweets_test',
                'user': '******',
                'passwd': environ.get('TWEETS_TESTDB_PASSWORD')
            }
            redis_settings = {'host': 'localhost', 'db': '0'}
        else:
            db_settings = {
                'host': 'localhost',
                'db': 'twitter',
                'user': '******',
                'passwd': environ.get('TWEETS_DB_PASSWORD')
            }
            redis_settings = {'host': 'localhost', 'db': '2'}
        self.show_sql = show_sql
        self.DB = MySQLdb.connect(**db_settings)
        self.cursor = self.DB.cursor()
        self.cache = SafeRedis(**redis_settings)
        for table in ('cluster', 'hashtag', 'tag_word', 'url', 'word'):
            key = 'pk_{table}'.format(table=table)
            if self.cache.get(key) is None:
                self.cache.set(key, 0)
        self.escape_word = self.DB.escape

    def get_or_set_pk(self, table, val, get_only=False):
        """
        Try to retrieve the primary key of a datum from the Redis cache. If none is found, increment the cached pk
        counter for the table in question and use that.
        :param table: the db table the entry is going to be added to
        :type table: str (one of 'cluster', 'hashtag', 'hashtag_to_cluster', 'tag_word', 'tag_word_to_hashtag', 'url',
            'url_to_tweet', 'user_to_tweet', 'word', 'word_to_tweet')
        :param val: the value in question
        :type val: str, probably
        :param get_only: disable creation of new pks, raising an error if the value doesn't already have a pk in the cache
        :type get_only: bool
        :return: int
        """
        key = '{table}_{val}'.format(table=table, val=val[:355])
        cached_pk = self.cache.get(key)
        if cached_pk is not None:
            return int(cached_pk)  # redis-py returns strings by default
        else:
            if get_only:
                raise ValueError(
                    "No primary key found for value {val} for table {table}".
                    format(val=val, table=table))
            new_pk = self.cache.incr(
                'pk_{table}'.format(table=table)
            )  # .incr increments the value by 1 and returns the result as an int
            self.cache.set(key, new_pk)
            return new_pk

    def insert_tweet_data(self, tid, hour, day, is_weekday):
        """
        Insert all tweet-related data. NB: this only inserts data for the 'tweet' column, NOT all the data about a
        tweet.
        :param id: the id of the tweet
        :type id: str
        :param hour: the hour of the day the tweet was tweeted
        :type hour: int (0-23)
        :param day: the day of the week the tweet was tweeted
        :type day: int (0-7)
        :param is_weekday: whether the tweet was tweeted on a weekday
        :type is_weekday: int (0 or 1)
        :return: None
        """
        if hour == "-1":
            hour = "NULL"
        if day == "-1":
            day = "NULL"
        if is_weekday == "-1":
            is_weekday = "NULL"
        try:
            tweet_insert_command = """
                    INSERT INTO `tweet` (tweet_id, hour, day, is_weekday)
                    VALUES({id},{hour},{day},{is_w})
                    ON DUPLICATE KEY UPDATE tweet_id={id}, hour={hour}, day={day}, is_weekday={is_w}
                """.format(id=tid, hour=hour, day=day, is_w=is_weekday)
            if self.show_sql:
                print tweet_insert_command
            self.cursor.execute(tweet_insert_command)
        except MySQLdb.IntegrityError:
            print "Tweet with id {tid} already exists".format(tid=tid)

    def insert_mention(self, tweet_id, user_id):
        """
        Insert mentions. Per this schema, a mention is the same as a user tweeting a tweet, except 'is_mention' is set
        to 1 (i.e., True).
        :param tweet_id: the id of the tweet
        :type tweet_id: str
        :param user_id: the id of the user mentioned in the tweet
        :type user_id: str
        :return: None
        """
        add_user_command = """
            INSERT IGNORE INTO `twitter_user` (user_id) VALUES({user_id})
            """.format(user_id=user_id)
        if self.show_sql:
            print add_user_command
        self.cursor.execute(add_user_command)

        add_tweet_command = """
            INSERT IGNORE INTO `tweet` (tweet_id) VALUES({tweet_id})
            """.format(tweet_id=tweet_id)
        if self.show_sql:
            print add_tweet_command
        self.cursor.execute(add_tweet_command)

        mention_insert_command = """
                INSERT INTO `user_to_tweet` (tweet_id, user_id, is_mention)
                VALUES({tid},{uid},{is_m});""".format(tid=tweet_id,
                                                      uid=user_id,
                                                      is_m=1)
        if self.show_sql:
            print mention_insert_command
        self.cursor.execute(mention_insert_command)

    def insert_hashtag(self, hashtag_str, tweet_id):
        """
        Insert hashtag-related data.
        :param hashtag_str: the text of the hashtag
        :type hashtag_str: str
        :param tweet_id: the id of the tweet the hashtag came from
        :type tweet_id: str
        :return: int (primary key of hashtag)
        """
        hashtag_pk = self.get_or_set_pk('hashtag', hashtag_str)
        add_tweet_command = """
            INSERT IGNORE INTO `tweet` (tweet_id) VALUES({tweet_id})
            """.format(tweet_id=tweet_id)
        if self.show_sql:
            print add_tweet_command
        self.cursor.execute(add_tweet_command)

        try:
            hashtag_insert_command = """
                    INSERT INTO `hashtag` (id, str)
                    VALUES({id}, '{h_str}');""".format(
                id=hashtag_pk, h_str=self.escape_word(hashtag_str))
            if self.show_sql:
                print hashtag_insert_command
            self.cursor.execute(hashtag_insert_command)

        except MySQLdb.IntegrityError:
            if self.show_sql:
                print "Hashtag {ht} already in DB...skipping...".format(
                    ht=self.escape_word(hashtag_str))
        hashtag_to_tweet_insert_command = """
              INSERT INTO `hashtag_to_tweet` (hashtag_id, tweet_id) VALUES({hid}, {tid})
            """.format(hid=hashtag_pk, tid=tweet_id)
        if self.show_sql:
            print hashtag_to_tweet_insert_command
        self.cursor.execute(hashtag_to_tweet_insert_command)

        return hashtag_pk

    def insert_tag_word(self, tag_word, hashtag_id):
        """
        Insert data about a word contained in a hashtag.
        :param tag_word: the word
        :type tag_word: str
        :param hashtag_id: primary key of the hashtag the word came from
        :type hashtag_id: int
        :return: None
        """
        tag_word_pk = self.get_or_set_pk('tag_word', tag_word)
        add_hashtag_command = """
            INSERT IGNORE INTO `hashtag` (id) VALUES({hashtag_id})
            """.format(hashtag_id=hashtag_id)
        if self.show_sql:
            print add_hashtag_command
        self.cursor.execute(add_hashtag_command)

        try:
            tag_word_insert_command = """
                    INSERT INTO `tag_word` (id, str)
                    VALUES({id}, '{tw_str}');""".format(id=tag_word_pk,
                                                        tw_str=tag_word)
            if self.show_sql:
                print tag_word_insert_command
            self.cursor.execute(tag_word_insert_command)

        except MySQLdb.IntegrityError:
            if self.show_sql:
                print """Tag word {tw} already in DB...skipping...""".format(
                    tw=tag_word)
        tag_word_to_hashtag_insert_command = """
                INSERT INTO `tag_word_to_hashtag` (tag_word_id, hashtag_id)
                VALUES({wid}, {hid});""".format(wid=tag_word_pk,
                                                hid=hashtag_id)
        if self.show_sql:
            print tag_word_to_hashtag_insert_command
        self.cursor.execute(tag_word_to_hashtag_insert_command)

    def insert_twitter_user(self, user_id, follow_ratio, followers, following,
                            offset):
        """
        Insert data about a twitter user.
        :param user_id: the user's id
        :type user_id: str
        :param follow_ratio: the ratio of users the user follows / users the user is followed by
        :type follow_ratio: float
        :param followers: the number of users following this user
        :type followers: int
        :param following: the number of users this user follows
        :type following: int
        :param offset: the user's offset from UTC
        :type offset: int or NULL
        :param tweet_id: the id of the tweet the user tweeted
        :type tweet_id: str
        :return: None
        """
        try:
            offset = int(offset)
        except TypeError:
            offset = "NULL"
        try:
            user_insert_command = """
                    INSERT INTO `twitter_user` (user_id, follow_ratio, followers, following, offset)
                    VALUES({id}, {ratio}, {ers}, {ing}, {off})
                    ON DUPLICATE KEY UPDATE user_id={id}, follow_ratio={ratio}, followers={ers}, following={ing},
                    offset={off};
                """.format(
                id=user_id,
                ratio=follow_ratio if
                (isinstance(followers, int) and followers >= 0) else "NULL",
                ers=followers if
                (isinstance(followers, int) and followers >= 0) else "NULL",
                ing=following if
                (isinstance(following, int) and following >= 0) else "NULL",
                off=offset)
            if self.show_sql:
                print user_insert_command
            self.cursor.execute(user_insert_command)

        except MySQLdb.IntegrityError:
            if self.show_sql:
                print "User with id {uid} already exists in DB...skipping...".format(
                    uid=user_id)

    def insert_url(self,
                   url,
                   domain,
                   tweet_id=None,
                   user_id=None,
                   user_url=False):
        """
        Insert data about a URL. URLs are found in one of two places: the body of the tweet itself, or in the profile of
        the user who tweeted the tweet. This method dispatches the appropriate method in either situation.
        :param url: the url
        :type url: str
        :param domain: the url's domain
        :type domain: str
        :param tweet_id: the id of the tweet the url appears in
        :type tweet_id: str or None
        :param user_id: the id of the user in whose profile the url occurs
        :type user_id: str or None
        :param user_url: whether the url occurs in a user's profile
        :type user_url: bool
        :return: None
        """
        url_pk = self.get_or_set_pk('url', url)
        if "'" in url:
            url_tpl = '"{url_str}"'
        else:
            url_tpl = "'{url_str}'"
        if tweet_id is not None:
            add_tweet_command = """
                INSERT IGNORE INTO `tweet` (tweet_id) VALUES({tweet_id})""".format(
                tweet_id=tweet_id)
            if self.show_sql:
                print add_tweet_command
            self.cursor.execute(add_tweet_command)

        if user_id is not None and user_url:
            add_user_command = """
                INSERT IGNORE INTO `twitter_user` (user_id) VALUE({user_id})""".format(
                user_id=user_id)
            if self.show_sql:
                print add_user_command
            self.cursor.execute(add_user_command)

        try:
            url_insert_command = """
                    INSERT INTO `url` (id, str, domain)
                    VALUES({url_id},{url_tpl},'{domain}');""".format(
                url_id=url_pk,
                url_tpl=url_tpl.format(url_str=self.escape_word(url)),
                domain=self.escape_word(domain))
            if self.show_sql:
                print url_insert_command
            self.cursor.execute(url_insert_command)

        except MySQLdb.IntegrityError:
            if self.show_sql:
                print """URL {url} already exists in DB...skipping...""".format(
                    url=self.escape_word(url))
        if user_url:
            self.insert_user_url(user_id, url_pk)
        else:
            self.insert_tweet_url(tweet_id, url_pk)

    def insert_user_url(self, user_id, url_pk):
        """
        Insert user id and url pk into user_to_url table. Don't call this directly.
        :param user_id: the id of the user in whose profile the url occurs
        :type user_id: str
        :param url_pk: the primary key of the url
        :type url_pk: int
        :return: None
        """
        if not user_id:
            raise ValueError('Must specify user_id')
        else:

            user_url_insert_command = """
                INSERT INTO `user_to_url` (user_id, url_id)
                VALUES({user_id},{url_id});""".format(user_id=user_id,
                                                      url_id=url_pk)
            if self.show_sql:
                print user_url_insert_command
            self.cursor.execute(user_url_insert_command)

    def insert_tweet_url(self, tweet_id, url_pk):
        """
        Insert tweet id and url pk into url_to_tweet table. Don't call this directly.
        :param tweet_id:
        :param url_pk:
        :return: None
        """
        if not tweet_id:
            raise ValueError('Must specify tweet_id')
        else:
            tweet_url_insert_command = """
                INSERT INTO `url_to_tweet` (tweet_id, url_id)
                VALUES({tweet_id},{url_id});""".format(tweet_id=tweet_id,
                                                       url_id=url_pk)
            if self.show_sql:
                print tweet_url_insert_command
            self.cursor.execute(tweet_url_insert_command)

    def insert_word(self, word_str, tweet_id):
        """
        Insert data about a word.
        :param word_str: the word
        :type word_str: str
        :param tweet_id: the id of the tweet the word was found in
        :type tweet_id: str
        :return: None
        """
        word_pk = self.get_or_set_pk('word', word_str)
        word_str = word_str.replace("'", "[Q]").replace('"', "[QQ]")
        if "\\" in word_str:
            word_str = word_str.replace("\\", "\\\\")
        add_tweet_command = """
            INSERT IGNORE INTO `tweet` (tweet_id) VALUES({tweet_id})""".format(
            tweet_id=tweet_id)
        if self.show_sql:
            print add_tweet_command
        self.cursor.execute(add_tweet_command)
        try:
            word_insert_command = """
                    INSERT INTO `word` (id, str)
                    VALUES({w_pk}, '{w_str}');""".format(w_pk=word_pk,
                                                         w_str=word_str)
            if self.show_sql:
                print word_insert_command
            self.cursor.execute(word_insert_command)
        except MySQLdb.IntegrityError:
            if self.show_sql:
                print "Word {word} already exists in DB...skipping...".format(
                    word=word_str)
        except MySQLdb.ProgrammingError:
            return None
        word_to_tweet_insert_command = """
                INSERT INTO `word_to_tweet` (word_id, tweet_id) VALUES({wid}, {tid})
                        """.format(wid=word_pk, tid=tweet_id)
        if self.show_sql:
            print word_to_tweet_insert_command
        self.cursor.execute(word_to_tweet_insert_command)

    def process_json(self, js):
        """
        Process a JSON representation of a tweet and insert all the relevant information into the database. Also inserts
        user_to_tweet data.
        :param js: JSON representation of a tweet
        :type js: list
        :return: None
        """
        tweet_id = js[0]
        user_id = js[1][0]['user']
        self.process_tweet(tweet_id, js[1][0])
        self.process_user(user_id, js[1][1])
        user_to_tweet_insert_command = """
                INSERT INTO `user_to_tweet` (user_id, tweet_id, is_mention)
                VALUES({uid}, {tid}, {is_m});""".format(uid=user_id,
                                                        tid=tweet_id,
                                                        is_m=0)
        if self.show_sql:
            print user_to_tweet_insert_command
        self.cursor.execute(user_to_tweet_insert_command)

    def process_tweet(self, tweet_id, tweet_data):
        """
        Process the tweet data.
        :param tweet_id: the tweet's id
        :type tweet_id: str
        :param tweet_data: the data extracted from the tweet
        :type tweet_data: dict
        :return: None
        """
        self.insert_tweet_data(tweet_id, tweet_data['hour'], tweet_data['day'],
                               tweet_data['is_weekday'])
        hashtags = tweet_data.get('hashtags')
        split_hashtags = tweet_data.get('split_hashtags')
        for idx, hashtag in enumerate(hashtags):  # hashtags & split_hashtags
            hashtag_id = self.insert_hashtag(hashtag, tweet_id)
            split_hashtag = split_hashtags[idx]
            for tag_word in split_hashtag:
                self.insert_tag_word(tag_word, hashtag_id)
        urls = tweet_data['urls']
        if urls:  # not all tweet have urls
            domains = tweet_data['domains']
            for idx, url in enumerate(urls):  # urls & domains
                self.insert_url(url, domains[idx], tweet_id=tweet_id)
        for word in tweet_data['words']:  # words
            self.insert_word(word, tweet_id)

    def process_user(self, user_id, user_data):
        """
        Process the user data.
        :param user_id: the user's id
        :type user_id: str
        :param user_data: the data extracted from the user profile
        :return: None
        """
        followers = user_data[
            'followers'] if user_data['followers'] != '-1' else "NULL"
        following = user_data[
            'following'] if user_data['following'] != '-1' else "NULL"
        ratio = user_data['follow_ratio'] if followers != 'NULL' else 'NULL'
        self.insert_twitter_user(user_id, ratio, followers, following,
                                 user_data.get('offset', 'NULL'))
        user_urls = user_data['urls']
        if user_urls:  # not all user profiles have urls
            user_domains = user_data['domains']
            for idx, user_url in enumerate(user_urls):  # urls & domains
                self.insert_url(user_url,
                                user_domains[idx],
                                user_id=user_id,
                                user_url=True)