Example #1
    def wrapper(*args, **kw):
        start_time = time.time()
        value = func(*args, **kw)
        debuglog.msg("\nPERFTEST===============\nFunction:\t%s\nTime:\t%s\n" %
                     (func.__module__ + "::" + func.__name__,
                      str(time.time() - start_time)),
                     mode="debug")

        return value
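
The snippet above is only the inner wrapper; the enclosing decorator and its imports are outside the excerpt. A minimal self-contained sketch of how such a timing decorator is typically assembled (substituting the standard logging module for the project's debuglog, which is an assumption here):

    import functools
    import logging
    import time

    def perftest(func):
        # Log how long each call to func takes.
        @functools.wraps(func)
        def wrapper(*args, **kw):
            start_time = time.time()
            value = func(*args, **kw)
            logging.debug("PERFTEST %s::%s took %.6fs",
                          func.__module__, func.__name__,
                          time.time() - start_time)
            return value
        return wrapper
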
Example #2
 def fetchUserTweets(self, user):
     """
     Fetch tweets for user and add to the database.
     """
     data = self.getUserTweetsData(user)
     for tweet in data['results']:
         if self.tweet_adder.add(tweet):
             debuglog.msg("successfully added")
         else:
             debuglog.msg("failed to add :(")
     return True
Example #3
    def fixTokensInterrupted(self):
        f = open('missing_tweets2.txt')
        missing_tweets = [line.replace('\n', '') for line in f.readlines()]

        q = "SELECT text, from_user, id FROM tweets WHERE id IN("
        vals = {}

        q += ','.join(missing_tweets) + ')'

        results = self.sql.q(q)
        #pprint.pprint(results)
        #return

        failures = []
        f = open('token_fix_failures.txt', 'w')
        for result in results:
            debuglog.msg("Adding tokens for tweet", result[2])
            try:
                self.addTokens({'text': result[0], 'from_user': result[1]})
                self.addTokenMapping({
                    'text': result[0],
                    'from_user': result[1],
                    'id': result[2]
                })
            except:
                failures.append(result[2])
                debuglog.msg("\tAdding tokens failed!")
                debuglog.msg("\tFailures so far:", len(failures))
                f.write(str(result[2]) + "\n")

        f.close()

        debuglog.msg(failures)
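
fixTokensInterrupted splices the ids read from missing_tweets2.txt straight into the SQL string. With a DB-API driver the same IN(...) clause can be built from placeholders so the driver handles the quoting; a sketch, assuming self.sql.q passes a tuple of values through to cursor.execute:

    missing_tweets = ['101', '102', '103']  # ids as read from the file
    placeholders = ','.join(['%s'] * len(missing_tweets))
    q = "SELECT text, from_user, id FROM tweets WHERE id IN(" + placeholders + ")"
    # results = self.sql.q(q, tuple(missing_tweets))
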
Example #4
    def checkRateLimit(self):
        debuglog.pprint_msg(self.rate_data)
        debuglog.msg("REMAINING HITS:", self.rate_data['remaining_hits'])
        if time.time() > self.rate_data['reset_time_in_seconds']:
            self.rate_data = self.fetchRateData()

        if "error" in self.rate_data:
            return 60 * 60

        if self.rate_data['remaining_hits'] <= 1:
            debuglog.msg("rate limit: wait",
                         self.rate_data['reset_time_in_seconds'] - time.time())
            return self.rate_data['reset_time_in_seconds'] - time.time()
        else:
            return 0
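
checkRateLimit returns the number of seconds the caller should wait before the next API hit: 0 while hits remain, an hour when the rate data itself could not be fetched. The usual calling pattern (as in fetchTopUserTimelines below) is simply:

    wait_seconds = self.checkRateLimit()
    if wait_seconds > 0:
        time.sleep(wait_seconds)  # back off until the rate window resets
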
Example #5
    def GetTermIDFs(self, terms):
        if not terms:
            return {"idfs": []}

        url = "http://50.56.221.228/cgi-bin/idf.php?"
        # TODO: HTML entity encoding (?)
        # TODO: Enhanced encoding detection - first term's encoding may not be always appropriate.
        data = ("terms=" + ",".join(terms).replace("#", "%23")).encode("utf-8")
        debuglog.msg(data)

        txt_unicode = UnicodeDammit(urllib.request.urlopen(url, data).read())
        txt = txt_unicode.unicode_markup
        txt = txt.replace(",null:", ',"null":')  # workaround
        data = json.loads(txt, encoding=txt_unicode.original_encoding)
        return data
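
GetTermIDFs hand-escapes only the "#" character as %23, so other reserved characters in the terms would pass through unescaped. urllib.parse.urlencode escapes everything; a sketch, under the assumption that the idf.php endpoint accepts standard form encoding:

    import urllib.parse

    terms = ["#python", "tf-idf", "café"]
    data = urllib.parse.urlencode({"terms": ",".join(terms)}).encode("utf-8")
    # urllib.request.urlopen(url, data) then proceeds as above
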
Example #6
 def q(self, query, values=None, auto_str=True):
     try:
         if not values:
             self.cur.execute(query)
         else:
             self.cur.execute(query, values)
         self.conn.commit()
         return self.cur.fetchall()
     except:
         debuglog.msg("Query failed!")
         debuglog.msg("Query:", query)
         if values is not None:
             debuglog.msg("Vals:")
             debuglog.pprint_msg(values)
         #traceback.print_exc(file=sys.stdout)
         debuglog.msg(traceback.format_exc())
         return None
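
q runs either a plain or a parameterized statement, commits, and returns cursor.fetchall(); after a logged failure it returns None. A typical call against the SQLQuery wrapper used elsewhere in these examples:

    sql = SQLQuery()
    rows = sql.q("SELECT id FROM tweets WHERE from_user=%(user)s",
                 {"user": "some_user"})
    if rows is None:
        print("query failed; details are in the debug log")
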
Example #7
    def getUserTweetsData(self, user):
        debuglog.msg("=====\n\nGetting data for @%s from Search API..." % user)
        try:
            twitter_query = "from:%s" % user
            twitter_query = urllib.parse.quote(twitter_query)

            query_url = "http://search.twitter.com/search.json?lang=en&rpp=100&q=%s" % twitter_query

            response_unicode = UnicodeDammit(
                urllib.request.urlopen(query_url).read())
            data = json.loads(response_unicode.unicode_markup,
                              encoding=response_unicode.original_encoding if
                              response_unicode.original_encoding else "utf-8")
            debuglog.msg("\tGot %s tweets for @%s from Search API." %
                         (str(len(data['results'])), user))
            return data
        except:
            debuglog.msg("\tFailed to get data from Search API :(")
            debuglog.msg("\t\tURL:\t%s" % query_url)
            return {'results': []}
Example #8
    def fetchTopUserTweets(self, start_at=None):
        debuglog.msg("Fetching all celebrity tweets...")

        q = "SELECT DISTINCT user FROM celebs"
        results = SQLQuery().q(q)
        users = [result[0] for result in results]

        if start_at:
            users = users[users.index(start_at):]

        for user in users:
            if self.fetchUserTweets(user):
                debuglog.msg("\tSuccessfully fetched tweets for @%s :)" % user)
            else:
                debuglog.msg("\tFailed to fetch tweets for @%s :(" % user)
            time.sleep(1)
Example #9
    def fixTokens(self):
        q = "SELECT text, from_user, id FROM tweets"

        results = self.sql.q(q)
        failures = []
        f = open('token_fix_failures.txt', 'w')
        for result in results:
            debuglog.msg("Adding tokens for tweet", result[2])
            try:
                self.addTokens({'text': result[0], 'from_user': result[1]})
                self.addTokenMapping({
                    'text': result[0],
                    'from_user': result[1],
                    'id': result[2]
                })
            except:
                failures.append(result[2])
                debuglog.msg("\tAdding tokens failed!")
                debuglog.msg("\tFailures so far:", len(failures))
                f.write(str(result[2]) + "\n")

        f.close()

        debuglog.msg(failures)
Example #10
    def fetchUserTimeline(self,
                          user,
                          format="default",
                          use_cache=True,
                          write_cache=True,
                          use_filesystem_cache=False):
        # TODO: Clean this function up; the format parameter does magic it shouldn't be doing.
        # Currently format="default" means we're adding celebrity timeline tweets; we never actually call it that way.
        # If we do call with format="default", the timeline tweets get added to the celebrity tweets table.
        # DataGrabber calls this with format="searchapi" to get user timelines. In that case we check for
        # matching non-celebrity tweets and return them if we have any (in future: possibly add new tweets
        # from the search API as well). If not, we fetch tweets from the timeline API, store them in the
        # tweets_non_celeb table, and return an object with those tweets.
        # Also, if a user is cached and this is called with format="default", we just return the cached data
        # and don't insert anything.

        debuglog.msg("Fetching timeline for @%s..." % user)
        got_cache_data = False
        json_txt = "{}"
        json_encoding = "utf-8"

        if use_cache and not use_filesystem_cache:
            q = "SELECT * FROM tweets_non_celeb WHERE from_user=%(user)s;"
            vals = {'user': user}
            cached_tweets = self.sql.q(q, vals)
            if len(cached_tweets) > 0:
                return [tweet[0] for tweet in cached_tweets]
        elif use_cache and use_filesystem_cache:
            debuglog.msg("\tchecking cache...")
            cached_list = os.listdir('./timelines')
            userjsonfilename = user.lower() + '.json'
            if userjsonfilename in cached_list:
                #modtime = os.stat('./timelines/'+userjsonfilename)[ST_MTIME]
                ##cache stays fresh for a day
                #if ((float(time.time()) - modtime)/60)/60 <= 24:
                debuglog.msg("\t\tgot cache data.")
                json_txt = open('./timelines/' + userjsonfilename, 'r').read()
                got_cache_data = True

        if not got_cache_data:
            debuglog.msg("\tNo cache data, calling timeline api...")
            if self.checkRateLimit() > 0:
                debuglog.msg("\t\tHave to wait.")
                return {'status': 'wait'}
            url = "https://api.twitter.com/1/statuses/user_timeline.json?&screen_name=%s&count=150" % user
            debuglog.msg(url)
            try:
                response = urllib.request.urlopen(url)
                debuglog.msg(response.info())
            except urllib.error.HTTPError as e:
                if "404" in str(e):
                    return {'status': '404'}
                elif "502" in str(e):
                    return {'status': 'retry'}
                else:
                    return {'status': 'error'}

            json_unicode = UnicodeDammit(response.read())
            json_txt = json_unicode.unicode_markup
            if json_unicode.original_encoding:
                json_encoding = json_unicode.original_encoding

            if write_cache and use_filesystem_cache:
                fname = './timelines/' + user.lower() + '.json'
                with open(fname, 'wt') as f:
                    os.chmod(fname, 0o777)
                    f.write(json_txt)

        data = json.loads(json_txt, encoding=json_encoding)
        debuglog.msg("\tdata is...", str(data)[:100])

        if format == "searchapi":
            # For now, format="searchapi" indicates we are getting non-celebrity tweets.
            debuglog.msg("\tGot %d results for %s from user timeline API." %
                         (len(data), user))

            if write_cache and not use_filesystem_cache:
                for non_celeb_timeline_tweet in data:
                    self.tweet_adder.addNonCelebTimelineTweet(
                        non_celeb_timeline_tweet)

            return {'results': data}

        # For now, format="default" (only way to reach here) means we are adding celebrity tweets.
        for timeline_tweet in data:
            self.tweet_adder.addTimelineTweet(timeline_tweet)

        return {'status': 'success'}
Example #11
    def fetchTopUserTimelines(self):
        top_users = open('update_users.txt', 'r').readlines()
        top_users = [user.replace('\n', '') for user in top_users]

        for user in top_users:
            debuglog.msg("Getting timeline for", user)
            status = 'retry'
            while status == 'retry' or status == 'wait':
                debuglog.msg(status)
                debuglog.msg("\tFetching timeline for @%s in %s seconds..." %
                             (user, str(self.checkRateLimit())))
                status = self.fetchUserTimeline(user)['status']
                time.sleep(1)
                time.sleep(self.checkRateLimit())

            if status == 'success':
                debuglog.msg("\tGot timeline for %s :)" % user)
            elif status == '404':
                debuglog.msg("\tUser not found.")
            else:
                debuglog.msg(
                    "\tUnknown error prevented getting user timeline.")
Example #12
def getCelebMatches(userdata, celebstats):
    #FIRST CALCULATE BASE STATS FOR USER
    #declare all counts
    num_days = 0.0

    num_per_time = [0.0] * 6

    num_per_weekday = [0.0] * 7
    num_at = 0.0
    num_rt = 0.0
    num_hash = 0.0
    num_links = 0.0

    mentions = []
    hashes = []

    user_name = ""
    #tweets per day logic depends on results coming back in chronological order
    #MAKE SURE THIS IS ALWAYS THE CASE IN SEARCH API, REST API, CACHE
    #IF NOT, NEED TO SORT AS PRE PROCESSING STEP
    #JON NEEDS TO PUT IN A SIMILAR CHECK, NON EXISTENT USERS & USERS W 0 TWEETS CRASH CODE CURRENTLY
    if len(userdata['results']) == 0 or type(userdata['results']) is dict:
        return {}

    else:
        if (len(userdata['results']) > 0):
            created_at = tweetadder.replaceMonth(
                userdata['results'][0]['created_at'])
            cur_datetime = datetime.datetime(int(created_at[25:]),
                                             int(created_at[4:6]),
                                             int(created_at[7:9]),
                                             int(created_at[10:12]),
                                             int(created_at[13:15]),
                                             int(created_at[16:18]))
            num_days += 1
            user_name = userdata['results'][0]["user"]["screen_name"]

        for tweet in userdata['results']:
            created_at = tweetadder.replaceMonth(tweet['created_at'])
            created = datetime.datetime(int(created_at[25:]),
                                        int(created_at[4:6]),
                                        int(created_at[7:9]),
                                        int(created_at[10:12]),
                                        int(created_at[13:15]),
                                        int(created_at[16:18]))

            text = tweet['text']

            #update day count
            if created.day != cur_datetime.day or created.month != cur_datetime.month or created.year != cur_datetime.year:
                cur_datetime = created
                num_days += 1

            #update num_per_time count
            num_per_time[math.floor(created.hour / 4)] += 1

            #update num_per_weekday count
            num_per_weekday[created.weekday()] += 1

            #Get RT @ and # counts
            link = False
            mention = False
            rt = False
            has = False
            for word in text.split(" "):
                if "http://" in word and not link:
                    num_links += 1
                    link = True

                if len(word) > 0 and word[0] == "@" and word[1:] != user_name:
                    mentions.append(word)
                    if not mention:
                        num_at += 1
                        mention = True

                if "RT" == word and not rt:
                    num_rt += 1
                    rt = True

                if len(word) > 0 and word[0] == "#":
                    hashes.append(word)
                    if not has:
                        num_hash += 1
                        has = True

        mention_count = collections.Counter(mentions)
        unique_mentions = -1.0
        if len(mentions) != 0:
            unique_mentions = float(len(mention_count)) / len(mentions)

        hash_count = collections.Counter(hashes)
        unique_hashes = -1.0
        if len(hashes) != 0:
            unique_hashes = float(len(hash_count)) / len(hashes)

        total_tweets = len(userdata['results'])
        userstats = {}
        if total_tweets != 0:
            userstats = {
                "tr_day": float(total_tweets) / num_days,
                "tr_monday": num_per_weekday[0] / total_tweets,
                "tr_tuesday": num_per_weekday[1] / total_tweets,
                "tr_wednesday": num_per_weekday[2] / total_tweets,
                "tr_thursday": num_per_weekday[3] / total_tweets,
                "tr_friday": num_per_weekday[4] / total_tweets,
                "tr_saturday": num_per_weekday[5] / total_tweets,
                "tr_sunday": num_per_weekday[6] / total_tweets,
                "tr_latenight": num_per_time[0] / total_tweets,
                "tr_earlymorning": num_per_time[1] / total_tweets,
                "tr_morning": num_per_time[2] / total_tweets,
                "tr_afternoon": num_per_time[3] / total_tweets,
                "tr_evening": num_per_time[4] / total_tweets,
                "tr_night": num_per_time[5] / total_tweets,
                "mention_rate": float(num_at) / total_tweets,
                "retweet_rate": float(num_rt) / total_tweets,
                "hash_rate": float(num_hash) / total_tweets,
                "link_rate": float(num_links) / total_tweets,
                "unique_hash": unique_hashes,
                "unique_mention": unique_mentions,
                "user": user_name
            }

    #calculate percentile stats for user
    N = 0
    B_m = 0
    E_m = 0
    B_h = 0
    E_h = 0
    B_l = 0
    E_l = 0
    B_um = 0
    E_um = 0
    B_uh = 0
    E_uh = 0

    for celeb in celebstats:
        #counts for percentiles
        N += 1
        if celeb[14] < userstats["mention_rate"]:
            B_m += 1
        elif celeb[14] == userstats["mention_rate"]:
            E_m += 1

        if celeb[16] < userstats["hash_rate"]:
            B_h += 1
        elif celeb[16] == userstats["hash_rate"]:
            E_h += 1

        if celeb[17] < userstats["link_rate"]:
            B_l += 1
        elif celeb[17] == userstats["link_rate"]:
            E_l += 1

        if celeb[18] < userstats["unique_hash"]:
            B_uh += 1
        elif celeb[18] == userstats["unique_hash"]:
            E_uh += 1

        if celeb[19] < userstats["unique_mention"]:
            B_um += 1
        elif celeb[19] == userstats["unique_mention"]:
            E_um += 1

    #CALCULATE PERCENTILES
    P_m = ((B_m + 0.5 * E_m) / N) * 100
    P_h = ((B_h + 0.5 * E_h) / N) * 100
    P_l = ((B_l + 0.5 * E_l) / N) * 100
    P_um = ((B_um + 0.5 * E_um) / N) * 100
    P_uh = ((B_uh + 0.5 * E_uh) / N) * 100

    #use all info about user to get personality
    if P_l > float(50.0):
        dim_2 = "S"
    else:
        debuglog.msg(P_l)
        dim_2 = "C"

    if (P_m + P_um) / float(2.0) > float(50.0):
        dim_3 = "W"
    else:
        dim_3 = "T"

    if (P_h + P_uh) / float(2.0) > float(50.0):
        dim_4 = "J"
    else:
        dim_4 = "M"

    #get average tweet rates
    m_fri = sum([
        userstats["tr_monday"], userstats["tr_tuesday"],
        userstats["tr_wednesday"], userstats["tr_thursday"],
        userstats["tr_friday"]
    ]) / 5.0
    sa_su = sum([userstats["tr_saturday"], userstats["tr_sunday"]]) / 2.0
    day = sum([userstats["tr_morning"], userstats["tr_afternoon"]]) / 2.0
    night = sum([
        userstats["tr_earlymorning"], userstats["tr_latenight"],
        userstats["tr_evening"], userstats["tr_night"]
    ]) / 4.0

    #avg tweet rate on time off
    tr_weekend = (sa_su + night) / 2.0
    tr_weekday = ((m_fri + day) / 2.0) * 1.3

    if tr_weekday > tr_weekend:
        dim_1 = "A"
    else:
        dim_1 = "E"

    #GO THROUGH LIST OF CELEBS AGAIN AND FIND THOSE WITH SAME PERSONALITY
    matches = []

    for celeb in celebstats:
        if celeb[26] == dim_1 and celeb[27] == dim_2 and celeb[
                28] == dim_3 and celeb[29] == dim_4:
            matches.append(celeb[20])

    toreturn = [dim_1 + dim_2 + dim_3 + dim_4]
    random.shuffle(matches)
    if len(matches) > 24:
        toreturn.append(matches[0:24])
    else:
        toreturn.append(matches[0:len(matches)])

    return toreturn
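
Each of the five percentile blocks applies the same mid-rank percentile formula, P = ((B + 0.5 * E) / N) * 100, where B counts celebrities strictly below the user's value and E counts ties. A small helper would collapse the repetition:

    def percentile(below, equal, total):
        # Mid-rank percentile: ties contribute half their weight.
        return ((below + 0.5 * equal) / total) * 100

    # e.g. P_m = percentile(B_m, E_m, N)
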
Example #13
    def GetCelebMatchesForUser(self, user):
        """
        Generate object with information about user and matches with celeb (including matching tweets) to pass to the
        UI.

        TODO: Break this into smaller functions, it's way too big.
        """

        results = {
            "user": {"screen_name": user, "name": "", "pic_url": "", "personality": ""},
            "celeb_matches": [],
            "celeb_matches_pers": [],
        }

        # GET USER TWEETS
        user_data = self.GetUserTweets(user)

        # return an error if user doesn't exist/has no tweets.
        if user_data is None or not len(user_data["results"]):
            results["status"] = "error"
            return results

        results["user"]["name"] = user_data["results"][0]["user"]["name"]
        results["user"]["pic_url"] = user_data["results"][0]["user"]["profile_image_url"]

        # Pass user_data and celeb_stats to get celeb matches
        celeb_stats = self.GetCelebTweetStats()
        celeb_matches = celebmatcher.getCelebMatches(user_data, celeb_stats)

        celebs = celeb_matches[1]
        results["user"]["personality"] = celeb_matches[0]

        # get pic urls for celeb pers matches
        if len(celebs) > 0:
            q = "SELECT from_user,profile_image_url FROM tweets WHERE from_user="******"token" + str(count)] = celeb
                q = q + "%(token" + str(count) + ")s OR from_user="******" GROUP BY from_user"

            q_results = self.sql.q(q, vals)
            celeb_match_pers_array = []
            for res in q_results:
                celeb_match_pers_array.append([res[0], res[1]])
            results["celeb_matches_pers"] = celeb_match_pers_array

        # GET USER TFIDF
        user_tfidf = self.GetUserTFIDFs(user_data)
        user_scores = user_tfidf["scores_dic"]

        debuglog.msg("top user terms are", user_tfidf["scores_list"][:15])

        # GET CELEBS TFIDF
        celeb_scores = self.GetCelebTFIDFsForTerms([term[0] for term in user_tfidf["scores_list"]][:15])

        # CALCULATE MATCH SCORES
        cumulative_celeb_scores = {}
        celeb_words = {}
        for entry in celeb_scores:
            celeb = entry[0]

            if celeb.lower() == user.lower():
                continue

            token = unidecode.unidecode(entry[1])
            score = float(entry[2])

            if celeb in cumulative_celeb_scores:
                celeb_words[celeb][token] = score
                cumulative_celeb_scores[celeb] += user_scores[token] * score
            else:
                celeb_words[celeb] = {token: score}
                cumulative_celeb_scores[celeb] = user_scores[token] * score

        matches = [(celeb, cumulative_celeb_scores[celeb], celeb_words[celeb]) for celeb in cumulative_celeb_scores]
        matches.sort(key=lambda x: -cumulative_celeb_scores[x[0]])

        # FIND MATCHING TWEETS FOR TOP 10 CELEBS
        for top_10_celeb_index in range(min(10, len(matches))):
            celeb_match = {
                "screen_name": matches[top_10_celeb_index][0],
                "name": "",
                "pic_url": "",
                "match_score": cumulative_celeb_scores[matches[top_10_celeb_index][0]],
                #'top_words' : matches[top_10_celeb_index][2],
                "top_words": {},
                "tweets": [],
            }

            # vals = {'celeb':matches[top_10_celeb_index][0], 'tokens': ' '.join(matches[top_10_celeb_index][2])}
            # q = "SELECT text, id, from_user_name, profile_image_url FROM tweets WHERE from_user=%(celeb)s AND MATCH(text) AGAINST(%(tokens)s)"

            q = "SELECT text, id, from_user_name, profile_image_url FROM tweets, (SELECT tweet_id FROM token_user_mapping WHERE user=%(celeb)s AND token IN ("
            vals = {"celeb": matches[top_10_celeb_index][0]}

            count = 0
            for token in list(matches[top_10_celeb_index][2].keys()):
                # TODO Clean up stopword filtering
                if token.lower() not in self.stopwords:
                    vals["token" + str(count)] = token
                    q += "%(token" + str(count) + ")s, "
                    count += 1

            # trim last comma and space.
            if count:
                q = q[: len(q) - 2]

            q += ")) as t WHERE tweets.id=t.tweet_id;"
            q_results = self.sql.q(q, vals)

            # skip if we don't have any matching celeb tweets.
            if not q_results or not len(q_results):
                continue

            celeb_match["name"] = q_results[0][2]
            celeb_match["pic_url"] = q_results[0][3]
            matching_celeb_tweets = [{"text": result[0], "id": result[1]} for result in q_results]

            matches[top_10_celeb_index] = list(matches[top_10_celeb_index])

            # ADD TWEETS THAT MATCH ON TOKENS
            sorted_tokens = [
                token
                for token in sorted(
                    matches[top_10_celeb_index][2].keys(), key=lambda x: -matches[top_10_celeb_index][2][x]
                )
            ]
            # TODO Clean up stopword filtering
            for token in list(filter(lambda x: x not in self.stopwords, sorted_tokens)):
                celeb_tweets_for_token = list(
                    filter(lambda x: x["text"].lower().count(token.lower()) > 0, matching_celeb_tweets)
                )
                user_tweets_for_token = [
                    user_tfidf["tweets"][user_tfidf["token_mapping"][token][k]]
                    for k in range(len(user_tfidf["token_mapping"][token]))
                ]

                if len(celeb_tweets_for_token) or len(user_tweets_for_token):
                    celeb_match["top_words"][token] = matches[top_10_celeb_index][2][token]

                for matching_tweets_for_token_index in range(
                    min(len(celeb_tweets_for_token), len(user_tweets_for_token))
                ):
                    celeb_match["tweets"].append(
                        {
                            "word": token,
                            "user_tweet": {
                                "url": "http://twitter.com/"
                                + user_tweets_for_token[matching_tweets_for_token_index]["user"]["screen_name"]
                                + "/status/"
                                + str(user_tweets_for_token[matching_tweets_for_token_index]["id"]),
                                "text": user_tweets_for_token[matching_tweets_for_token_index]["text"],
                            },
                            "celeb_tweet": {
                                "url": "http://twitter.com/"
                                + celeb_match["screen_name"]
                                + "/status/"
                                + str(celeb_tweets_for_token[matching_tweets_for_token_index]["id"]),
                                "text": celeb_tweets_for_token[matching_tweets_for_token_index]["text"],
                            },
                        }
                    )

            # matches[top_10_celeb_index].append({matches[top_10_celeb_index][0]:matching_celeb_tweets,user:matching_user_tweets})

            if len(celeb_match["tweets"]):
                results["celeb_matches"].append(celeb_match)

        results["status"] = "ok"
        results["permalink_id"] = self.StorePermalink(results)
        return results
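
For reference, the object handed back to the UI has this shape once the loop finishes (the values here are illustrative placeholders, not real output):

    results = {
        "status": "ok",            # "error" when the user has no tweets
        "permalink_id": 12345,     # hypothetical id from StorePermalink
        "user": {"screen_name": "jane", "name": "Jane", "pic_url": "...",
                 "personality": "ASWJ"},
        "celeb_matches": [],       # top-10 tf-idf matches with word-by-word tweet pairs
        "celeb_matches_pers": [],  # [screen_name, pic_url] per personality match
    }
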
Example #14
    def Generate(self):
        self.GenerateDocFreqsTable()

        celeb_count = self.GetCelebCount()

        tokens = self.GetTokens()

        q = """SELECT token_counts.user, token_counts.c, COUNT(token_user_mapping.user) as total_user_tokens
                  FROM
                    (SELECT t.user, t.token, t.c
                      FROM
                        (SELECT user, token, COUNT(*) as c
                          FROM token_user_mapping
                          WHERE token=%(token)s
                          GROUP BY user) as t
                        ORDER BY t.user) as token_counts, token_user_mapping
                  WHERE token_user_mapping.user = token_counts.user GROUP BY token_user_mapping.user"""

        # ITERATE THROUGH TOKENS
        for token in tokens:
            if len(token) < 3:
                debuglog.msg("token %s too short." % token)
                continue
            elif token[0] == '@':
                debuglog.msg("ignoring user token %s" % token)
                continue

            debuglog.msg("Generating tfidf table for token <%s>" % token)
            vals = {'token': token}
            results = self.sql.q(q, vals)

            if results is None:
                continue

            # CALCULATE SCORES
            celebs_with_term = len(results)
            celeb_scores = {}
            for result in results:
                celeb = result[0]
                term_count_for_celeb = result[1]
                total_tokens_for_celeb = result[2]
                celeb_tfidf_for_term = float((Decimal(term_count_for_celeb) / Decimal(total_tokens_for_celeb)) * \
                                       (Decimal(celeb_count) / Decimal(celebs_with_term)))
                celeb_scores[celeb] = (celeb_tfidf_for_term,
                                       term_count_for_celeb)

            # GENERATE QUERY
            insert_q = "INSERT INTO celeb_tfidf_all (user, token, score, count) VALUES"

            count = 0
            vals = {'token': token}

            for celeb in celeb_scores:
                vals['celeb' + str(count)] = celeb
                vals['score' + str(count)] = str(celeb_scores[celeb][0])
                vals['count' + str(count)] = str(celeb_scores[celeb][1])
                insert_q += "(%(celeb" + str(
                    count) + ")s, %(token)s, %(score" + str(
                        count) + ")s, %(count" + str(count) + ")s),"
                count += 1

            if len(vals.keys()) >= 4:
                # Remove last comma and add rule for duplicate keys.
                insert_q = insert_q[:len(
                    insert_q
                ) - 1] + " ON DUPLICATE KEY UPDATE score=VALUES(score), count=VALUES(count);"

                # EXECUTE QUERY
                self.sql.q(insert_q, vals)

        # After all tokens are scored, move high-scoring words over to celeb_tfidf.
        q = "INSERT INTO celeb_tfidf (SELECT * FROM celeb_tfidf_all WHERE score > 0.005) ON DUPLICATE KEY UPDATE score=VALUES(score), count=VALUES(count)"
        self.sql.q(q)
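
The score computed above is plain tf * idf with no log damping: the term's share of a celebrity's tokens times how rare the term is across celebrities. As a standalone function:

    def tfidf(term_count, total_tokens, celeb_count, celebs_with_term):
        tf = term_count / total_tokens        # share of this celeb's tokens
        idf = celeb_count / celebs_with_term  # rarity across all celebs
        return tf * idf
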
Example #15
    def add(self, tweet, created_at_is_obj=False, tweet_table="tweets"):
        """
        Adds a tweet to tweet_table (celebrity tweet table by default).
        Tweet must be in the format provided by Search API.
        """

        if not self.ids:
            self.ids = [i[0] for i in self.sql.q("SELECT id FROM tweets")]

        debuglog.msg("Inserting tweet", tweet['id'])
        #debuglog.pprint_msg(tweet)

        if not created_at_is_obj:
            dt = datetime.datetime.strptime(
                replaceMonth(tweet['created_at'][5:25]), "%d %m %Y %H:%M:%S")
        else:
            dt = tweet['created_at']

        created_at = dt.strftime("%Y-%m-%d %H:%M:%S")

        dicvals = {
            'created_at': created_at,
            'from_user': tweet['from_user'],
            'from_user_id': tweet['from_user_id'],
            'from_user_name': tweet['from_user_name'],
            'geo': str(tweet['geo']),
            'id': tweet['id'],
            'iso_language_code': tweet['iso_language_code'],
            'metadata': str(tweet['metadata']),
            'profile_image_url': tweet['profile_image_url'],
            'source': tweet['source'],
            'text': tweet['text'],
            'to_user': tweet['to_user'],
            'to_user_id': tweet['to_user_id'],
            'to_user_name': tweet['to_user_name']
        }

        dicq = "INSERT IGNORE INTO " + tweet_table

        dicq += """ VALUES(%(created_at)s,
                           %(from_user)s,
                           %(from_user_id)s,
                           %(from_user_name)s,
                           %(geo)s,
                           %(id)s,
                           %(iso_language_code)s,
                           %(metadata)s,
                           %(profile_image_url)s,
                           %(source)s,
                           %(text)s,
                           %(to_user)s,
                           %(to_user_id)s,
                           %(to_user_name)s)"""

        if tweet['id'] not in self.ids:
            succeeded = False
            try:
                self.sql.q(dicq, dicvals)
                succeeded = True
            except UnicodeEncodeError:
                try:
                    debuglog.msg("\tUNIDECODE ERROR, trying decode...")
                    for k in dicvals:
                        dicvals[k] = unidecode(dicvals[k])
                    self.sql.q(dicq, dicvals)
                    succeeded = True
                except:
                    debuglog.msg("\tUnidecode failed :(")

            if succeeded and tweet_table == 'tweets':
                tokens = self.tfidf_obj.get_tokens(tweet['text'])
                self.addTokens(tweet, tokens)
                self.addTokenMapping(tweet, tokens)

            return succeeded

        debuglog.msg("\ttweet already existed")
        return False
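
add depends on a replaceMonth helper (defined elsewhere in the module) so that the Search API's created_at slice, e.g. "01 Jan 2013 12:00:00", can be parsed with the numeric %m directive. Presumably it swaps the English month abbreviation for its number; a hypothetical sketch:

    MONTHS = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
              'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
              'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}

    def replaceMonth(date_fragment):
        # Hypothetical: replace the month-name hit with its number.
        for name, number in MONTHS.items():
            if name in date_fragment:
                return date_fragment.replace(name, number)
        return date_fragment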
Esempio n. 27
0
    def GetCelebMatchesForUser(self, user):
        """
        Generate object with information about user and matches with celeb (including matching tweets) to pass to the
        UI.

        TODO: Break this into smaller functions, it's way too big.
        """

        results = {
            'user': {
                'screen_name': user,
                'name': '',
                'pic_url': '',
                'personality': ''
            },
            'celeb_matches': [],
            'celeb_matches_pers': []
        }

        # GET USER TWEETS
        user_data = self.GetUserTweets(user)

        # return an error if user doesn't exist/has no tweets.
        if user_data is None or not len(user_data['results']):
            results['status'] = 'error'
            return results

        results['user']['name'] = user_data['results'][0]['user']['name']
        results['user']['pic_url'] = user_data['results'][0]['user'][
            'profile_image_url']

        #Pass user_data and celeb_stats to get celeb matches
        celeb_stats = self.GetCelebTweetStats()
        celeb_matches = celebmatcher.getCelebMatches(user_data, celeb_stats)

        celebs = celeb_matches[1]
        results['user']['personality'] = celeb_matches[0]

        #get pic urls for celeb pers matches
        if len(celebs) > 0:
            q = "SELECT from_user,profile_image_url FROM tweets WHERE from_user="******"%(token" + str(count) + ")s OR from_user="******" GROUP BY from_user"

            q_results = self.sql.q(q, vals)
            celeb_match_pers_array = []
            for res in q_results:
                celeb_match_pers_array.append([res[0], res[1]])
            results['celeb_matches_pers'] = celeb_match_pers_array

        # GET USER TFIDF
        user_tfidf = self.GetUserTFIDFs(user_data)
        user_scores = user_tfidf['scores_dic']

        debuglog.msg("top user terms are", user_tfidf['scores_list'][:15])

        # GET CELEBS TFIDF
        celeb_scores = self.GetCelebTFIDFsForTerms(
            [term[0] for term in user_tfidf['scores_list']][:15])

        # CALCULATE MATCH SCORES
        cumulative_celeb_scores = {}
        celeb_words = {}
        for entry in celeb_scores:
            celeb = entry[0]

            if celeb.lower() == user.lower():
                continue

            token = unidecode.unidecode(entry[1])
            score = float(entry[2])

            if celeb in cumulative_celeb_scores:
                celeb_words[celeb][token] = score
                cumulative_celeb_scores[celeb] += user_scores[token] * score
            else:
                celeb_words[celeb] = {token: score}
                cumulative_celeb_scores[celeb] = user_scores[token] * score

        matches = [(celeb, cumulative_celeb_scores[celeb], celeb_words[celeb])
                   for celeb in cumulative_celeb_scores]
        matches.sort(key=lambda x: -cumulative_celeb_scores[x[0]])

        # FIND MATCHING TWEETS FOR TOP 10 CELEBS
        for top_10_celeb_index in range(min(10, len(matches))):
            celeb_match = {
                'screen_name':
                matches[top_10_celeb_index][0],
                'name':
                '',
                'pic_url':
                '',
                'match_score':
                cumulative_celeb_scores[matches[top_10_celeb_index][0]],
                #'top_words' : matches[top_10_celeb_index][2],
                'top_words': {},
                'tweets': []
            }

            #vals = {'celeb':matches[top_10_celeb_index][0], 'tokens': ' '.join(matches[top_10_celeb_index][2])}
            #q = "SELECT text, id, from_user_name, profile_image_url FROM tweets WHERE from_user=%(celeb)s AND MATCH(text) AGAINST(%(tokens)s)"

            q = "SELECT text, id, from_user_name, profile_image_url FROM tweets, (SELECT tweet_id FROM token_user_mapping WHERE user=%(celeb)s AND token IN ("
            vals = {'celeb': matches[top_10_celeb_index][0]}

            count = 0
            for token in list(matches[top_10_celeb_index][2].keys()):
                # TODO Clean up stopword filtering
                if token.lower() not in self.stopwords:
                    vals['token' + str(count)] = token
                    q += '%(token' + str(count) + ')s, '
                    count += 1

            # trim last comma and space.
            if count:
                q = q[:len(q) - 2]

            q += ")) as t WHERE tweets.id=t.tweet_id;"
            q_results = self.sql.q(q, vals)

            # skip if we don't have any matching celeb tweets.
            if not q_results or not len(q_results):
                continue

            celeb_match['name'] = q_results[0][2]
            celeb_match['pic_url'] = q_results[0][3]
            matching_celeb_tweets = [{
                'text': result[0],
                'id': result[1]
            } for result in q_results]

            matches[top_10_celeb_index] = list(matches[top_10_celeb_index])

            # ADD TWEETS THAT MATCH ON TOKENS
            sorted_tokens = [
                token for token in sorted(
                    matches[top_10_celeb_index][2].keys(),
                    key=lambda x: -matches[top_10_celeb_index][2][x])
            ]
            # TODO Clean up stopword filtering
            for token in list(
                    filter(lambda x: x not in self.stopwords, sorted_tokens)):
                celeb_tweets_for_token = list(
                    filter(
                        lambda x: x['text'].lower().count(token.lower()) > 0,
                        matching_celeb_tweets))
                user_tweets_for_token = [
                    user_tfidf['tweets'][user_tfidf['token_mapping'][token][k]]
                    for k in range(len(user_tfidf['token_mapping'][token]))
                ]

                if len(celeb_tweets_for_token) or len(user_tweets_for_token):
                    celeb_match['top_words'][token] = matches[
                        top_10_celeb_index][2][token]

                for matching_tweets_for_token_index in range(
                        min(len(celeb_tweets_for_token),
                            len(user_tweets_for_token))):
                    celeb_match['tweets'].append({
                        'word': token,
                        'user_tweet': {
                            'url':
                            'http://twitter.com/' + user_tweets_for_token[
                                matching_tweets_for_token_index]['user']
                            ['screen_name'] + '/status/' +
                            str(user_tweets_for_token[
                                matching_tweets_for_token_index]['id']),
                            'text':
                            user_tweets_for_token[
                                matching_tweets_for_token_index]['text']
                        },
                        'celeb_tweet': {
                            'url':
                            'http://twitter.com/' +
                            celeb_match['screen_name'] + '/status/' +
                            str(celeb_tweets_for_token[
                                matching_tweets_for_token_index]['id']),
                            'text':
                            celeb_tweets_for_token[
                                matching_tweets_for_token_index]['text']
                        }
                    })

            #matches[top_10_celeb_index].append({matches[top_10_celeb_index][0]:matching_celeb_tweets,user:matching_user_tweets})

            if len(celeb_match['tweets']):
                results['celeb_matches'].append(celeb_match)

        results['status'] = 'ok'
        results['permalink_id'] = self.StorePermalink(results)
        return results
Esempio n. 28
0
    def fetchTopUserTimelines(self):
        top_users = open('update_users.txt','r').readlines()
        top_users = [user.replace('\n','') for user in top_users]

        for user in top_users:
            debuglog.msg("Getting timeline for",user)
            status='retry'
            while status == 'retry' or status=='wait':
                debuglog.msg(status)
                debuglog.msg("\tFetching timeline for @%s in %s seconds..."%(user, str(self.checkRateLimit())))
                status = self.fetchUserTimeline(user)['status']
                time.sleep(1)
                time.sleep(self.checkRateLimit())
                
            if status == 'success':
                debuglog.msg("\tGot timeline for %s :)"%user)
            elif status == '404':
                debuglog.msg("\tUser not found.")
            else:
                debuglog.msg("\tUnknown error prevented getting user timeline.")
Esempio n. 29
0
def getCelebMatches(userdata, celebstats):
    # FIRST CALCULATE BASE STATS FOR USER
    # declare all counts
    num_days = 0.0

    num_per_time = [0.0] * 6

    num_per_weekday = [0.0] * 7
    num_at = 0.0
    num_rt = 0.0
    num_hash = 0.0
    num_links = 0.0

    mentions = []
    hashes = []

    user_name = ""
    # tweets per day logic depends on results coming back in chronological order
    # MAKE SURE THIS IS ALWAYS THE CASE IN SEARCH API, REST API, CACHE
    # IF NOT, NEED TO SORT AS PRE PROCESSING STEP
    # JON NEEDS TO PUT IN A SIMILAR CHECK, NON EXISTENT USERS & USERS W 0 TWEETS CRASH CODE CURRENTLY
    if len(userdata["results"]) == 0 or type(userdata["results"]) is dict:
        return {}

    else:
        if len(userdata["results"]) > 0:
            created_at = tweetadder.replaceMonth(userdata["results"][0]["created_at"])
            cur_datetime = datetime.datetime(
                int(created_at[25:]),
                int(created_at[4:6]),
                int(created_at[7:9]),
                int(created_at[10:12]),
                int(created_at[13:15]),
                int(created_at[16:18]),
            )
            num_days += 1
            user_name = userdata["results"][0]["user"]["screen_name"]

        for tweet in userdata["results"]:
            created_at = tweetadder.replaceMonth(tweet["created_at"])
            created = datetime.datetime(
                int(created_at[25:]),
                int(created_at[4:6]),
                int(created_at[7:9]),
                int(created_at[10:12]),
                int(created_at[13:15]),
                int(created_at[16:18]),
            )

            text = tweet["text"]

            # update day count
            if (
                created.day != cur_datetime.day
                or created.month != cur_datetime.month
                or created.year != cur_datetime.year
            ):
                cur_datetime = created
                num_days += 1

            # update num_per_time count
            num_per_time[math.floor(created.hour / 4)] += 1

            # update num_per_weekday count
            num_per_weekday[created.weekday()] += 1

            # Get RT @ and # counts
            link = False
            mention = False
            rt = False
            has = False
            for word in text.split(" "):
                if "http://" in word and not link:
                    num_links += 1
                    link = True

                if len(word) > 0 and word[0] == "@" and word[1:] != user_name:
                    mentions.append(word)
                    if not mention:
                        num_at += 1
                        mention = True

                if "RT" == word and not rt:
                    num_rt += 1
                    rt = True

                if len(word) > 0 and word[0] == "#":
                    hashes.append(word)
                    if not has:
                        num_hash += 1
                        has = True

        mention_count = collections.Counter(mentions)
        unique_mentions = -1.0
        if len(mentions) != 0:
            unique_mentions = float(len(mention_count)) / len(mentions)

        hash_count = collections.Counter(hashes)
        unique_hashes = -1.0
        if len(hashes) != 0:
            unique_hashes = float(len(hash_count)) / len(hashes)

        total_tweets = len(userdata["results"])
        userstats = {}
        if total_tweets != 0:
            userstats = {
                "tr_day": float(total_tweets) / num_days,
                "tr_monday": num_per_weekday[0] / total_tweets,
                "tr_tuesday": num_per_weekday[1] / total_tweets,
                "tr_wednesday": num_per_weekday[2] / total_tweets,
                "tr_thursday": num_per_weekday[3] / total_tweets,
                "tr_friday": num_per_weekday[4] / total_tweets,
                "tr_saturday": num_per_weekday[5] / total_tweets,
                "tr_sunday": num_per_weekday[6] / total_tweets,
                "tr_latenight": num_per_time[0] / total_tweets,
                "tr_earlymorning": num_per_time[1] / total_tweets,
                "tr_morning": num_per_time[2] / total_tweets,
                "tr_afternoon": num_per_time[3] / total_tweets,
                "tr_evening": num_per_time[4] / total_tweets,
                "tr_night": num_per_time[5] / total_tweets,
                "mention_rate": float(num_at) / total_tweets,
                "retweet_rate": float(num_rt) / total_tweets,
                "hash_rate": float(num_hash) / total_tweets,
                "link_rate": float(num_links) / total_tweets,
                "unique_hash": unique_hashes,
                "unique_mention": unique_mentions,
                "user": user_name,
            }

    # Calculate percentile stats for the user: N counts the celebs; for each
    # metric, B_* counts celebs strictly below the user's value and E_* counts
    # celebs equal to it (m: mentions, h: hashtags, l: links, um: unique
    # mentions, uh: unique hashtags).
    N = 0
    B_m = 0
    E_m = 0
    B_h = 0
    E_h = 0
    B_l = 0
    E_l = 0
    B_um = 0
    E_um = 0
    B_uh = 0
    E_uh = 0

    for celeb in celebstats:
        # counts for percentiles
        N += 1
        if celeb[14] < userstats["mention_rate"]:
            B_m += 1
        elif celeb[14] == userstats["mention_rate"]:
            E_m += 1

        if celeb[16] < userstats["hash_rate"]:
            B_h += 1
        elif celeb[16] == userstats["hash_rate"]:
            E_h += 1

        if celeb[17] < userstats["link_rate"]:
            B_l += 1
        elif celeb[17] == userstats["link_rate"]:
            E_l += 1

        if celeb[18] < userstats["unique_hash"]:
            B_uh += 1
        elif celeb[18] == userstats["unique_hash"]:
            E_uh += 1

        if celeb[19] < userstats["unique_mention"]:
            B_um += 1
        elif celeb[19] == userstats["unique_mention"]:
            E_um += 1

    # CALCULATE PERCENTILES (percentile rank: P = ((B + 0.5 * E) / N) * 100)
    if N == 0:
        # No celeb stats to compare against; mirror the empty-results case above.
        return {}
    P_m = ((B_m + 0.5 * E_m) / N) * 100
    P_h = ((B_h + 0.5 * E_h) / N) * 100
    P_l = ((B_l + 0.5 * E_l) / N) * 100
    P_um = ((B_um + 0.5 * E_um) / N) * 100
    P_uh = ((B_uh + 0.5 * E_uh) / N) * 100
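    # Worked example with illustrative numbers: for N = 200 celebs, B_m = 120
    # strictly below the user's mention rate and E_m = 10 equal to it,
    # P_m = ((120 + 0.5 * 10) / 200) * 100 = 62.5, i.e. the user mentions
    # other accounts more often than about 62% of celebs.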

    # use all info about user to get personality
    if P_l > 50.0:
        dim_2 = "S"
    else:
        debuglog.msg(P_l)
        dim_2 = "C"

    if (P_m + P_um) / 2.0 > 50.0:
        dim_3 = "W"
    else:
        dim_3 = "T"

    if (P_h + P_uh) / 2.0 > 50.0:
        dim_4 = "J"
    else:
        dim_4 = "M"

    # get average tweet rates
    m_fri = (
        sum(
            [
                userstats["tr_monday"],
                userstats["tr_tuesday"],
                userstats["tr_wednesday"],
                userstats["tr_thursday"],
                userstats["tr_friday"],
            ]
        )
        / 5.0
    )
    sa_su = sum([userstats["tr_saturday"], userstats["tr_sunday"]]) / 2.0
    day = sum([userstats["tr_morning"], userstats["tr_afternoon"]]) / 2.0
    night = (
        sum([userstats["tr_earlymorning"], userstats["tr_latenight"], userstats["tr_evening"], userstats["tr_night"]])
        / 4.0
    )

    # avg tweet rate on time off
    tr_weekend = (sa_su + night) / 2.0
    tr_weekday = ((m_fri + day) / 2.0) * 1.3
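    # The 1.3 factor up-weights weekday/daytime activity, so dim_1 only comes
    # out as "E" when off-hours tweeting clearly dominates.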

    if tr_weekday > tr_weekend:
        dim_1 = "A"
    else:
        dim_1 = "E"

    # GO THROUGH LIST OF CELEBS AGAIN AND FIND THOSE WITH SAME PERSONALITY
    matches = []

    for celeb in celebstats:
        if celeb[26] == dim_1 and celeb[27] == dim_2 and celeb[28] == dim_3 and celeb[29] == dim_4:
            matches.append(celeb[20])

    toreturn = [dim_1 + dim_2 + dim_3 + dim_4]
    random.shuffle(matches)
    toreturn.append(matches[:24])

    return toreturn
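
A minimal usage sketch for the function above, with hypothetical inputs shaped to match the indexing it relies on: a REST-API-style created_at with the month already numeric, and a celebstats row carrying rates at indices 14-19, the screen name at index 20, and the four personality letters at indices 26-29 (all inferred from the code, not confirmed by the source).

# Hypothetical inputs for illustration only.
userdata = {"results": [{
    "created_at": "Sat 09 29 02:51:43 +0000 2012",
    "text": "RT @friend check this http://example.com #demo",
    "user": {"screen_name": "some_user"},
}]}

celeb = [None] * 30                                # hypothetical celebstats row
celeb[14], celeb[16], celeb[17] = 0.4, 0.2, 0.1    # mention/hash/link rates
celeb[18], celeb[19] = 0.5, 0.5                    # unique hash/mention ratios
celeb[20] = "some_celeb"
celeb[26:30] = ["A", "S", "W", "J"]

result = getCelebMatches(userdata, [celeb])
# result[0] is the four-letter type; result[1] holds up to 24 matching celebs.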
Example n. 30
    def Generate(self):
        self.GenerateDocFreqsTable()

        celeb_count = self.GetCelebCount()

        tokens = self.GetTokens()
        
        q = """SELECT token_counts.user, token_counts.c, COUNT(token_user_mapping.user) as total_user_tokens
                  FROM
                    (SELECT t.user, t.token, t.c
                      FROM
                        (SELECT user, token, COUNT(*) as c
                          FROM token_user_mapping
                          WHERE token=%(token)s
                          GROUP BY user) as t
                        ORDER BY t.user) as token_counts, token_user_mapping
                  WHERE token_user_mapping.user = token_counts.user GROUP BY token_user_mapping.user"""
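        # For the given token, the inner subquery counts that token's
        # occurrences per celeb; joining back onto token_user_mapping and
        # grouping by user adds each celeb's total token count, so every row
        # comes back as (celeb, count_of_this_token, total_tokens_for_celeb).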

        # ITERATE THROUGH TOKENS
        for token in tokens:
            if len(token) < 3:
                debuglog.msg("token %s too short." % token)
                continue
            elif token[0] == '@':
                debuglog.msg("ignoring user token %s" % token)
                continue

            debuglog.msg("Generating tfidf table for token <%s>" % token)
            vals = {'token': token}
            results = self.sql.q(q, vals)

            if results is None:
                continue
            
            # CALCULATE SCORES
            # The score is a raw-ratio tf-idf: (term count / celeb's total
            # tokens) * (total celebs / celebs using the term), with no log
            # damping on the idf part.
            celebs_with_term = len(results)
            celeb_scores = {}
            for result in results:
                celeb = result[0]
                term_count_for_celeb = result[1]
                total_tokens_for_celeb = result[2]
                celeb_tfidf_for_term = float((Decimal(term_count_for_celeb) / Decimal(total_tokens_for_celeb)) *
                                             (Decimal(celeb_count) / Decimal(celebs_with_term)))
                celeb_scores[celeb] = (celeb_tfidf_for_term, term_count_for_celeb)

            # GENERATE QUERY
            insert_q = "INSERT INTO celeb_tfidf_all (user, token, score, count) VALUES"

            count = 0
            vals = {'token': token}

            for celeb in celeb_scores:
                vals['celeb' + str(count)] = celeb
                vals['score' + str(count)] = str(celeb_scores[celeb][0])
                vals['count' + str(count)] = str(celeb_scores[celeb][1])
                insert_q += "(%(celeb" + str(count) + ")s, %(token)s, %(score" + str(count) + ")s, %(count" + str(count) + ")s),"
                count += 1

            if len(vals) >= 4:  # the 'token' key plus at least one celeb's three keys
                # Remove the trailing comma and add a rule for duplicate keys.
                insert_q = insert_q[:-1] + " ON DUPLICATE KEY UPDATE score=VALUES(score), count=VALUES(count);"

                # EXECUTE QUERY
                self.sql.q(insert_q, vals)

            # Promote high-scoring words to celeb_tfidf. This uses its own
            # variable: reassigning q here would clobber the per-token SELECT
            # above on the next iteration of the token loop.
            promote_q = "INSERT INTO celeb_tfidf (SELECT * FROM celeb_tfidf_all WHERE score > 0.005) ON DUPLICATE KEY UPDATE score=VALUES(score), count=VALUES(count)"
            self.sql.q(promote_q)
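
The same score, detached from the database, as a minimal self-contained sketch. The row tuples mirror the (celeb, term_count, total_tokens) shape the query above returns; the function name and sample numbers are illustrative only.

from decimal import Decimal

def tfidf_scores(rows, celeb_count):
    # rows: (celeb, term_count_for_celeb, total_tokens_for_celeb) tuples for
    # one token; celeb_count: total number of celebs in the corpus.
    celebs_with_term = len(rows)
    scores = {}
    for celeb, term_count, total_tokens in rows:
        tf = Decimal(term_count) / Decimal(total_tokens)
        idf = Decimal(celeb_count) / Decimal(celebs_with_term)  # raw ratio, no log
        scores[celeb] = float(tf * idf)
    return scores

# tfidf_scores([("celeb_a", 5, 1000), ("celeb_b", 1, 2000)], 500)
# -> {'celeb_a': 1.25, 'celeb_b': 0.125}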