def perftest(func):
    # NOTE: only the inner wrapper survived in the source; the outer decorator
    # name `perftest` and the trailing `return wrapper` are assumed here so the
    # fragment is a complete, usable decorator.
    def wrapper(*args, **kw):
        start_time = time.time()
        value = func(*args, **kw)
        debuglog.msg("\nPERFTEST===============\nFunction:\t%s\nTime:\t%s\n"
                     % (func.__module__ + "::" + func.__name__,
                        str(time.time() - start_time)),
                     mode="debug")
        return value
    return wrapper
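# A hedged usage sketch for the timing decorator above (`slow_op` is a made-up
# example, not part of the project): each call emits a PERFTEST entry to
# debuglog with the module-qualified function name and the elapsed seconds.
@perftest
def slow_op():
    time.sleep(0.5)
# slow_op() would log roughly: "Function: __main__::slow_op  Time: 0.5..."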
def fetchUserTweets(self, user):
    """ Fetch tweets for user and add to the database. """
    data = self.getUserTweetsData(user)
    for tweet in data['results']:
        if self.tweet_adder.add(tweet):
            debuglog.msg("successfully added")
        else:
            debuglog.msg("failed to add :(")
    return True
def fixTokensInterrupted(self):
    with open('missing_tweets2.txt') as f:
        missing_tweets = [line.replace('\n', '') for line in f.readlines()]
    q = "SELECT text, from_user, id FROM tweets WHERE id IN("
    q += ','.join(missing_tweets) + ')'
    results = self.sql.q(q)
    # pprint.pprint(results)
    # return
    failures = []
    f = open('token_fix_failures.txt', 'w')
    for result in results:
        debuglog.msg("Adding tokens for tweet", result[2])
        try:
            self.addTokens({'text': result[0], 'from_user': result[1]})
            self.addTokenMapping({'text': result[0],
                                  'from_user': result[1],
                                  'id': result[2]})
        except Exception:  # was a bare except, which also swallows KeyboardInterrupt
            failures.append(result[2])
            debuglog.msg("\tAdding tokens failed!")
            debuglog.msg("\tFailures so far:", len(failures))
            f.write(str(result[2]) + "\n")  # str() in case ids come back as ints
    f.close()
    debuglog.msg(failures)
def checkRateLimit(self):
    debuglog.pprint_msg(self.rate_data)
    debuglog.msg("REMAINING HITS:", self.rate_data['remaining_hits'])
    if time.time() > self.rate_data['reset_time_in_seconds']:
        self.rate_data = self.fetchRateData()
        if "error" in self.rate_data:
            return 60 * 60  # back off for an hour if the rate data can't be refreshed
    if self.rate_data['remaining_hits'] <= 1:
        debuglog.msg("rate limit: wait",
                     self.rate_data['reset_time_in_seconds'] - time.time())
        return self.rate_data['reset_time_in_seconds'] - time.time()
    return 0
def GetTermIDFs(self, terms):
    if not terms or not len(terms):
        # was `json.loads({"idfs": []})`, which raises TypeError on a dict;
        # return the empty payload directly instead.
        return {"idfs": []}
    url = 'http://50.56.221.228/cgi-bin/idf.php?'
    # TODO: HTML entity encoding (?)
    # TODO: Enhanced encoding detection - the first term's encoding may not always be appropriate.
    data = ('terms=' + ','.join(terms).replace("#", "%23")).encode("utf-8")
    debuglog.msg(data)
    txt_unicode = UnicodeDammit(urllib.request.urlopen(url, data).read())
    txt = txt_unicode.unicode_markup
    txt = txt.replace(",null:", ',"null":')  # workaround for malformed JSON from the endpoint
    data = json.loads(txt, encoding=txt_unicode.original_encoding)
    return data
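# A hedged usage sketch (an assumed call site, not from the source): GetTermIDFs
# POSTs a comma-joined term list to the idf.php endpoint and returns the parsed
# JSON, so a caller would read the "idfs" payload. Note "#" is sent as "%23".
#
#   idf_data = self.GetTermIDFs(["python", "#nlp"])
#   debuglog.msg(idf_data["idfs"])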
def q(self, query, values=None, auto_str=True):
    try:
        if not values:
            self.cur.execute(query)
        else:
            self.cur.execute(query, values)
        self.conn.commit()
        return self.cur.fetchall()
    except Exception:  # was a bare except
        debuglog.msg("Query failed!")
        debuglog.msg("Query:", query)
        if values is not None:
            debuglog.msg("Vals:")
            debuglog.pprint_msg(values)
        # traceback.print_exc(file=sys.stdout)
        debuglog.msg(traceback.format_exc())
        return None  # the original dangling `exit` referenced the builtin without calling it (a no-op)
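# A hedged usage sketch for q() (the table and column names are illustrative):
# values are passed separately so the MySQL driver handles quoting/escaping,
# and a None result means the query failed and the traceback was logged.
#
#   rows = SQLQuery().q("SELECT id FROM tweets WHERE from_user=%(u)s", {'u': 'someuser'})
#   if rows is None:
#       debuglog.msg("lookup failed")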
def getUserTweetsData(self, user):
    debuglog.msg("=====\n\nGetting data for @%s from Search API..." % user)
    query_url = ""  # initialized so the except block can't hit a NameError
    try:
        twitter_query = "from:%s" % user
        twitter_query = urllib.parse.quote(twitter_query)
        query_url = "http://search.twitter.com/search.json?lang=en&rpp=100&q=%s" % twitter_query
        response_unicode = UnicodeDammit(urllib.request.urlopen(query_url).read())
        data = json.loads(response_unicode.unicode_markup,
                          encoding=response_unicode.original_encoding
                          if response_unicode.original_encoding else "utf-8")
        debuglog.msg("\tGot %s tweets for @%s from Search API."
                     % (str(len(data['results'])), user))
        return data
    except Exception:  # was a bare except
        debuglog.msg("\tFailed to get data from Search API :(")
        debuglog.msg("\t\tURL:\t%s" % query_url)
        return {'results': []}
def fixTokens(self):
    q = "SELECT text, from_user, id FROM tweets"
    results = self.sql.q(q)
    failures = []
    f = open('token_fix_failures.txt', 'w')
    for result in results:
        debuglog.msg("Adding tokens for tweet", result[2])
        try:
            self.addTokens({'text': result[0], 'from_user': result[1]})
            self.addTokenMapping({'text': result[0],
                                  'from_user': result[1],
                                  'id': result[2]})
        except Exception:  # was a bare except
            failures.append(result[2])
            debuglog.msg("\tAdding tokens failed!")
            debuglog.msg("\tFailures so far:", len(failures))
            f.write(str(result[2]) + "\n")
    f.close()
    debuglog.msg(failures)
def fetchTopUserTweets(self, start_at=None):
    debuglog.msg("Fetching all celebrity tweets...")
    q = "SELECT DISTINCT user FROM celebs"
    results = SQLQuery().q(q)
    users = [result[0] for result in results]
    if start_at:
        users = users[users.index(start_at):]
    for user in users:
        if self.fetchUserTweets(user):
            debuglog.msg("\tSuccessfully fetched tweets for @%s :)" % user)
        else:
            debuglog.msg("\tFailed to fetch tweets for @%s :(" % user)
        time.sleep(1)
def fetchUserTimeline(self, user, format="default", use_cache=True,
                      write_cache=True, use_filesystem_cache=False):
    # TODO: Clean this function up; the format parameter does magic it shouldn't be doing.
    # Currently format="default" means we're adding celebrity timeline tweets; we never call this.
    # If we do call with format="default" we want to add the timeline tweets to the celebrity tweets table.
    # This is called from DataGrabber to get user timelines with format="searchapi". In that case we want to
    # check whether we have matching non-celebrity tweets and, if so, return them (in future: possibly add
    # new tweets from the search api as well). If not, get tweets from the timeline api, store them in the
    # tweets_non_celeb table, and return an object with those tweets.
    # Also, if a user is cached and called with default, we just get back the cached data and insert nothing.
    debuglog.msg("Fetching timeline for @%s..." % user)
    got_cache_data = False
    json_txt = "{}"
    json_encoding = "utf-8"
    if use_cache and not use_filesystem_cache:
        q = "SELECT * FROM tweets_non_celeb WHERE from_user=%(user)s;"
        vals = {'user': user}
        cached_tweets = self.sql.q(q, vals)
        if len(cached_tweets) > 0:
            return [tweet[0] for tweet in cached_tweets]
    elif use_cache and use_filesystem_cache:
        debuglog.msg("\tchecking cache...")
        cached_list = os.listdir('./timelines')
        userjsonfilename = user.lower() + '.json'
        if userjsonfilename in cached_list:
            # modtime = os.stat('./timelines/'+userjsonfilename)[ST_MTIME]
            # # cache stays fresh for a day
            # if ((float(time.time()) - modtime)/60)/60 <= 24:
            debuglog.msg("\t\tgot cache data.")
            json_txt = open('./timelines/' + userjsonfilename, 'r').read()
            got_cache_data = True
    if not got_cache_data:
        debuglog.msg("\tNo cache data, calling timeline api...")
        if self.checkRateLimit() > 0:
            debuglog.msg("\t\tHave to wait.")
            return {'status': 'wait'}
        url = "https://api.twitter.com/1/statuses/user_timeline.json?&screen_name=%s&count=150" % user
        debuglog.msg(url)
        try:
            response = urllib.request.urlopen(url)
            debuglog.msg(response.info())
        except urllib.error.HTTPError as e:
            if "404" in str(e):
                return {'status': '404'}
            elif "502" in str(e):
                return {'status': 'retry'}
            else:
                return {'status': 'error'}
        json_unicode = UnicodeDammit(response.read())
        json_txt = json_unicode.unicode_markup
        if json_unicode.original_encoding:
            json_encoding = json_unicode.original_encoding
        if write_cache and use_filesystem_cache:
            fname = './timelines/' + user.lower() + '.json'
            with open(fname, 'wt') as f:
                os.chmod(fname, 0o777)
                f.write(json_txt)
    data = json.loads(json_txt, encoding=json_encoding)
    debuglog.msg("\tdata is...", str(data)[:100])
    if format == "searchapi":
        # For now, format="searchapi" indicates we are getting non-celebrity tweets.
        debuglog.msg("\tGot %d results for %s from user timeline API." % (len(data), user))
        if write_cache and not use_filesystem_cache:
            for non_celeb_timeline_tweet in data:
                self.tweet_adder.addNonCelebTimelineTweet(non_celeb_timeline_tweet)
        return {'results': data}
    # For now, format="default" (the only way to reach here) means we are adding celebrity tweets.
    for timeline_tweet in data:
        self.tweet_adder.addTimelineTweet(timeline_tweet)
    return {'status': 'success'}
def fetchTopUserTimelines(self):
    top_users = open('update_users.txt', 'r').readlines()
    top_users = [user.replace('\n', '') for user in top_users]
    for user in top_users:
        debuglog.msg("Getting timeline for", user)
        status = 'retry'
        while status == 'retry' or status == 'wait':
            debuglog.msg(status)
            debuglog.msg("\tFetching timeline for @%s in %s seconds..."
                         % (user, str(self.checkRateLimit())))
            status = self.fetchUserTimeline(user)['status']
            time.sleep(1)
            time.sleep(self.checkRateLimit())
        if status == 'success':
            debuglog.msg("\tGot timeline for %s :)" % user)
        elif status == '404':
            debuglog.msg("\tUser not found.")
        else:
            debuglog.msg("\tUnknown error prevented getting user timeline.")
def add(self, tweet, created_at_is_obj=False, tweet_table="tweets"):
    """ Adds a tweet to tweet_table (the celebrity tweet table by default).
        The tweet must be in the format provided by the Search API. """
    if not self.ids:
        self.ids = [i[0] for i in self.sql.q("SELECT id FROM tweets")]
    debuglog.msg("Inserting tweet", tweet['id'])
    # debuglog.pprint_msg(tweet)
    if not created_at_is_obj:
        dt = datetime.datetime.strptime(replaceMonth(tweet['created_at'][5:25]),
                                        "%d %m %Y %H:%M:%S")
    else:
        dt = tweet['created_at']
    created_at = dt.strftime("%Y-%m-%d %H:%M:%S")
    dicvals = {'created_at': created_at,
               'from_user': tweet['from_user'],
               'from_user_id': tweet['from_user_id'],
               'from_user_name': tweet['from_user_name'],
               'geo': str(tweet['geo']),
               'id': tweet['id'],
               'iso_language_code': tweet['iso_language_code'],
               'metadata': str(tweet['metadata']),
               'profile_image_url': tweet['profile_image_url'],
               'source': tweet['source'],
               'text': tweet['text'],
               'to_user': tweet['to_user'],
               'to_user_id': tweet['to_user_id'],
               'to_user_name': tweet['to_user_name']}
    dicq = "INSERT IGNORE INTO " + tweet_table
    dicq += """ VALUES(%(created_at)s, %(from_user)s, %(from_user_id)s,
        %(from_user_name)s, %(geo)s, %(id)s, %(iso_language_code)s,
        %(metadata)s, %(profile_image_url)s, %(source)s, %(text)s,
        %(to_user)s, %(to_user_id)s, %(to_user_name)s)"""
    if tweet['id'] not in self.ids:
        succeeded = False
        try:
            self.sql.q(dicq, dicvals)
            succeeded = True
        except UnicodeEncodeError:
            try:
                debuglog.msg("\tUNICODE ERROR, trying unidecode...")
                # note: non-string values (e.g. numeric ids) will make
                # unidecode raise, landing in the except below.
                for k in dicvals:
                    dicvals[k] = unidecode(dicvals[k])
                self.sql.q(dicq, dicvals)
                succeeded = True
            except Exception:  # was a bare except
                debuglog.msg("\tUnidecode failed :(")
        if succeeded and tweet_table == 'tweets':
            tokens = self.tfidf_obj.get_tokens(tweet['text'])
            self.addTokens(tweet, tokens)
            self.addTokenMapping(tweet, tokens)
        return succeeded
    debuglog.msg("\ttweet already existed")
    return False
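# `replaceMonth` is referenced above (and as tweetadder.replaceMonth below) but
# is not defined in this file. A minimal sketch of what it plausibly does,
# given the "%d %m %Y %H:%M:%S" strptime format applied to timestamps like
# "Tue, 19 Feb 2013 13:26:43 +0000" — this is an assumption, not the project's
# actual implementation:
def replaceMonth(date_str):
    # swap each three-letter English month abbreviation for its two-digit number
    months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
              'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
              'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    for name, num in months.items():
        date_str = date_str.replace(name, num)
    return date_str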
def getCelebMatches(userdata, celebstats):
    # FIRST CALCULATE BASE STATS FOR USER
    # declare all counts
    num_days = 0.0
    num_per_time = [0.0] * 6
    num_per_weekday = [0.0] * 7
    num_at = 0.0
    num_rt = 0.0
    num_hash = 0.0
    num_links = 0.0
    mentions = []
    hashes = []
    user_name = ""
    # tweets-per-day logic depends on results coming back in chronological order
    # MAKE SURE THIS IS ALWAYS THE CASE IN SEARCH API, REST API, CACHE
    # IF NOT, NEED TO SORT AS A PRE-PROCESSING STEP
    # JON NEEDS TO PUT IN A SIMILAR CHECK; NON-EXISTENT USERS & USERS W/ 0 TWEETS CRASH THE CODE CURRENTLY
    if len(userdata['results']) == 0 or type(userdata['results']) is dict:
        return {}
    if len(userdata['results']) > 0:
        created_at = tweetadder.replaceMonth(userdata['results'][0]['created_at'])
        cur_datetime = datetime.datetime(int(created_at[25:]), int(created_at[4:6]),
                                         int(created_at[7:9]), int(created_at[10:12]),
                                         int(created_at[13:15]), int(created_at[16:18]))
        num_days += 1
        user_name = userdata['results'][0]["user"]["screen_name"]
    for tweet in userdata['results']:
        created_at = tweetadder.replaceMonth(tweet['created_at'])
        created = datetime.datetime(int(created_at[25:]), int(created_at[4:6]),
                                    int(created_at[7:9]), int(created_at[10:12]),
                                    int(created_at[13:15]), int(created_at[16:18]))
        text = tweet['text']
        # update day count
        if (created.day != cur_datetime.day or created.month != cur_datetime.month
                or created.year != cur_datetime.year):
            cur_datetime = created
            num_days += 1
        # update num_per_time count (six four-hour buckets)
        num_per_time[math.floor(created.hour / 4)] += 1
        # update num_per_weekday count
        num_per_weekday[created.weekday()] += 1
        # Get RT, @ and # counts
        link = False
        mention = False
        rt = False
        has = False
        for word in text.split(" "):
            if "http://" in word and not link:
                num_links += 1
                link = True
            if len(word) > 0 and word[0] == "@" and word[1:] != user_name:
                mentions.append(word)
                if not mention:
                    num_at += 1
                    mention = True
            if "RT" == word and not rt:
                num_rt += 1
                rt = True
            if len(word) > 0 and word[0] == "#":
                hashes.append(word)
                if not has:
                    num_hash += 1
                    has = True
    mention_count = collections.Counter(mentions)
    unique_mentions = -1.0
    if len(mentions) != 0:
        unique_mentions = float(len(mention_count)) / len(mentions)
    hash_count = collections.Counter(hashes)
    unique_hashes = -1.0
    if len(hashes) != 0:
        unique_hashes = float(len(hash_count)) / len(hashes)
    total_tweets = len(userdata['results'])
    userstats = {}
    if total_tweets != 0:
        userstats = {"tr_day": float(total_tweets) / num_days,
                     "tr_monday": num_per_weekday[0] / total_tweets,
                     "tr_tuesday": num_per_weekday[1] / total_tweets,
                     "tr_wednesday": num_per_weekday[2] / total_tweets,
                     "tr_thursday": num_per_weekday[3] / total_tweets,
                     "tr_friday": num_per_weekday[4] / total_tweets,
                     "tr_saturday": num_per_weekday[5] / total_tweets,
                     "tr_sunday": num_per_weekday[6] / total_tweets,
                     "tr_latenight": num_per_time[0] / total_tweets,
                     "tr_earlymorning": num_per_time[1] / total_tweets,
                     "tr_morning": num_per_time[2] / total_tweets,
                     "tr_afternoon": num_per_time[3] / total_tweets,
                     "tr_evening": num_per_time[4] / total_tweets,
                     "tr_night": num_per_time[5] / total_tweets,
                     "mention_rate": float(num_at) / total_tweets,
                     "retweet_rate": float(num_rt) / total_tweets,
                     "hash_rate": float(num_hash) / total_tweets,
                     "link_rate": float(num_links) / total_tweets,
                     "unique_hash": unique_hashes,
                     "unique_mention": unique_mentions,
                     "user": user_name}
    # calculate percentile stats for user
    N = 0
    B_m = E_m = 0
    B_h = E_h = 0
    B_l = E_l = 0
    B_um = E_um = 0
    B_uh = E_uh = 0
    for celeb in celebstats:
        # counts for percentiles: B_* = celebs strictly below, E_* = ties
        N += 1
        if celeb[14] < userstats["mention_rate"]:
            B_m += 1
        elif celeb[14] == userstats["mention_rate"]:
            E_m += 1
        if celeb[16] < userstats["hash_rate"]:
            B_h += 1
        elif celeb[16] == userstats["hash_rate"]:
            E_h += 1
        if celeb[17] < userstats["link_rate"]:
            B_l += 1
        elif celeb[17] == userstats["link_rate"]:
            E_l += 1
        if celeb[18] < userstats["unique_hash"]:
            B_uh += 1
        elif celeb[18] == userstats["unique_hash"]:
            E_uh += 1
        if celeb[19] < userstats["unique_mention"]:
            B_um += 1
        elif celeb[19] == userstats["unique_mention"]:
            E_um += 1
    # CALCULATE PERCENTILES
    P_m = ((B_m + 0.5 * E_m) / N) * 100
    P_h = ((B_h + 0.5 * E_h) / N) * 100
    P_l = ((B_l + 0.5 * E_l) / N) * 100
    P_um = ((B_um + 0.5 * E_um) / N) * 100
    P_uh = ((B_uh + 0.5 * E_uh) / N) * 100
    # use all info about the user to get a personality
    if P_l > 50.0:
        dim_2 = "S"
    else:
        debuglog.msg(P_l)
        dim_2 = "C"
    if (P_m + P_um) / 2.0 > 50.0:
        dim_3 = "W"
    else:
        dim_3 = "T"
    if (P_h + P_uh) / 2.0 > 50.0:
        dim_4 = "J"
    else:
        dim_4 = "M"
    # get average tweet rates
    m_fri = sum([userstats["tr_monday"], userstats["tr_tuesday"],
                 userstats["tr_wednesday"], userstats["tr_thursday"],
                 userstats["tr_friday"]]) / 5.0
    sa_su = sum([userstats["tr_saturday"], userstats["tr_sunday"]]) / 2.0
    day = sum([userstats["tr_morning"], userstats["tr_afternoon"]]) / 2.0
    night = sum([userstats["tr_earlymorning"], userstats["tr_latenight"],
                 userstats["tr_evening"], userstats["tr_night"]]) / 4.0
    # avg tweet rate on time off
    tr_weekend = (sa_su + night) / 2.0
    tr_weekday = ((m_fri + day) / 2.0) * 1.3
    if tr_weekday > tr_weekend:
        dim_1 = "A"
    else:
        dim_1 = "E"
    # GO THROUGH THE LIST OF CELEBS AGAIN AND FIND THOSE WITH THE SAME PERSONALITY
    matches = []
    for celeb in celebstats:
        if (celeb[26] == dim_1 and celeb[27] == dim_2
                and celeb[28] == dim_3 and celeb[29] == dim_4):
            matches.append(celeb[20])
    toreturn = [dim_1 + dim_2 + dim_3 + dim_4]
    random.shuffle(matches)
    # matches[0:24] and matches[0:len(matches)] collapse to a single slice
    toreturn.append(matches[:24])
    return toreturn
def GetCelebMatchesForUser(self, user):
    """ Generate an object with information about the user and their celeb
        matches (including matching tweets) to pass to the UI.
        TODO: Break this into smaller functions, it's way too big. """
    results = {'user': {'screen_name': user, 'name': '', 'pic_url': '',
                        'personality': ''},
               'celeb_matches': [],
               'celeb_matches_pers': []}
    # GET USER TWEETS
    user_data = self.GetUserTweets(user)
    # return an error if the user doesn't exist / has no tweets.
    if user_data is None or not len(user_data['results']):
        results['status'] = 'error'
        return results
    results['user']['name'] = user_data['results'][0]['user']['name']
    results['user']['pic_url'] = user_data['results'][0]['user']['profile_image_url']
    # Pass user_data and celeb_stats to get celeb matches
    celeb_stats = self.GetCelebTweetStats()
    celeb_matches = celebmatcher.getCelebMatches(user_data, celeb_stats)
    celebs = celeb_matches[1]
    results['user']['personality'] = celeb_matches[0]
    # get pic urls for celeb pers matches
    if len(celebs) > 0:
        # NOTE: parts of this query-building block were redacted ("******") in
        # the source; the loop below is the skeleton recoverable from context.
        q = "SELECT from_user,profile_image_url FROM tweets WHERE from_user="
        vals = {}
        count = 0
        for celeb in celebs:
            vals["token" + str(count)] = celeb
            q = q + "%(token" + str(count) + ")s OR from_user="
            count += 1
        q = q[:-len(" OR from_user=")] + " GROUP BY from_user"
        q_results = self.sql.q(q, vals)
        celeb_match_pers_array = []
        for res in q_results:
            celeb_match_pers_array.append([res[0], res[1]])
        results['celeb_matches_pers'] = celeb_match_pers_array
    # GET USER TFIDF
    user_tfidf = self.GetUserTFIDFs(user_data)
    user_scores = user_tfidf['scores_dic']
    debuglog.msg("top user terms are", user_tfidf['scores_list'][:15])
    # GET CELEBS TFIDF
    celeb_scores = self.GetCelebTFIDFsForTerms(
        [term[0] for term in user_tfidf['scores_list']][:15])
    # CALCULATE MATCH SCORES
    cumulative_celeb_scores = {}
    celeb_words = {}
    for entry in celeb_scores:
        celeb = entry[0]
        if celeb.lower() == user.lower():
            continue
        token = unidecode.unidecode(entry[1])
        score = float(entry[2])
        if celeb in cumulative_celeb_scores:
            celeb_words[celeb][token] = score
            cumulative_celeb_scores[celeb] += user_scores[token] * score
        else:
            celeb_words[celeb] = {token: score}
            cumulative_celeb_scores[celeb] = user_scores[token] * score
    matches = [(celeb, cumulative_celeb_scores[celeb], celeb_words[celeb])
               for celeb in cumulative_celeb_scores]
    matches.sort(key=lambda x: -cumulative_celeb_scores[x[0]])
    # FIND MATCHING TWEETS FOR TOP 10 CELEBS
    for top_10_celeb_index in range(min(10, len(matches))):
        celeb_match = {'screen_name': matches[top_10_celeb_index][0],
                       'name': '',
                       'pic_url': '',
                       'match_score': cumulative_celeb_scores[matches[top_10_celeb_index][0]],
                       'top_words': {},
                       'tweets': []}
        # q = "SELECT text, id, from_user_name, profile_image_url FROM tweets
        #      WHERE from_user=%(celeb)s AND MATCH(text) AGAINST(%(tokens)s)"
        q = ("SELECT text, id, from_user_name, profile_image_url FROM tweets, "
             "(SELECT tweet_id FROM token_user_mapping WHERE user=%(celeb)s AND token IN (")
        vals = {'celeb': matches[top_10_celeb_index][0]}
        count = 0
        for token in list(matches[top_10_celeb_index][2].keys()):
            # TODO Clean up stopword filtering
            if token.lower() not in self.stopwords:
                vals['token' + str(count)] = token
                q += '%(token' + str(count) + ')s, '
                count += 1
        # trim the last comma and space.
        if count:
            q = q[:len(q) - 2]
        q += ")) as t WHERE tweets.id=t.tweet_id;"
        q_results = self.sql.q(q, vals)
        # skip if we don't have any matching celeb tweets.
        if not q_results or not len(q_results):
            continue
        celeb_match['name'] = q_results[0][2]
        celeb_match['pic_url'] = q_results[0][3]
        matching_celeb_tweets = [{'text': result[0], 'id': result[1]}
                                 for result in q_results]
        matches[top_10_celeb_index] = list(matches[top_10_celeb_index])
        # ADD TWEETS THAT MATCH ON TOKENS
        sorted_tokens = sorted(matches[top_10_celeb_index][2].keys(),
                               key=lambda x: -matches[top_10_celeb_index][2][x])
        # TODO Clean up stopword filtering
        for token in filter(lambda x: x not in self.stopwords, sorted_tokens):
            celeb_tweets_for_token = list(filter(
                lambda x: x['text'].lower().count(token.lower()) > 0,
                matching_celeb_tweets))
            user_tweets_for_token = [
                user_tfidf['tweets'][user_tfidf['token_mapping'][token][k]]
                for k in range(len(user_tfidf['token_mapping'][token]))]
            if len(celeb_tweets_for_token) or len(user_tweets_for_token):
                celeb_match['top_words'][token] = matches[top_10_celeb_index][2][token]
                for matching_tweets_for_token_index in range(
                        min(len(celeb_tweets_for_token), len(user_tweets_for_token))):
                    celeb_match['tweets'].append({
                        'word': token,
                        'user_tweet': {
                            'url': 'http://twitter.com/'
                                   + user_tweets_for_token[matching_tweets_for_token_index]['user']['screen_name']
                                   + '/status/'
                                   + str(user_tweets_for_token[matching_tweets_for_token_index]['id']),
                            'text': user_tweets_for_token[matching_tweets_for_token_index]['text']},
                        'celeb_tweet': {
                            'url': 'http://twitter.com/' + celeb_match['screen_name']
                                   + '/status/'
                                   + str(celeb_tweets_for_token[matching_tweets_for_token_index]['id']),
                            'text': celeb_tweets_for_token[matching_tweets_for_token_index]['text']}})
        if len(celeb_match['tweets']):
            results['celeb_matches'].append(celeb_match)
    results['status'] = 'ok'
    results['permalink_id'] = self.StorePermalink(results)
    return results
def Generate(self):
    self.GenerateDocFreqsTable()
    celeb_count = self.GetCelebCount()
    tokens = self.GetTokens()
    q = """SELECT token_counts.user, token_counts.c,
                  COUNT(token_user_mapping.user) as total_user_tokens
           FROM (SELECT t.user, t.token, t.c
                 FROM (SELECT user, token, COUNT(*) as c
                       FROM token_user_mapping
                       WHERE token=%(token)s
                       GROUP BY user) as t
                 ORDER BY t.user) as token_counts,
                token_user_mapping
           WHERE token_user_mapping.user = token_counts.user
           GROUP BY token_user_mapping.user"""
    # ITERATE THROUGH TOKENS
    for token in tokens:
        if len(token) < 3:
            debuglog.msg("token %s too short." % token)
            continue
        elif token[0] == '@':
            debuglog.msg("ignoring user token %s" % token)
            continue
        debuglog.msg("Generating tfidf table for token <%s>" % token)
        vals = {'token': token}
        results = self.sql.q(q, vals)
        if results is None:
            continue
        # CALCULATE SCORES
        celebs_with_term = len(results)
        celeb_scores = {}
        for result in results:
            celeb = result[0]
            term_count_for_celeb = result[1]
            total_tokens_for_celeb = result[2]
            celeb_tfidf_for_term = float(
                (Decimal(term_count_for_celeb) / Decimal(total_tokens_for_celeb))
                * (Decimal(celeb_count) / Decimal(celebs_with_term)))
            celeb_scores[celeb] = (celeb_tfidf_for_term, term_count_for_celeb)
        # GENERATE QUERY
        insert_q = "INSERT INTO celeb_tfidf_all (user, token, score, count) VALUES"
        count = 0
        vals = {'token': token}
        for celeb in celeb_scores:
            vals['celeb' + str(count)] = celeb
            vals['score' + str(count)] = str(celeb_scores[celeb][0])
            vals['count' + str(count)] = str(celeb_scores[celeb][1])
            insert_q += ("(%(celeb" + str(count) + ")s, %(token)s, %(score"
                         + str(count) + ")s, %(count" + str(count) + ")s),")
            count += 1
        if len(vals.keys()) >= 4:
            # Remove the last comma and add a rule for duplicate keys.
            insert_q = insert_q[:len(insert_q) - 1] + \
                " ON DUPLICATE KEY UPDATE score=VALUES(score), count=VALUES(count);"
            # EXECUTE QUERY
            self.sql.q(insert_q, vals)
    # move high-scoring words over to celeb_tfidf
    q = ("INSERT INTO celeb_tfidf (SELECT * FROM celeb_tfidf_all WHERE score > 0.005) "
         "ON DUPLICATE KEY UPDATE score=VALUES(score), count=VALUES(count)")
    self.sql.q(q)
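# The score above is tf * idf with no log damping: tf = term_count_for_celeb /
# total_tokens_for_celeb and idf = celeb_count / celebs_with_term. A worked
# instance with made-up numbers: a celeb who used a token 5 times out of 2000
# total tokens, in a corpus of 300 celebs where 60 used the token, scores
#   (5 / 2000) * (300 / 60) = 0.0025 * 5 = 0.0125,
# which clears the 0.005 cutoff used to promote rows into celeb_tfidf.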