# NOTE(review): this top-level __init__ sits outside any class and is a
# byte-for-byte duplicate of TweetAnalyser.__init__ below — presumably a
# copy/paste leftover. It is never callable as written; confirm and delete.
def __init__(self):
    self.nlp = NLP()              # NLP helper (project type) — tagging / language detection
    self.txt_utils = TextUtils()  # tokenizer helper (project type)
    self.usr_words = {}  # to temporary store users with tagged words
    self.users = {}      # user id -> screen name
    self.result = None   # last analysis result, filled by analyse()
class TweetAnalyser(BaseAnalyser):
    """Cluster Twitter users by the nouns they share in their tweets.

    Workflow: feed users via add_users(), then call analyse() to build
    (words, screennames) clusters of users whose vocabularies overlap.
    Progress is reported through BaseAnalyser.notifyAll().
    """

    def __init__(self):
        self.nlp = NLP()              # language detection + POS tagging (project helper)
        self.txt_utils = TextUtils()  # tokenizer (project helper)
        self.usr_words = {}           # user id -> list of (word, tag) tuples (temporary store)
        self.users = {}               # user id -> screen name
        self.result = None            # sorted clusters from the last analyse() run

    def add_users(self, users):
        """Register users and cache their tagged nouns.

        users: iterable of user objects exposing .id, .screen_name and .tweets
        Users without tweets are skipped.
        """
        for u in users:
            # store tagged tweets of the current user
            if u.tweets:
                self.users[u.id] = u.screen_name
                self.usr_words[u.id] = self.get_tagged_words(u.tweets, ['N'])

    def get_tagged_words(self, tweets, tags):
        """
        Return a list of all the tagged words of tweets

        tweets: list of TwitterMessage objects [TwitterMessage, TwitterMessage]
        tags  : list with tag that must be filtered on ['N', 'V']
        """
        words = []
        for tweet in tweets:
            tagged_tweet = self.tag_tweet(tweet)
            # tag_tweet returns None when the language cannot be detected
            if tagged_tweet:
                words.extend(tw for tw in tagged_tweet if tw[1] in tags)
        return words

    def tag_tweet(self, tweet):
        """POS-tag a single tweet, or return None if its language is unknown."""
        # Hoisted: the original called detect_language twice per tweet.
        language = self.nlp.detect_language(tweet.text)
        if language:
            return self.nlp.tag(self.txt_utils.tokenize(tweet.text), language)
        return None

    def analyse(self):
        """Build word/user clusters and return them sorted by user count (desc).

        Also stores the result in self.result. Emits progress messages via
        notifyAll() along the way.
        """
        word_count = {}  # word -> total usage count over all users
        word_users = {}  # word -> list of tuples (usr, timesused)
        data = {}

        # Build word_count and word_users
        data['message'] = "Building word_count and word_users"
        self.notifyAll(data)
        for u_id in self.usr_words:
            word_fd = nltk.FreqDist(word for (word, tag) in self.usr_words[u_id])
            for word in word_fd:
                # (was dict.has_key — removed in Python 3)
                if word in word_count:
                    word_count[word] += word_fd[word]
                    word_users[word].append((u_id, word_fd[word]))
                else:
                    word_count[word] = word_fd[word]
                    word_users[word] = [(u_id, word_fd[word])]

        data['message'] = "Filtering word_count and word_users on the times a word is used by a user in comparison with the other users of this word."
        self.notifyAll(data)
        # Filter the users of a word on the times a word is used:
        # users far below the average usage do not count toward the total.
        for word in word_count:
            avg_usg = word_count[word] / float(len(word_users[word]))
            lower_limit = avg_usg - 0.25 * avg_usg
            for user in word_users[word]:
                if user[1] < lower_limit:
                    word_count[word] -= user[1]

        clusters = []
        # Materialize: a Python 3 dict view cannot be indexed with words[i].
        words = list(word_count)
        data['message'] = "Comparing word users and clusters words and users if the group of users of both word match enough with each other."
        self.notifyAll(data)
        # Compare word_users and, if they are similar, combine into a cluster.
        for i in range(len(words)):
            cluster_words = [words[i]]
            cluster_users = []
            users_a = [user[0] for user in word_users[words[i]]]
            cluster_users.extend(users_a)
            # Now compare the users of word[i] with the users of all other words
            for j in range(i + 1, len(words)):
                users_b = [user[0] for user in word_users[words[j]]]
                intersect_len = len(set(users_a).intersection(set(users_b)))
                # Check if user groups of two words are very similar:
                # require a 75% overlap relative to BOTH groups.
                if intersect_len < 0.75 * len(users_a) or intersect_len < 0.75 * len(users_b):
                    continue
                # They are very similar
                cluster_words.append(words[j])
                cluster_users = set(cluster_users).union(users_b)
            # We don't want clusters with one word, and especially not clusters
            # with one user.
            if len(cluster_users) == 1 or len(cluster_words) == 1:
                continue
            # Check that the cluster is not a subcluster (of words) of a
            # previously found cluster; if it is, skip it.
            in_previous_cluster = False
            for cluster in clusters:
                if len(set(cluster_words)) == len(set(cluster[0]).intersection(set(cluster_words))):
                    in_previous_cluster = True
                    break
            if in_previous_cluster:
                continue
            # Everything ok, so replace user ids with screennames in the result.
            screennames = ["@" + self.users[user] for user in cluster_users]
            # Save found cluster
            data['message'] = self.cluster_to_string((cluster_words, screennames))
            self.notifyAll(data)
            clusters.append((cluster_words, screennames))

        # Sort by users per cluster, most users first.
        # (was sorted(..., cmp=...) — the cmp parameter was removed in Python 3;
        # key=len gives the same ordering and is stable on ties, like the original)
        sorted_clusters = sorted(clusters, key=lambda c: len(c[1]), reverse=True)
        self.result = sorted_clusters
        return sorted_clusters

    def cluster_to_string(self, cluster):
        """Render one (words, usernames) cluster as two space-prefixed lines."""
        words_line = "".join(" " + word for word in cluster[0])
        users_line = "".join(" " + str(username) for username in cluster[1])
        return words_line + "\n" + users_line + "\n"

    def result_to_string(self):
        """Returns a printable version of the results"""
        return "".join(self.cluster_to_string(cluster) + "\n" for cluster in self.result)