import nltk

# NLP, TextUtils and BaseAnalyser are project-specific helpers; their
# imports were not included in the original snippet.

class TweetAnalyser(BaseAnalyser):

    def __init__(self):
        self.nlp = NLP()
        self.txt_utils = TextUtils()
        self.usr_words = {}     # temporarily stores each user's tagged words
        self.users = {}
        self.result = None

    def add_users(self, users):
        for u in users:
            # store the tagged words of each user that has tweets (tag 'N' only)
            if u.tweets:
                self.users[u.id] = u.screen_name
                self.usr_words[u.id] = self.get_tagged_words(u.tweets, ['N'])

    def get_tagged_words(self, tweets, tags):
        """ Return a list of all the tagged words of tweets
            tweets:     list of TwitterMessage objects
                        [TwitterMessage, TwitterMessage]
            tags :      list with tag that must be filtered on
                        ['N', 'V']
        """
        words = []
        for tweet in tweets:
            tagged_tweet = self.tag_tweet(tweet)
            if tagged_tweet:
                for tagged_word in tagged_tweet:
                    if tagged_word[1] in tags:
                        words.append(tagged_word)
        return words

    def tag_tweet(self, tweet):
        # detect the language once and reuse the result for tagging
        language = self.nlp.detect_language(tweet.text)
        if language:
            return self.nlp.tag(self.txt_utils.tokenize(tweet.text), language)
        return None
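
    # Illustration (hypothetical tagger output, not from the original source):
    # tag_tweet is expected to return a list of (word, tag) tuples such as
    # [('coffee', 'N'), ('drink', 'V'), ('hot', 'A')].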

    def analyse(self):
        word_count = {}
        word_users = {} # maps each word to a list of (user_id, times_used) tuples

        data = {}

        # Build word_count and word_users
        data['message'] = "Building word_count and word_users"
        self.notifyAll(data)
        for u_id in self.usr_words:
            word_fd = nltk.FreqDist(word for (word, tag) in self.usr_words[u_id])
            for word in word_fd:
                if word in word_count:
                    word_count[word] += word_fd[word]
                    word_users[word].append((u_id, word_fd[word]))
                else:
                    word_count[word] = word_fd[word]
                    word_users[word] = [(u_id, word_fd[word])]

        data['message'] = "Filtering word_count and word_users on the times a word is used by a user in comparison with the other users of this word."
        self.notifyAll(data)
        # Subtract a user's contribution to a word's count when that user
        # uses the word well below (less than 75% of) the average usage
        for word in word_count:
            avg_usg = word_count[word] / len(word_users[word])
            lower_limit = 0.75 * avg_usg
            for user in word_users[word]:
                if user[1] < lower_limit:
                    word_count[word] -= user[1]
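
        # Worked example (hypothetical numbers): a word used 8 times in total
        # by 2 users gives avg_usg = 4.0 and lower_limit = 3.0, so a user who
        # used it only twice has those 2 uses subtracted from word_count.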


        clusters = []
        words = list(word_count)    # a list, so the words can be indexed below
        data['message'] = "Comparing word users and clusters words and users if the group of users of both word match enough with each other."
        self.notifyAll(data)
        # Compare the user groups of every pair of words and combine similar ones into a cluster
        for i in range(len(words)):
            cluster_words = [words[i]]
            cluster_users = []
            users_a = [user[0] for user in word_users[words[i]]]
            cluster_users.extend(users_a)

            # Now compare the users of word[i] with the users of all other words
            for j in range((i+1), len(words)):
                users_b = [user[0] for user in word_users[words[j]]]
                intersect_len = len(set(users_a).intersection(set(users_b)))

                # The user groups of two words count as similar only when the
                # intersection covers at least 75% of both groups
                if intersect_len < 0.75 * len(users_a) or intersect_len < 0.75 * len(users_b):
                    continue
                # The groups are similar enough, so merge the word and its users
                cluster_words.append(words[j])
                cluster_users = set(cluster_users).union(users_b)
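
                # E.g. (hypothetical): if users_a and users_b each contain 4
                # users and share 3 of them, 3 >= 0.75 * 4 holds for both
                # groups, so the two words join the same cluster.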

            # We don't want clusters with one word, and especially not clusters with one user.
            if len(cluster_users) == 1 or len(cluster_words) == 1:
                continue

            # Skip this cluster if its words are a subset of a previously
            # found cluster (i.e. it is a subcluster)
            in_previous_cluster = False
            for cluster in clusters:
                if set(cluster_words) <= set(cluster[0]):
                    in_previous_cluster = True
                    break
            if in_previous_cluster:
                continue
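
            # E.g. (hypothetical): a new cluster with words ['python', 'code']
            # is skipped when an earlier cluster already contains
            # ['python', 'code', 'coffee'].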

            # Everything is ok, so replace the user ids with screen names in the result
            screennames = ["@" + self.users[user] for user in cluster_users]

            # Save found cluster
            data['message'] = self.cluster_to_string((cluster_words, screennames))
            self.notifyAll(data)
            clusters.append((cluster_words, screennames))

        # Sort the clusters by number of users per cluster, largest first
        sorted_clusters = sorted(clusters, key=lambda c: len(c[1]), reverse=True)
        self.result = sorted_clusters
        return sorted_clusters

    def cluster_to_string(self, cluster):
        r = ""
        for word in cluster[0]:
            r += " " + word
        r += "\n"

        for username in cluster[1]:
            r += " " + str(username)
        r += "\n"
        return r

    def result_to_string(self):
        """Returns a printable version of the results"""
        r = ""
        for cluster in self.result:
            r += self.cluster_to_string(cluster) + "\n"
        return r
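
# A minimal usage sketch, not part of the original source: User and
# TwitterMessage are hypothetical stand-ins for the project's models, and
# running it still requires the project's NLP, TextUtils and BaseAnalyser
# classes to be available.
if __name__ == '__main__':
    from collections import namedtuple

    TwitterMessage = namedtuple('TwitterMessage', ['text'])
    User = namedtuple('User', ['id', 'screen_name', 'tweets'])

    users = [
        User(1, 'alice', [TwitterMessage('Drinking coffee and writing python')]),
        User(2, 'bob', [TwitterMessage('More python and more coffee today')]),
    ]

    analyser = TweetAnalyser()
    analyser.add_users(users)
    analyser.analyse()
    print(analyser.result_to_string())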