Example #1
0
 def _getLexicalStats(messages):
     """Compute token count, vocabulary size and lexical richness of messages.

     Lexical richness is vocabulary/tokens, defined as 0 for empty input.
     """
     tokens = statsUtil.getWords(messages)
     numTokens = len(tokens)
     numDistinct = len(set(tokens))
     richness = numDistinct / numTokens if numTokens else 0
     return numTokens, numDistinct, richness
 def _getLexicalStats(messages):
     """Return (tokensCount, vocabularyCount, lexicalRichness) for messages.

     lexicalRichness is the vocabulary/token ratio, or 0 when there are
     no tokens at all.
     """
     words = statsUtil.getWords(messages)
     tokensCount = len(words)
     vocabularyCount = len(set(words))
     if tokensCount == 0:
         return tokensCount, vocabularyCount, 0
     return tokensCount, vocabularyCount, vocabularyCount / tokensCount
Example #3
0
    def _generateLexicalStatsBy(self, groupByColumns=None):
        """Compute per-sender token count, vocabulary size and lexical richness.

        Messages in ``self.df`` are grouped by sender plus any extra
        *groupByColumns*; an aggregated "total" sender row is appended.

        :param groupByColumns: optional list of extra columns to group by
        :return: DataFrame with tokensCount, vocabularyCount, lexicalRichness
        """
        # None default avoids the shared mutable-default-argument pitfall.
        groupByColumns = list(groupByColumns) if groupByColumns else []

        # Concatenate each group's messages and tokenize them once.
        # (groupby/agg already returns a new frame, so self.df is untouched.)
        res = self.df.groupby(['sender'] + groupByColumns, as_index=False).agg(
            {'text': lambda x: tuple(statsUtil.getWords(" ".join(x)))})
        res['tokensCount'] = res['text'].apply(len)
        res['vocabularyCount'] = res['text'].apply(lambda x: len(set(x)))

        res.drop('text', axis=1, inplace=True)

        if groupByColumns:
            tot = res.groupby(groupByColumns, as_index=False).sum()
            tot['sender'] = "total"
            res = pd.concat([res, tot])
            # 0/0 yields NaN; define richness of an empty group as 0,
            # consistent with _getLexicalStats.
            res['lexicalRichness'] = (res['vocabularyCount']
                                      / res['tokensCount']).fillna(0)
            return res
        else:
            res.set_index(['sender'], inplace=True)
            res.loc['total'] = res.sum()
            res['lexicalRichness'] = (res['vocabularyCount']
                                      / res['tokensCount']).fillna(0)
            return res[['tokensCount', 'vocabularyCount', 'lexicalRichness']]
    def _computeWordsCount(msgs, groupByColumns):
        """
        Generates dataframe with words count for each group-by entry.
        Grouping is done on the passed columns only (callers are expected
        to include the sender column in *groupByColumns*).

        :param msgs: DataFrame with a 'text' column holding message bodies
        :param groupByColumns: columns to group messages by
        :return: DataFrame indexed by the group keys, one count column per word
        """

        # Group messages by the requested columns, concatenating text field
        grouped_msgs = msgs.groupby(groupByColumns).agg(
            {'text': lambda x: " ".join(x)})

        # Count-vectorize msgs with our own tokenizer so word splitting
        # matches the rest of the statistics
        vectorizer = CountVectorizer(analyzer=statsUtil.getWords)
        X = vectorizer.fit_transform(grouped_msgs['text'].values)

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement accessor while keeping older versions working
        if hasattr(vectorizer, 'get_feature_names_out'):
            featureNames = vectorizer.get_feature_names_out()
        else:
            featureNames = vectorizer.get_feature_names()

        # Create count matrix using words as columns, aligned on group index
        countMatrix = pd.DataFrame(X.toarray(),
                                   index=grouped_msgs.index,
                                   columns=featureNames)

        # Join data while dropping text column
        wordsCount = grouped_msgs.drop('text', axis=1).join(countMatrix)

        return wordsCount