# Builds a dictionary of signed relative word frequencies: positive values mean
# a word is relatively more frequent in "yes" speeches, negative values in "no"
# speeches. Requires `from collections import Counter` and the project's
# sanitizeString helper at module level.
def __init__(self, votes):
    yesWordsDic = {}
    noWordsDic = {}
    for name, group in votes.groupby('Vote'):
        speeches = group['Discurso']
        concatSpeeches = ''
        for speech in speeches:
            concatSpeeches += ' ' + sanitizeString(str(speech))
        nWord = len(concatSpeeches.split())
        mostCommonWords = Counter(concatSpeeches.split()).most_common()
        for word, count in mostCommonWords:
            # The groupby key is the boolean Vote value ("yes"/"no").
            if name:
                yesWordsDic[word] = count / nWord
            else:
                noWordsDic[word] = count / nWord
    # Combine the two dictionaries: words frequent in "no" speeches pull the
    # value down; words that appear only in "no" speeches get a negative entry.
    self.wordRelativeFreqDic = yesWordsDic.copy()
    for word, freq in noWordsDic.items():
        if word in self.wordRelativeFreqDic:
            self.wordRelativeFreqDic[word] -= freq
        else:
            self.wordRelativeFreqDic[word] = -freq
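# Hypothetical usage sketch (not part of the original class): given the signed
# relative frequencies built above, a speech could be scored by summing the
# values of its words; a positive total suggests vocabulary closer to "yes"
# speeches, a negative one closer to "no" speeches. The names scoreSpeech and
# model are placeholders of my own.
def scoreSpeech(model, text):
    words = sanitizeString(str(text)).split()
    # Words never seen during construction contribute nothing to the score.
    return sum(model.wordRelativeFreqDic.get(word, 0.0) for word in words)

# scoreSpeech(model, 'Sr. Presidente, o meu voto e sim')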
def getNameList():
    # Collects deputies' names from the file at NameListPath, skipping short
    # tokens and common Portuguese words that also occur inside proper names.
    names = set()
    with open(NameListPath, 'r') as file:
        for line in file:
            lineNames = [sanitizeString(name) for name in line.strip().split()]
            for name in lineNames:
                if len(name) > 2 and name not in [
                        'ainda', 'apelido', 'bem', 'boa', 'boas', 'brasil',
                        'brasileiro', 'casado', 'das', 'dias', 'dos', 'espirito',
                        'filho', 'filha', 'guerra', 'ida', 'inocencia', 'lia',
                        'luz', 'nobre', 'pai', 'paz', 'rio', 'sao']:
                    names.add(name)
    return list(names)

# names = getNameList()
# print(len(names))
# for name in sorted(names):
#     print(name)
def calculateLexicalDiversity(text):
    # Ratio of distinct words to total words in the sanitized text.
    textSplit = sanitizeString(text).split()
    return len(set(textSplit)) / len(textSplit)
def calculateAverageWordSize(text):
    # Mean word length of the sanitized text; `average` is assumed to come from
    # numpy (e.g. `from numpy import average`), since it is not defined here.
    textSplit = sanitizeString(text).split()
    return average([len(word) for word in textSplit])
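# Quick illustrative check of the two metrics above (the sample sentence is my
# own, not from the vote data); the values assume sanitizeString leaves this
# already-lowercase, accent-free text unchanged.
# calculateLexicalDiversity('o voto e sim e o voto e nao')  # 5 distinct / 9 words ~= 0.56
# calculateAverageWordSize('o voto e sim e o voto e nao')   # 19 characters / 9 words ~= 2.1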
def getWordPresence(df, word):
    return df['Discurso'].map(lambda d: word in sanitizeString(str(d)))
def checkProperNamePresence(text, namesList):
    wordPresence = [word in namesList for word in sanitizeString(text).split()]
    return True in wordPresence
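# Possible way to combine getNameList() and checkProperNamePresence() (a sketch,
# not from the original code): flag every speech in the votes DataFrame that
# mentions a deputy's name. The column name 'MentionsName' is a placeholder.
# namesList = getNameList()
# votes['MentionsName'] = votes['Discurso'].map(
#     lambda d: checkProperNamePresence(str(d), namesList))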
def getSpeechSize(df):
    return df['Discurso'].map(lambda d: float(len(sanitizeString(str(d)))))
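# Sketch of how the two DataFrame helpers above might be turned into feature
# columns (assuming a 'votes' DataFrame as returned by getVoteData(); the
# column names and the word 'brasil' are my own examples):
# votes['MentionsBrasil'] = getWordPresence(votes, 'brasil')
# votes['SpeechLength'] = getSpeechSize(votes)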
from collections import Counter

from DataGathering.getVoteData import getVoteData
from DataGathering.sanitizeString import sanitizeString
# https://github.com/amueller/word_cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

votes = getVoteData()
for name, group in votes.groupby('Vote'):
    print(name)
    speeches = group['Discurso']
    concatSpeeches = ''
    for speech in speeches:
        concatSpeeches += ' ' + sanitizeString(str(speech))
    nWord = len(concatSpeeches.split())
    mostCommonWords = Counter(concatSpeeches.split()).most_common()
    # Print the 20 most common words with their relative frequencies.
    for i in range(0, 20):
        print(mostCommonWords[i][0], mostCommonWords[i][1] / nWord)
    # Word cloud per vote group, dropping the literal 'nao'/'sim' tokens.
    wordcloud = WordCloud(max_font_size=40, relative_scaling=.5,
                          background_color='white', max_words=50).generate(
        concatSpeeches.replace('nao', '').replace('sim', ''))
    plt.figure()
    plt.imshow(wordcloud)
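# Minimal self-contained word-cloud check (the sample text below is my own, not
# from the vote data), handy for verifying the wordcloud/matplotlib setup
# outside the full pipeline. plt.axis('off') and plt.show() are assumed extra
# steps, not part of the original script.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

sampleText = 'voto sim presidente projeto emenda relator votacao ordem sim nao'
cloud = WordCloud(max_font_size=40, relative_scaling=.5,
                  background_color='white', max_words=50).generate(sampleText)
plt.figure()
plt.imshow(cloud)
plt.axis('off')
plt.show()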