예제 #1
0
    def __init__(self,votes):
        """Build ``self.wordRelativeFreqDic`` from the speeches in ``votes``.

        ``votes`` (presumably a pandas DataFrame -- confirm against callers)
        is grouped by its ``'Vote'`` column. For every group, each
        ``'Discurso'`` entry is stringified, passed through ``sanitizeString``
        and split on whitespace. A word's relative frequency is its count
        divided by the group's total word count.

        Groups whose key compares equal to ``True`` fill the "yes" table;
        every other group fills the "no" table (if several such groups share
        a word, the last one processed wins).

        ``self.wordRelativeFreqDic`` maps each word to its "yes" frequency
        minus its "no" frequency; a word missing from one side contributes
        nothing for that side.
        """
        yesFreq = {}
        noFreq = {}

        for voteValue, rows in votes.groupby('Vote'):
            # Splitting each sanitized speech on its own yields the same
            # token sequence as splitting one space-joined string.
            tokens = []
            for speech in rows['Discurso']:
                tokens.extend(sanitizeString(str(speech)).split())

            total = len(tokens)
            # Equality with True (not truthiness) decides the table.
            target = yesFreq if voteValue == True else noFreq
            for word, count in Counter(tokens).most_common():
                target[word] = count / total

        combined = yesFreq.copy()
        self.wordRelativeFreqDic = combined

        for word, freq in noFreq.items():
            combined[word] = combined[word] - freq if word in combined else -freq
def getNameList():
    """Return the distinct sanitized names read from the file at ``NameListPath``.

    Every line is stripped and split on whitespace, and each token is passed
    through ``sanitizeString``. Tokens of two characters or fewer, and tokens
    found in the exclusion set below, are dropped.

    Returns:
        list: the unique remaining names, in no particular order (they are
        collected in a set before being converted to a list).
    """
    # Sanitized tokens that must never be reported as names.
    # NOTE(review): these look like ordinary words that also occur as name
    # parts -- confirm the intent with the data owner.
    excluded = frozenset((
        'ainda','apelido','bem','boa','boas','brasil','brasileiro','casado',
        'das','dias','dos','espirito','filho','filha','guerra','ida',
        'inocencia','lia','luz','nobre','pai','paz','rio','sao',
    ))

    names = set()
    # The context manager closes the file even if processing a line raises.
    with open(NameListPath,'r') as nameFile:
        for line in nameFile:
            lineNames = [sanitizeString(name) for name in line.strip().split()]

            for name in lineNames:
                if len(name)>2 and name not in excluded:
                    names.add(name)

    return list(names)

# names = getNameList()
# print(len(names))
# for name in sorted(names):
#     print(name)
def getNameList():
    """Return the distinct sanitized names read from the file at ``NameListPath``.

    Every line is stripped and split on whitespace, and each token is passed
    through ``sanitizeString``. Tokens of two characters or fewer, and tokens
    found in the exclusion set below, are dropped.

    Returns:
        list: the unique remaining names, in no particular order (they are
        collected in a set before being converted to a list).
    """
    # Sanitized tokens that must never be reported as names; built once per
    # call instead of once per candidate name.
    # NOTE(review): these look like ordinary words that also occur as name
    # parts -- confirm the intent with the data owner.
    excluded = frozenset((
        'ainda', 'apelido', 'bem', 'boa', 'boas', 'brasil',
        'brasileiro', 'casado', 'das', 'dias', 'dos', 'espirito',
        'filho', 'filha', 'guerra', 'ida', 'inocencia', 'lia',
        'luz', 'nobre', 'pai', 'paz', 'rio', 'sao',
    ))

    names = set()
    # The context manager closes the file even if processing a line raises.
    with open(NameListPath, 'r') as nameFile:
        for line in nameFile:
            lineNames = [sanitizeString(name) for name in line.strip().split()]

            for name in lineNames:
                if len(name) > 2 and name not in excluded:
                    names.add(name)

    return list(names)


# names = getNameList()
# print(len(names))
# for name in sorted(names):
#     print(name)
def calculateLexicalDiversity(text):
    """Return the ratio of distinct words to total words in ``text``.

    The text is passed through ``sanitizeString`` and split on whitespace.

    Returns:
        The number of distinct words divided by the total number of words,
        or ``0.0`` when the sanitized text contains no words (instead of
        raising ``ZeroDivisionError``).
    """
    words = sanitizeString(text).split()
    if not words:
        # No words at all: report zero diversity rather than dividing by 0.
        return 0.0

    return len(set(words))/len(words)
예제 #5
0
def calculateAverageWordSize(text):
    """Return the average word length of ``text``.

    The text is sanitized with ``sanitizeString`` and split on whitespace;
    the list of per-word character counts is handed to ``average``.
    """
    wordLengths = []
    for token in sanitizeString(text).split():
        wordLengths.append(len(token))

    return average(wordLengths)
예제 #6
0
def getWordPresence(df, word):
    """Flag, for each entry of ``df['Discurso']``, whether it contains ``word``.

    Each entry is stringified and sanitized with ``sanitizeString``; the
    check is ``word in <sanitized text>``. Assuming ``sanitizeString``
    returns a string, that is a substring test on the whole speech rather
    than a comparison against individual tokens.

    Returns:
        The result of ``df['Discurso'].map(...)``: one flag per entry.
    """
    def containsWord(speech):
        return word in sanitizeString(str(speech))

    return df['Discurso'].map(containsWord)
예제 #7
0
def checkProperNamePresence(text, namesList):
    """Return True if any word of ``text`` is contained in ``namesList``.

    Args:
        text: the text to inspect; it is sanitized with ``sanitizeString``
            and split on whitespace.
        namesList: container of names; each word is tested with ``in``.

    Returns:
        bool: True if at least one word is found in ``namesList``, False
        otherwise (including when the sanitized text has no words).
    """
    # any() stops at the first hit instead of building a full list of flags.
    return any(word in namesList for word in sanitizeString(text).split())
def getSpeechSize(df):
    """Map each entry of ``df['Discurso']`` to the length of its sanitized text.

    Each entry is stringified and run through ``sanitizeString``; the length
    of the result is returned as a float.
    """
    def sanitizedLength(speech):
        cleaned = sanitizeString(str(speech))
        return float(len(cleaned))

    return df['Discurso'].map(sanitizedLength)
예제 #9
0
from DataGathering.getVoteData import getVoteData
from DataGathering.sanitizeString import sanitizeString

# https://github.com/amueller/word_cloud
from wordcloud import WordCloud

import matplotlib.pyplot as plt

# Counter is used below but is not among the imports at the top of this script.
from collections import Counter

votes = getVoteData()

# For each vote value: print the (up to) 20 most frequent words with their
# relative frequency, then draw a word cloud of that group's speeches.
for name, group in votes.groupby('Vote'):
    print(name)
    speeches = group['Discurso']
    # Same text as repeated `+= ' ' + ...`, built in a single pass.
    concatSpeeches = ''.join(
        ' ' + sanitizeString(str(speech)) for speech in speeches)

    words = concatSpeeches.split()
    nWord = len(words)
    mostCommonWords = Counter(words).most_common()
    # Slice instead of indexing 0..19 so a group with fewer than 20 distinct
    # words no longer raises IndexError.
    for word, count in mostCommonWords[:20]:
        print(word, count / (nWord))

    # NOTE(review): replace() removes 'nao' and 'sim' as substrings, so longer
    # words containing them are altered too -- confirm this is intended.
    cloudText = concatSpeeches.replace('nao', '').replace('sim', '')
    wordcloud = WordCloud(max_font_size=40,
                          relative_scaling=.5,
                          background_color='white',
                          max_words=50).generate(cloudText)

    plt.figure()
    plt.imshow(wordcloud)
def getSpeechSize(df):
    """Return, for each entry of ``df['Discurso']``, its sanitized length.

    Every entry is converted with ``str``, cleaned by ``sanitizeString`` and
    measured with ``len``; the count is returned as a float.
    """
    def measure(speech):
        sanitized = sanitizeString(str(speech))
        return float(len(sanitized))

    return df['Discurso'].map(measure)
def checkProperNamePresence(text,namesList):
    """Return True if any word of ``text`` is contained in ``namesList``.

    Args:
        text: the text to inspect; it is sanitized with ``sanitizeString``
            and split on whitespace.
        namesList: container of names; each word is tested with ``in``.

    Returns:
        bool: True if at least one word is found in ``namesList``, False
        otherwise (including when the sanitized text has no words).
    """
    # any() stops at the first hit instead of building a full list of flags.
    return any(word in namesList for word in sanitizeString(text).split())