Example no. 1
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        # TODO: prepare the data
        data = []

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
        for s in train_data:
            data.append(("<s>", s[0][1]))
            for i in range(len(s) - 1):
                data.append((s[i][1], s[i + 1][1]))
            data.append((s[len(s) - 1][1], "</s>"))

        # TODO compute the transition model
        transition_FD = ConditionalFreqDist(data)
        estimator = lambda f: nltk.LidstoneProbDist(f, 0.01, f.B() + 1)
        self.transition_PD = ConditionalProbDist(transition_FD, estimator)

        return self.transition_PD
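
A quick usage sketch (hypothetical, not part of the original): once transition_model() has run, each condition of the returned ConditionalProbDist can be queried directly, and Lidstone smoothing guarantees that unseen transitions keep nonzero mass.

    # Hypothetical query from inside the class; tag names depend on the
    # training corpus's tagset.
    p = self.transition_PD['NOUN'].prob('VERB')   # P(next tag VERB | current NOUN)
    p0 = self.transition_PD['NOUN'].prob('XYZ')   # unseen: gamma / (N + gamma * bins) > 0
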
Example no. 2
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        #raise NotImplementedError('HMM.emission_model')
        # TODO prepare data

        # Don't forget to lowercase the observation, otherwise it won't match the test data
        # Do NOT add <s> or </s> to the input sentences
        data = []
        for s in train_data:
            for (word, tag) in s:
                data.append((tag, word.lower()))

        # TODO compute the emission model
        emission_FD = nltk.ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.emission_PD = nltk.ConditionalProbDist(emission_FD, lidstone_estimator)
        self.states = list(set([ tag for (tag, word) in data]))

        return self.emission_PD, self.states
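
A matching sketch for the emission side (hypothetical usage): queries must be lowercased, since training lowercased every observation, and list(set(...)) leaves the order of states unspecified.

    em, states = self.emission_model(train_data)
    p = em['NN'].prob('dog')   # P(word 'dog' | tag 'NN'); lowercase only
    print(sorted(states))      # sort if a stable order is needed
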
Example no. 3
    def transition_model(self, train_data):
        """
        Compute a transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        #raise NotImplementedError('HMM.transition_model')
        # TODO: prepare the data
        data = []

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL <s> and the END SYMBOL </s>
        padded_data = []
        for s in train_data:
            padded_data.append([('<s>','<s>')] + s + [('</s>','</s>')])  # TODO

        tagGenerators=(((s[i][1],s[i+1][1]) for i in range(len(s)-1)) for s in padded_data)
        data = list(itertools.chain.from_iterable(tagGenerators))

        # TODO compute the transition model

        transition_FD = nltk.ConditionalFreqDist(data)
        lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
        self.transition_PD = nltk.ConditionalProbDist(transition_FD, lidstone_estimator)

        return self.transition_PD
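
This generator-based version produces the same (tag_i, tag_(i+1)) pairs as Example no. 1's explicit loop, since the <s>/</s> padding turns the sentence boundaries into ordinary adjacent pairs. A minimal check on a toy sentence:

    import itertools
    s = [('the', 'DET'), ('dog', 'NOUN')]
    padded = [('<s>', '<s>')] + s + [('</s>', '</s>')]
    gens = (((p[i][1], p[i+1][1]) for i in range(len(p)-1)) for p in [padded])
    assert list(itertools.chain.from_iterable(gens)) == \
        [('<s>', 'DET'), ('DET', 'NOUN'), ('NOUN', '</s>')]
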
Example no. 4
def train_word_lm_lidstone(dataset, n=2, gamma=0.01):
    lidstone_estimator = lambda fd: nltk.LidstoneProbDist(
        fd, gamma,
        fd.B() + 100)
    model = NgramModel(n,
                       dataset,
                       smoothing=True,
                       estimator=lidstone_estimator)
    return model
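
Note that NgramModel is the old nltk.model.NgramModel, which was removed in NLTK 3, and fd.B() + 100 pads the bin count to reserve probability mass for words never seen in training. A hedged usage sketch, assuming NLTK 2.x or a local NgramModel with the same signature:

    sents = [['the', 'dog', 'barks'], ['the', 'cat', 'sleeps']]
    lm = train_word_lm_lidstone(sents, n=2, gamma=0.01)
    # lm.prob('dog', ['the'])  # NLTK 2-style query, if the class supports it
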
Example no. 5
 def getLidstoneTag(self, tagged_corpus):
     """
     Converts the provided frequency distribution into a Lidstone estimation
     using the nltk.probability module.  Gamma=1, bins=None is Laplace.
     This estimation is used to decide which tag to assign given the tag-
     context observed in the test tagged corpus. Creates necessary fd from
     training tagged corpus.
     """
     fd = self.getNGramTagFD(tagged_corpus)
     td_ngram = nltk.LidstoneProbDist(fd, self._gamma, bins=self._bins)
     return td_ngram
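
For reference, nltk.LidstoneProbDist assigns each sample probability (c + gamma) / (N + B * gamma), where c is the sample's count, N the total number of observations, and B the bin count; gamma=1 with B equal to the vocabulary size recovers Laplace smoothing, as the docstring notes. A quick numeric check:

    import nltk
    fd = nltk.FreqDist({'a': 3, 'b': 1})         # N = 4 observations
    pd = nltk.LidstoneProbDist(fd, 0.5, bins=3)  # gamma = 0.5, B = 3
    assert abs(pd.prob('a') - 3.5 / 5.5) < 1e-9
    assert abs(pd.prob('c') - 0.5 / 5.5) < 1e-9  # unseen sample
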
Example no. 6
    def generate(self, length=100):
        """"""
        # Change tokens
        self.tokens = nltk.word_tokenize(
            self.__words[randint(1, len(self.__words)) - 1])

        estimator = lambda fdist, bins: nltk.LidstoneProbDist(
            fdist, self.__random.random())
        #estimator = lambda fdist, bins: nltk.LidstoneProbDist(fdist, 0.2)
        self._trigram_model = nltk.NgramModel(self.__random.randint(3, 15),
                                              self, estimator)
        #self._trigram_model = nltk.NgramModel(3, self, estimator)
        text = self._trigram_model.generate(length)
        return nltk.tokenwrap(text)
Example no. 7
def train():
    print('Training HMM...')

    # Use the first 1000 sentences from the 'news' category of the Brown corpus
    labelled_sequences, states, symbols = get_pos_data(1000)

    # Define the estimator to be used for probability computation
    estimator = lambda fd, bins: nltk.LidstoneProbDist(fd, 0.1, bins)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    freq_starts = nltk.FreqDist()
    freq_transitions = nltk.ConditionalFreqDist()
    freq_emissions = nltk.ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts is None:
                freq_starts[state] += 1
            else:
                freq_transitions[lasts][state] += 1
            freq_emissions[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in states:
                states.append(state)
            if symbol not in symbols:
                symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(states)
    starts = estimator(freq_starts, N)
    transitions = nltk.ConditionalProbDist(freq_transitions, estimator, N)
    emissions = nltk.ConditionalProbDist(freq_emissions, estimator,
                                         len(symbols))

    # Return the transition and emissions probabilities along with
    # the list of all the states and output symbols
    return starts, transitions, emissions, states, symbols
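
These three distributions line up with what NLTK's built-in HMM tagger consumes; a sketch of wiring them together (assuming get_pos_data returns Brown-style tagged sentences):

    starts, transitions, emissions, states, symbols = train()
    tagger = nltk.tag.hmm.HiddenMarkovModelTagger(
        symbols, states, transitions, emissions, starts)
    print(tagger.tag(['the', 'dog', 'barks']))
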
Example no. 8
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """

        # TODO prepare data
        data = []
        # Don't forget to lowercase the observation, otherwise it won't match the test data
        for sent in train_data:
            data += [(tag, word.lower()) for (word, tag) in sent]

        # TODO compute the emission model
        emission_FD = ConditionalFreqDist(data)
        estimator = lambda f: nltk.LidstoneProbDist(f, 0.01, f.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, estimator)
        self.states = [s for s in self.emission_PD.keys()]

        return self.emission_PD, self.states
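
Here the state list comes from the ConditionalProbDist itself rather than from the raw pairs; ConditionalProbDist exposes the same information through conditions(), so the two approaches are interchangeable:

    em, states = self.emission_model(train_data)  # hypothetical call
    assert set(states) == set(em.conditions())
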
Example no. 9
def word_suavizacao(all_words):
    # all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(all_words))
    all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.01)
    return list(all_words.samples())
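
Since samples() simply returns the vocabulary observed in the underlying FreqDist, the Lidstone wrap does not change the returned list here; the smoothing would only matter if prob() were queried. A quick check:

    import nltk
    words = ['a', 'b', 'a']
    smoothed = nltk.LidstoneProbDist(nltk.FreqDist(words), 0.01)
    assert set(smoothed.samples()) == set(nltk.FreqDist(words).keys())
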
Example no. 10
 def LidProDist(freqDist):
     return nltk.LidstoneProbDist(freqDist, 0.01, freqDist.B() + 1)
Example no. 11
def LidstoneProbDistFactory(freqdist):
    return nltk.LidstoneProbDist(freqdist, .01, freqdist.B() + 1)
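
Such a factory is exactly the estimator callable that nltk.ConditionalProbDist expects; it is applied once per condition. A minimal sketch:

    import nltk
    cfd = nltk.ConditionalFreqDist([('A', 'x'), ('A', 'y'), ('B', 'x')])
    cpd = nltk.ConditionalProbDist(cfd, LidstoneProbDistFactory)
    print(cpd['A'].prob('x'))   # smoothed P(x | A)
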
Example no. 12
temp = []  # simplified tag sequences, one list per sentence
for sent in brown_tags:
    sub_temp = []
    for tag in sent:
        try:
            sub_temp.append(nltk.tag.simplify_tag(tag))
        except IndexError: pass
    temp.append(sub_temp)

fd = nltk.FreqDist()
for sent in temp:
    if len(sent)>2:
        temp_tri = nltk.trigrams(sent)
        for entry in temp_tri:
            fd.inc(entry)
            
tfd = nltk.LidstoneProbDist(fd, gamma=1)
tfd_avg = sum(tfd.prob(sample) for sample in \
              tfd.samples()) / float(len(tfd.samples()))

raw = urlopen(url).read()
raw = nltk.clean_html(raw)
raw = raw.replace('\r\n', '')
raw = raw.replace('\n', '')
raw = raw.replace('\\', '')
raw = raw.lower()
for key in contraction_dict.keys():
    raw = raw.replace(key, contraction_dict[key])
sents_tok = nltk.sent_tokenize(raw)

matchVec = []
for sent in sents_tok:
Example no. 13
 def makeNGrams(self, k=0, print_out=True):
     # Materialize the n-grams first: nltk.ngrams returns a generator,
     # which FreqDist would otherwise exhaust.
     self.grams = list(nltk.ngrams(self.corpus, self.n_gram))
     gfreq = nltk.FreqDist(self.grams)
     self.pdf = nltk.LidstoneProbDist(gfreq, k)
     if print_out:
         print("N-grams ready!")
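
A hedged usage sketch (assuming an enclosing class with a .corpus token list and an .n_gram order, and a positive k so the smoothing is real):

    # Hypothetical usage, e.g. after obj.makeNGrams(k=0.01) with n_gram = 2:
    p = obj.pdf.prob(('the', 'dog'))   # Lidstone-smoothed bigram probability
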
Example no. 14
def executar(experimento,nome_Base,acento):
    nomeBase = nome_Base
    path = experimento+nomeBase
    # print('executando:\n'+path)
    # print('Sem acento:\n'+('Sim' if(acento) else 'Não'))

    base = readBase(nomeBase)
    tamBase = len(base)
    i=0
    documents = []
    #print base[0][0].split()
    tknzr = nltk.tokenize.TweetTokenizer()

    while (i<tamBase):
        if(acento):
            w = [q.lower() for q in remocaoacento(tknzr.tokenize(base[i][0]))]
        else:
            w = [q.lower() for q in tknzr.tokenize(base[i][0])]
        w = remocaopontos(w)
        conteudoLista = (w,base[i][1])
        documents.append(conteudoLista)
        i += 1

    ################################ Preprocessing
    stopwords = nltk.corpus.stopwords.words('portuguese')

    stemmer = nltk.stem.RSLPStemmer()

    # h=0
    # j=len(documents)
    # while (h<j):
    #    g=len(documents[h][0])
    #    f=0
    #    while(f<g):
    #        stemmer.stem(documents[h][0][f])
    #        f+=1
    #    h += 1

    ################################

    random.shuffle(documents)

    all_words = []

    k=0
    l=len(documents)
    while (k<l):
        m=len(documents[k][0])
        n=0
        while(n<m):
            all_words.append(documents[k][0][n])
            n+=1
        k += 1
    # all_words = remocaopontos(all_words)
    
    all_words = [w.lower() for w in all_words if w not in stopwords]
    # print(str(all_words))

    #all_words = nltk.FreqDist(all_words) #computes word frequencies; set the word-count threshold
    #all_words = nltk.LaplaceProbDist(nltk.FreqDist(all_words))
    #all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(all_words))
    #all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1)
    #all_words = nltk.WittenBellProbDist(nltk.FreqDist(all_words))
    #nltk.WittenBellProbDist() look up how to change the n-gram order
    #all_words = nltk.MLEProbDist(nltk.FreqDist(all_words))
    #all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(w.lower() for w in all_words if w not in stopwords))

    all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1)
    #all_words = nltk.FreqDist(nltk.FreqDist(w.lower() for w in all_words if w not in stopwords))

    word_features = list(all_words.samples()) # if using FreqDist: list of words that appear more than 3000 times
    
    # word_features =nltk.LidstoneProbDist(nltk.FreqDist(word_features), 0.1)
    # word_features = word_features.samples()
    #word_features = list(all_words.keys())
    '''this is what I modified:
    def find_features(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)

        return features
    '''
    # here
    
    def wordbigram(word_feature):
        bigram =[]
        i=0
        l = len(word_feature)-1
        while (i<l):
            # if ((not word_feature[i] in stopwords) or (not word_feature[i+1]in stopwords)):
            s = tuple([stemmer.stem(word_feature[i]),stemmer.stem(word_feature[i+1])])
            bigram.append(s)
            i+=1
        return bigram

    def removerpalavras(todas_palavras,document):
        #remove the words that are not in todas_palavras
        linha = []
        for w in document:
            if(w in todas_palavras):
                linha.append(w)
        return linha

    def wordFeature(documents):
        #build a vocabulary of all words in the documents
        dicionario = []
        for w in documents:
            for q in w[0]:
                if(not q in dicionario):
                    dicionario.append(q)   
        return dicionario

    documents = [[removerpalavras(all_words.samples(),w[0]),w[1]] for w in documents]
    documents = [[wordbigram(w[0]),w[1]] for w in documents]
    word_features = wordFeature(documents) # if using FreqDist: list of words that appear more than 3000 times
    # print(str(len(word_features)))
    # exit()
    # word_features = list(all_words.samples())#se 0usando FreqDistlista com palavras que aparecem mais de 3000
    
    def find_features(document):
        # words = set(document)
        features = {}
        i=0
        l = len(word_features)
        while(i<l):
            features[str(i)] = (word_features[i] in document)
            i+=1
        return features
    featuresets = [(find_features(rev), category) for (rev, category) in documents]
    

    kfold = 4

    baseInteira = featuresets

    tamT = len(featuresets)
    divisao = tamT//kfold

    ###### adjust the split
    baseDividida1 = featuresets[0:divisao]
    baseDividida2 = featuresets[divisao:(divisao*2)]
    baseDividida3 = featuresets[(divisao*2):(divisao*3)]
    baseDividida4 = featuresets[(divisao*3):tamT]

    #tamT = len(featuresets)
    #umQuarto = tamBase/4

    #training_set = featuresets[umQuarto:]
    #testing_set = featuresets[:umQuarto]

    #training_set = featuresets[100:]
    #testing_set = featuresets[0:100]

    ########################## round 1
    #print "## RODADA 1 ##"

    training_set = baseDividida2+baseDividida3+baseDividida4
    testing_set = baseDividida1

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa1 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    MNBpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoMNB1 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPMNB1 = 0
    while(g<len(precisaoMNB1)):
        somaPMNB1 = somaPMNB1+precisaoMNB1[g]
        g=g+1
    MNBpt1 = (somaPMNB1/len(precisaoMNB1))*100
    MNBrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallMNB1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRMNB1 = 0
    while(g<len(recallMNB1)):
        somaRMNB1 = somaRMNB1+recallMNB1[g]
        g=g+1
    MNBrt1 = (somaRMNB1/len(recallMNB1))*100
    MNBfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFMNB1 = 0
    while(g<len(f1MNB1)):
        somaFMNB1 = somaFMNB1+f1MNB1[g]
        g=g+1
    MNBft1 = (somaFMNB1/len(f1MNB1))*100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra1 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    Rpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoR1 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPR1 = 0
    while(g<len(precisaoR1)):
        somaPR1 = somaPR1+precisaoR1[g]
        g=g+1
    Rpt1 = (somaPR1/len(precisaoR1))*100
    Rrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallR1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRR1 = 0
    while(g<len(recallR1)):
        somaRR1 = somaRR1+recallR1[g]
        g=g+1
    Rrt1 = (somaRR1/len(recallR1))*100
    Rfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFR1 = 0
    while(g<len(f1R1)):
        somaFR1 = somaFR1+f1R1[g]
        g=g+1
    Rft1 = (somaFR1/len(f1R1))*100

    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La1 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    Lpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoL1 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPL1 = 0
    while(g<len(precisaoL1)):
        somaPL1 = somaPL1+precisaoL1[g]
        g=g+1
    Lpt1 = (somaPL1/len(precisaoL1))*100
    Lrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallL1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRL1 = 0
    while(g<len(recallL1)):
        somaRL1 = somaRL1+recallL1[g]
        g=g+1
    Lrt1 = (somaRL1/len(recallL1))*100
    Lfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFL1 = 0
    while(g<len(f1L1)):
        somaFL1 = somaFL1+f1L1[g]
        g=g+1
    Lft1 = (somaFL1/len(f1L1))*100

    ######################## round 2
    #print "## RODADA 2 ##"

    training_set = baseDividida1+baseDividida3+baseDividida4
    testing_set = baseDividida2

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa2 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    MNBpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoMNB2 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPMNB2 = 0
    while(g<len(precisaoMNB2)):
        somaPMNB2 = somaPMNB2+precisaoMNB2[g]
        g=g+1
    MNBpt2 = (somaPMNB2/len(precisaoMNB2))*100
    MNBrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallMNB2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRMNB2 = 0
    while(g<len(recallMNB2)):
        somaRMNB2 = somaRMNB2+recallMNB2[g]
        g=g+1
    MNBrt2 = (somaRMNB2/len(recallMNB2))*100
    MNBfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB2 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFMNB2 = 0
    while(g<len(f1MNB2)):
        somaFMNB2 = somaFMNB2+f1MNB2[g]
        g=g+1
    MNBft2 = (somaFMNB2/len(f1MNB2))*100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra2 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    Rpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoR2 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPR2 = 0
    while(g<len(precisaoR2)):
        somaPR2 = somaPR2+precisaoR2[g]
        g=g+1
    Rpt2 = (somaPR2/len(precisaoR2))*100
    Rrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallR2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRR2 = 0
    while(g<len(recallR2)):
        somaRR2 = somaRR2+recallR2[g]
        g=g+1
    Rrt2 = (somaRR2/len(recallR2))*100
    Rfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R2 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFR2 = 0
    while(g<len(f1R2)):
        somaFR2 = somaFR2+f1R2[g]
        g=g+1
    Rft2 = (somaFR2/len(f1R2))*100

    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La2 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    Lpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoL2 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPL2 = 0
    while(g<len(precisaoL2)):
        somaPL2 = somaPL2+precisaoL2[g]
        g=g+1
    Lpt2 = (somaPL2/len(precisaoL2))*100
    Lrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallL2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRL2 = 0
    while(g<len(recallL2)):
        somaRL2 = somaRL2+recallL2[g]
        g=g+1
    Lrt2 = (somaRL2/len(recallL2))*100
    Lfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L2 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFL2 = 0
    while(g<len(f1L2)):
        somaFL2 = somaFL2+f1L2[g]
        g=g+1
    Lft2 = (somaFL2/len(f1L2))*100

    ##################### round 3
    #print "## RODADA 3 ##"

    training_set = baseDividida1+baseDividida2+baseDividida4
    testing_set = baseDividida3

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa3 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    MNBpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoMNB3 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPMNB3 = 0
    while(g<len(precisaoMNB3)):
        somaPMNB3 = somaPMNB3+precisaoMNB3[g]
        g=g+1
    MNBpt3 = (somaPMNB3/len(precisaoMNB3))*100
    MNBrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallMNB3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRMNB3 = 0
    while(g<len(recallMNB3)):
        somaRMNB3 = somaRMNB3+recallMNB3[g]
        g=g+1
    MNBrt3 = (somaRMNB3/len(recallMNB3))*100
    MNBfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB3 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFMNB3 = 0
    while(g<len(f1MNB3)):
        somaFMNB3 = somaFMNB3+f1MNB3[g]
        g=g+1
    MNBft3 = (somaFMNB3/len(f1MNB3))*100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra3 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    Rpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoR3 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPR3 = 0
    while(g<len(precisaoR3)):
        somaPR3 = somaPR3+precisaoR3[g]
        g=g+1
    Rpt3 = (somaPR3/len(precisaoR3))*100
    Rrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallR3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRR3 = 0
    while(g<len(recallR3)):
        somaRR3 = somaRR3+recallR3[g]
        g=g+1
    Rrt3 = (somaRR3/len(recallR3))*100
    Rfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R3 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFR3 = 0
    while(g<len(f1R3)):
        somaFR3 = somaFR3+f1R3[g]
        g=g+1
    Rft3 = (somaFR3/len(f1R3))*100

    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La3 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    Lpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoL3 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPL3 = 0
    while(g<len(precisaoL3)):
        somaPL3 = somaPL3+precisaoL3[g]
        g=g+1
    Lpt3 = (somaPL3/len(precisaoL3))*100
    Lrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallL3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRL3 = 0
    while(g<len(recallL3)):
        somaRL3 = somaRL3+recallL3[g]
        g=g+1
    Lrt3 = (somaRL3/len(recallL3))*100
    Lfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L3 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFL3 = 0
    while(g<len(f1L3)):
        somaFL3 = somaFL3+f1L3[g]
        g=g+1
    Lft3 = (somaFL3/len(f1L3))*100

    ############################ round 4
    #print "## RODADA 4 ##"

    training_set = baseDividida1+baseDividida2+baseDividida3
    testing_set = baseDividida4

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa4 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    MNBpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoMNB4 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPMNB4 = 0
    while(g<len(precisaoMNB4)):
        somaPMNB4 = somaPMNB4+precisaoMNB4[g]
        g=g+1
    MNBpt4 = (somaPMNB4/len(precisaoMNB4))*100
    MNBrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallMNB4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRMNB4 = 0
    while(g<len(recallMNB4)):
        somaRMNB4 = somaRMNB4+recallMNB4[g]
        g=g+1
    MNBrt4 = (somaRMNB4/len(recallMNB4))*100
    MNBfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB4 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFMNB4 = 0
    while(g<len(f1MNB4)):
        somaFMNB4 = somaFMNB4+f1MNB4[g]
        g=g+1
    MNBft4 = (somaFMNB4/len(f1MNB4))*100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra4 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    Rpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoR4 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPR4 = 0
    while(g<len(precisaoR4)):
        somaPR4 = somaPR4+precisaoR4[g]
        g=g+1
    Rpt4 = (somaPR4/len(precisaoR4))*100
    Rrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallR4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRR4 = 0
    while(g<len(recallR4)):
        somaRR4 = somaRR4+recallR4[g]
        g=g+1
    Rrt4 = (somaRR4/len(recallR4))*100
    Rfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R4 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFR4 = 0
    while(g<len(f1R4)):
        somaFR4 = somaFR4+f1R4[g]
        g=g+1
    Rft4 = (somaFR4/len(f1R4))*100

    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La4 = (sklearn.metrics.accuracy_score(testgold, testclas))*100
    Lpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None)*100
    precisaoL4 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    g=0
    somaPL4 = 0
    while(g<len(precisaoL4)):
        somaPL4 = somaPL4+precisaoL4[g]
        g=g+1
    Lpt4 = (somaPL4/len(precisaoL4))*100
    Lrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None))*100
    recallL4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g=0
    somaRL4 = 0
    while(g<len(recallL4)):
        somaRL4 = somaRL4+recallL4[g]
        g=g+1
    Lrt4 = (somaRL4/len(recallL4))*100
    Lfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L4 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g=0
    somaFL4 = 0
    while(g<len(f1L4)):
        somaFL4 = somaFL4+f1L4[g]
        g=g+1
    Lft4 = (somaFL4/len(f1L4))*100


    ################# averages
    #print "## MEDIA ##"

    #MULTINOMIAL
    MNBmc = (MNBmc1+MNBmc2+MNBmc3+MNBmc4)/4
    MNBa = (MNBa1+MNBa2+MNBa3+MNBa4)/4
    MNBamax = max([MNBa1, MNBa2, MNBa3, MNBa4])
    MNBamin = min([MNBa1, MNBa2, MNBa3, MNBa4])
    MNBpp = (MNBpp1+MNBpp2+MNBpp3+MNBpp4)/4
    MNBpt = (MNBpt1+MNBpt2+MNBpt3+MNBpt4)/4
    MNBpmax = max([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
    MNBpmin = min([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
    MNBrp = (MNBrp1+MNBrp2+MNBrp3+MNBrp4)/4
    MNBrt = (MNBrt1+MNBrt2+MNBrt3+MNBrt4)/4
    MNBrmax = max([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
    MNBrmin = min([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
    MNBfp = (MNBfp1+MNBfp2+MNBfp3+MNBfp4)/4
    MNBft = (MNBft1+MNBft2+MNBft3+MNBft4)/4
    MNBfmax = max([MNBft1, MNBft2, MNBft3, MNBft4])
    MNBfmin = min([MNBft1, MNBft2, MNBft3, MNBft4])

    '''
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.set_aspect('equal')
    plt.imshow(MNBmc, interpolation='nearest', cmap=plt.cm.ocean)
    plt.colorbar()
    plt.show()
    '''

    #LOGISTIC REGRESSION
    Rmc = (Rmc1+Rmc2+Rmc3+Rmc4)/4
    Ra = (Ra1+Ra2+Ra3+Ra4)/4
    Ramax = max([Ra1, Ra2, Ra3, Ra4])
    Ramin = min([Ra1, Ra2, Ra3, Ra4])
    Rpp = (Rpp1+Rpp2+Rpp3+Rpp4)/4
    Rpt = (Rpt1+Rpt2+Rpt3+Rpt4)/4
    Rpmax = max([Rpt1, Rpt2, Rpt3, Rpt4])
    Rpmin = min([Rpt1, Rpt2, Rpt3, Rpt4])
    Rrp = (Rrp1+Rrp2+Rrp3+Rrp4)/4
    Rrt = (Rrt1+Rrt2+Rrt3+Rrt4)/4
    Rrmax = max([Rrt1, Rrt2, Rrt3, Rrt4])
    Rrmin = min([Rrt1, Rrt2, Rrt3, Rrt4])
    Rfp = (Rfp1+Rfp2+Rfp3+Rfp4)/4
    Rft = (Rft1+Rft2+Rft3+Rft4)/4
    Rfmax = max([Rft1, Rft2, Rft3, Rft4])
    Rfmin = min([Rft1, Rft2, Rft3, Rft4])

    #LINEAR SVC
    Lmc = (Lmc1+Lmc2+Lmc3+Lmc4)/4
    La = (La1+La2+La3+La4)/4
    Lamax = max([La1, La2, La3, La4])
    Lamin = min([La1, La2, La3, La4])
    Lpp = (Lpp1+Lpp2+Lpp3+Lpp4)/4
    Lpt = (Lpt1+Lpt2+Lpt3+Lpt4)/4
    Lpmax = max([Lpt1, Lpt2, Lpt3, Lpt4])
    Lpmin = min([Lpt1, Lpt2, Lpt3, Lpt4])
    Lrp = (Lrp1+Lrp2+Lrp3+Lrp4)/4
    Lrt = (Lrt1+Lrt2+Lrt3+Lrt4)/4
    Lrmax = max([Lrt1, Lrt2, Lrt3, Lrt4])
    Lrmin = min([Lrt1, Lrt2, Lrt3, Lrt4])
    Lfp = (Lfp1+Lfp2+Lfp3+Lfp4)/4
    Lft = (Lft1+Lft2+Lft3+Lft4)/4
    Lfmax = max([Lft1, Lft2, Lft3, Lft4])
    Lfmin = min([Lft1, Lft2, Lft3, Lft4])
    '''
    print "SVC Linear"
    print "Matriz de confusão: ", Lmc
    print "Acuracia: ", La
    print "Precisão parcial: ", Lpp
    print "Precisão total: ", Lpt
    print "Recall parcial: ", Lrp
    print "Recall total: ", Lrt
    print "F-medida parcial: ", Lfp
    print "F-medida total: ", Lft
    '''


    print(experimento + ':' + str(MNBa) +'\t'+str(Ra)+'\t'+str(La))
    with open(path,mode='w') as csv_file:
        #writer = csv.writer(csv_file)
        csv_file.writelines('Algoritmo'+';'+'Multinomial Naïve-Bayes'+'\n')
        csv_file.writelines('Iteração'+';'+'Acurácia'+';'+'Precisão parcial'+';'+'Precisão total'+';'+'revocação parcial'+';'+'revocação total'+';'+'f-medida parcial'+';'+'f-medida total'+'\n')
        csv_file.writelines('1;'+ str(MNBa1)+';'+str(MNBpp1)+';'+str(MNBpt1)+';'+str(MNBrp1)+';'+str(MNBrt1)+';'+str(MNBfp1)+';'+str(MNBft1)+'\n')
        csv_file.writelines('2;'+ str(MNBa2)+';'+str(MNBpp2)+';'+str(MNBpt2)+';'+str(MNBrp2)+';'+str(MNBrt2)+';'+str(MNBfp2)+';'+str(MNBft2)+'\n')
        csv_file.writelines('3;'+ str(MNBa3)+';'+str(MNBpp3)+';'+str(MNBpt3)+';'+str(MNBrp3)+';'+str(MNBrt3)+';'+str(MNBfp3)+';'+str(MNBft3)+'\n')
        csv_file.writelines('4;'+ str(MNBa4)+';'+str(MNBpp4)+';'+str(MNBpt4)+';'+str(MNBrp4)+';'+str(MNBrt4)+';'+str(MNBfp4)+';'+str(MNBft4)+'\n')
        csv_file.writelines('=================='+'\n')
        csv_file.writelines('Total'+'\n')
        csv_file.writelines('Média;'+ str(MNBa)+';'+str(MNBpp)+';'+str(MNBpt)+';'+str(MNBrp)+';'+str(MNBrt)+';'+str(MNBfp)+';'+str(MNBft)+'\n')
        csv_file.writelines('Máximo;'+ str(MNBamax)+""+';'+str(MNBpmax)+""+';'+str(MNBrmax)+""+';'+str(MNBfmax)+'\n')
        csv_file.writelines('Mínimo;'+ str(MNBamin)+""+';'+str(MNBpmin)+""+';'+str(MNBrmin)+""+';'+str(MNBfmin)+'\n')
        csv_file.writelines('=================='+'\n')
        csv_file.writelines('Algoritmo'+';'+'Regressão Linear'+'\n')
        csv_file.writelines('Iteração'+';'+'Acurácia'+';'+'Precisão parcial'+';'+'Precisão total'+';'+'revocação parcial'+';'+'revocação total'+';'+'f-medida parcial'+';'+'f-medida total'+'\n')
        csv_file.writelines('1;'+ str(Ra1)+';'+str(Rpp1)+';'+str(Rpt1)+';'+str(Rrp1)+';'+str(Rrt1)+';'+str(Rfp1)+';'+str(Rft1)+'\n')
        csv_file.writelines('2;'+ str(Ra2)+';'+str(Rpp2)+';'+str(Rpt2)+';'+str(Rrp2)+';'+str(Rrt2)+';'+str(Rfp2)+';'+str(Rft2)+'\n')
        csv_file.writelines('3;'+ str(Ra3)+';'+str(Rpp3)+';'+str(Rpt3)+';'+str(Rrp3)+';'+str(Rrt3)+';'+str(Rfp3)+';'+str(Rft3)+'\n')
        csv_file.writelines('4;'+ str(Ra4)+';'+str(Rpp4)+';'+str(Rpt4)+';'+str(Rrp4)+';'+str(Rrt4)+';'+str(Rfp4)+';'+str(Rft4)+'\n')
        csv_file.writelines('=================='+'\n')
        csv_file.writelines('Total'+'\n')
        csv_file.writelines('Média;'+ str(Ra)+';'+str(Rpp)+';'+str(Rpt)+';'+str(Rrp)+';'+str(Rrt)+';'+str(Rfp)+';'+str(Rft)+'\n')
        csv_file.writelines('Máximo;'+ str(Ramax)+""+';'+str(Rpmax)+""+';'+str(Rrmax)+""+';'+str(Rfmax)+'\n')
        csv_file.writelines('Mínimo;'+ str(Ramin)+""+';'+str(Rpmin)+""+';'+str(Rrmin)+""+';'+str(Rfmin)+'\n')
        csv_file.writelines('=================='+'\n')
        csv_file.writelines('Algoritmo'+';'+'SVC Linear'+'\n')
        csv_file.writelines('Iteração'+';'+'Acurácia'+';'+'Precisão parcial'+';'+'Precisão total'+';'+'revocação parcial'+';'+'revocação total'+';'+'f-medida parcial'+';'+'f-medida total'+'\n')
        csv_file.writelines('1;'+ str(La1)+';'+str(Lpp1)+';'+str(Lpt1)+';'+str(Lrp1)+';'+str(Lrt1)+';'+str(Lfp1)+';'+str(Lft1)+'\n')
        csv_file.writelines('2;'+ str(La2)+';'+str(Lpp2)+';'+str(Lpt2)+';'+str(Lrp2)+';'+str(Lrt2)+';'+str(Lfp2)+';'+str(Lft2)+'\n')
        csv_file.writelines('3;'+ str(La3)+';'+str(Lpp3)+';'+str(Lpt3)+';'+str(Lrp3)+';'+str(Lrt3)+';'+str(Lfp3)+';'+str(Lft3)+'\n')
        csv_file.writelines('4;'+ str(La4)+';'+str(Lpp4)+';'+str(Lpt4)+';'+str(Lrp4)+';'+str(Lrt4)+';'+str(Lfp4)+';'+str(Lft4)+'\n')
        csv_file.writelines('=================='+'\n')
        csv_file.writelines('Total'+'\n')
        csv_file.writelines('Média;'+ str(La)+';'+str(Lpp)+';'+str(Lpt)+';'+str(Lrp)+';'+str(Lrt)+';'+str(Lfp)+';'+str(Lft)+'\n')
        csv_file.writelines('Máximo;'+ str(Lamax)+""+';'+str(Lpmax)+""+';'+str(Lrmax)+""+';'+str(Lfmax)+'\n')
        csv_file.writelines('Mínimo;'+ str(Lamin)+""+';'+str(Lpmin)+""+';'+str(Lrmin)+""+';'+str(Lfmin)+'\n')
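
Each of the four rounds above repeats the same metric block per classifier, and the manual while loops just compute macro-averages over the per-class arrays. A hypothetical refactoring sketch of one block (the helper name avaliar and the tuple layout are illustrative, not from the original):

    import sklearn.metrics

    def avaliar(classifier, testing_set):
        """Return (confusion matrix, accuracy%, macro precision%, macro recall%, macro F1%)."""
        testclas = classifier.classify_many([fs for (fs, l) in testing_set])
        testgold = [l for (fs, l) in testing_set]
        cm = sklearn.metrics.confusion_matrix(testgold, testclas)
        acc = sklearn.metrics.accuracy_score(testgold, testclas) * 100
        prec = sklearn.metrics.precision_score(testgold, testclas, average=None)
        rec = sklearn.metrics.recall_score(testgold, testclas, average=None)
        f1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
        return cm, acc, prec.mean() * 100, rec.mean() * 100, f1.mean() * 100

    # e.g.: MNBmc1, MNBa1, MNBpt1, MNBrt1, MNBft1 = avaliar(MNB_classifier, testing_set)
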
Example no. 15
def executar(experimento, nome_Base, acento):
    '''
    nomeBase = nome_Base
    path = experimento+nomeBase
    print('executando:\n'+path)
    print('Sem acento:\n'+('Sim' if(acento) else 'Não'))

    base = readBase(nomeBase)
    tamBase = len(base)
    '''
    base = [
        ('  Vejam bem, maioria dos homens apoiam. maioria de mulheres não. Homens tendem a serem lógicos, mulheres emotivas. Sera que mulheres acham que rodeios é judiação de animais? O que é claramente equivocado da parte delas. Como também isso se repete na PL3722, homens pensam logicamente, e são mais favorável ao cidadão ter menos restrições a armas de fogo para proteger seu patrimônio e sua família. Ja as mães, familiares, namoradas(os), principalmente de bandidos, acham isso um terror, por que qual mãe quer ver o filho bandido morto praticando um assalto? Logo são contra o direito das vitimas de si proteger do seu filho bandido. Vote consciente, vote com razão e não emoção! Brasil melhora rapidinho. ',
         1),
        ('Observando daquí, a debandada dos derrotados. Cadê o Sacoman Keffeyo, a Ana Animais e, o pilantra do Haroldo Girafales? Perderam a coragem de virem aquí na enquete, questionar a sanção do PL?  Corvardões perdedores: vão chorar no colinho da Luisa Mell.  ',
         1),
        ('Dezenas de debates e ficou mais que provado que animais atletas nao sao animais maus tratados,parabens capitao augusto. ',
         1),
        ('PARABÉNS CAPITÃO AUGUSTO ISSO PROVA QUE QUEM AMA CUIDA,ANIMAIS TRATADOS COM MUITO CARINHO ',
         1), ('Parabéns Capitão Augusto,agora é lei.', 1)
    ]
    tamBase = len(base)
    i = 0
    documents = []
    #print base[0][0].split()
    tknzr = nltk.tokenize.TweetTokenizer()

    while (i < tamBase):
        if (acento):
            w = remocaoacento(tknzr.tokenize(base[i][0]))
        else:
            w = tknzr.tokenize(base[i][0])
        w = remocaopontos(w)
        conteudoLista = (w, base[i][1])
        documents.append(conteudoLista)
        i += 1

    ################################ Preprocessing
    stopwords = nltk.corpus.stopwords.words('portuguese')
    #     stopwords = ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por',
    # 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já',
    # 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus',
    # 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa',
    # 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos',
    # 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas',
    # 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos',
    # 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera',
    # 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver',
    # 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera',
    # 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver',
    # 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria',
    # 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos',
    # 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos',
    # 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém',
    # 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha',
    # 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá',
    # 'teremos', 'terão', 'teria', 'teríamos', 'teriam']

    # stemmer = nltk.stem.RSLPStemmer()

    # h=0
    # j=len(documents)
    # while (h<j):
    #    g=len(documents[h][0])
    #    f=0
    #    while(f<g):
    #        stemmer.stem(documents[h][0][f])
    #        f+=1
    #    h += 1

    ################################

    random.shuffle(documents)

    all_words = []

    k = 0
    l = len(documents)
    while (k < l):
        m = len(documents[k][0])
        n = 0
        while (n < m):
            all_words.append(documents[k][0][n])
            n += 1
        k += 1
    # all_words = remocaopontos(all_words)

    all_words = [w.lower() for w in all_words if w not in stopwords]
    # print(str(all_words))

    #all_words = nltk.FreqDist(all_words) #computes word frequencies; set the word-count threshold
    #all_words = nltk.LaplaceProbDist(nltk.FreqDist(all_words))
    #all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(all_words))
    #all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1)
    #all_words = nltk.WittenBellProbDist(nltk.FreqDist(all_words))
    #nltk.WittenBellProbDist() look up how to change the n-gram order
    #all_words = nltk.MLEProbDist(nltk.FreqDist(all_words))
    #all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(w.lower() for w in all_words if w not in stopwords))

    all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1)
    #all_words = nltk.FreqDist(nltk.FreqDist(w.lower() for w in all_words if w not in stopwords))

    word_features = list(all_words.samples())  # if using FreqDist: list of words that appear more than 3000 times
    #word_features = list(all_words.keys())
    '''this is what I modified:
    def find_features(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)

        return features
    '''

    # here

    def wordbig(word_feature):
        words = []
        i = 0
        l = len(word_feature) - 1
        while (i < l):
            words.append(tuple([word_feature[i], word_feature[i + 1]]))
            i += 1
        return words

    def removerpalavras(todas_palavras, document):
        #remove the words that are not in todas_palavras
        linha = []
        for w in document:
            if (w in todas_palavras):
                linha.append(w)
        return linha

    def wordFeature(documents):
        #build a vocabulary of all words in the documents
        dicionario = []
        for w in documents:
            for q in w[0]:
                if (not q in dicionario):
                    dicionario.append(q)
        return dicionario

    documents = [[removerpalavras(all_words.samples(), w[0]), w[1]]
                 for w in documents]
    documents = [[wordbig(w[0]), w[1]] for w in documents]
    word_features = wordFeature(documents)  # if using FreqDist: list of words that appear more than 3000 times

    # print(str(len(word_features)))
    # exit()
    # word_features = list(all_words.samples())#se 0usando FreqDistlista com palavras que aparecem mais de 3000

    def find_features(document):
        # words = set(document)
        features = {}
        i = 0
        l = len(word_features)
        while (i < l):
            features[str(word_features[i])] = (word_features[i] in document)
            i += 1
        # Debug output (exit() here would abort the run after the first document):
        # print(str(document)); print(); print(str(features)); exit()
        return features

    featuresets = [(find_features(rev), category)
                   for (rev, category) in documents]

    kfold = 4

    baseInteira = featuresets

    tamT = len(featuresets)
    divisao = tamT // kfold

    ###### adjust the split
    baseDividida1 = featuresets[0:divisao]
    baseDividida2 = featuresets[divisao:(divisao * 2)]
    baseDividida3 = featuresets[(divisao * 2):(divisao * 3)]
    baseDividida4 = featuresets[(divisao * 3):tamT]

    #tamT = len(featuresets)
    #umQuarto = tamBase/4

    #training_set = featuresets[umQuarto:]
    #testing_set = featuresets[:umQuarto]

    #training_set = featuresets[100:]
    #testing_set = featuresets[0:100]

    ########################## round 1
    #print "## RODADA 1 ##"

    training_set = baseDividida2 + baseDividida3 + baseDividida4
    testing_set = baseDividida1

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    MNBpp1 = sklearn.metrics.precision_score(testgold, testclas,
                                             average=None) * 100
    precisaoMNB1 = sklearn.metrics.precision_score(testgold,
                                                   testclas,
                                                   average=None)
    g = 0
    somaPMNB1 = 0
    while (g < len(precisaoMNB1)):
        somaPMNB1 = somaPMNB1 + precisaoMNB1[g]
        g = g + 1
    MNBpt1 = (somaPMNB1 / len(precisaoMNB1)) * 100
    MNBrp1 = (sklearn.metrics.recall_score(testgold, testclas,
                                           average=None)) * 100
    recallMNB1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRMNB1 = 0
    while (g < len(recallMNB1)):
        somaRMNB1 = somaRMNB1 + recallMNB1[g]
        g = g + 1
    MNBrt1 = (somaRMNB1 / len(recallMNB1)) * 100
    MNBfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFMNB1 = 0
    while (g < len(f1MNB1)):
        somaFMNB1 = somaFMNB1 + f1MNB1[g]
        g = g + 1
    MNBft1 = (somaFMNB1 / len(f1MNB1)) * 100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Rpp1 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoR1 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPR1 = 0
    while (g < len(precisaoR1)):
        somaPR1 = somaPR1 + precisaoR1[g]
        g = g + 1
    Rpt1 = (somaPR1 / len(precisaoR1)) * 100
    Rrp1 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallR1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRR1 = 0
    while (g < len(recallR1)):
        somaRR1 = somaRR1 + recallR1[g]
        g = g + 1
    Rrt1 = (somaRR1 / len(recallR1)) * 100
    Rfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1R1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFR1 = 0
    while (g < len(f1R1)):
        somaFR1 = somaFR1 + f1R1[g]
        g = g + 1
    Rft1 = (somaFR1 / len(f1R1)) * 100
    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc1 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    Lpp1 = sklearn.metrics.precision_score(testgold, testclas,
                                           average=None) * 100
    precisaoL1 = sklearn.metrics.precision_score(testgold,
                                                 testclas,
                                                 average=None)
    g = 0
    somaPL1 = 0
    while (g < len(precisaoL1)):
        somaPL1 = somaPL1 + precisaoL1[g]
        g = g + 1
    Lpt1 = (somaPL1 / len(precisaoL1)) * 100
    Lrp1 = (sklearn.metrics.recall_score(testgold, testclas,
                                         average=None)) * 100
    recallL1 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRL1 = 0
    while (g < len(recallL1)):
        somaRL1 = somaRL1 + recallL1[g]
        g = g + 1
    Lrt1 = (somaRL1 / len(recallL1)) * 100
    Lfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1L1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFL1 = 0
    while (g < len(f1L1)):
        somaFL1 = somaFL1 + f1L1[g]
        g = g + 1
    Lft1 = (somaFL1 / len(f1L1)) * 100

    ######################## round 2
    #print "## RODADA 2 ##"

    training_set = baseDividida1 + baseDividida3 + baseDividida4
    testing_set = baseDividida2

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100
    MNBpp2 = sklearn.metrics.precision_score(testgold, testclas,
                                             average=None) * 100
    precisaoMNB2 = sklearn.metrics.precision_score(testgold,
                                                   testclas,
                                                   average=None)
    g = 0
    somaPMNB2 = 0
    while (g < len(precisaoMNB2)):
        somaPMNB2 = somaPMNB2 + precisaoMNB2[g]
        g = g + 1
    MNBpt2 = (somaPMNB2 / len(precisaoMNB2)) * 100
    MNBrp2 = (sklearn.metrics.recall_score(testgold, testclas,
                                           average=None)) * 100
    recallMNB2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    g = 0
    somaRMNB2 = 0
    while (g < len(recallMNB2)):
        somaRMNB2 = somaRMNB2 + recallMNB2[g]
        g = g + 1
    MNBrt2 = (somaRMNB2 / len(recallMNB2)) * 100
    MNBfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None))
    f1MNB2 = sklearn.metrics.f1_score(testgold, testclas, average=None)
    g = 0
    somaFMNB2 = 0
    while (g < len(f1MNB2)):
        somaFMNB2 = somaFMNB2 + f1MNB2[g]
        g = g + 1
    MNBft2 = (somaFMNB2 / len(f1MNB2)) * 100
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra2 = sklearn.metrics.accuracy_score(testgold, testclas) * 100
    precisaoR2 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    Rpp2 = precisaoR2 * 100          # per-class precision (%)
    Rpt2 = precisaoR2.mean() * 100   # macro-averaged precision (%)
    recallR2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    Rrp2 = recallR2 * 100            # per-class recall (%)
    Rrt2 = recallR2.mean() * 100     # macro-averaged recall (%)
    Rfp2 = sklearn.metrics.f1_score(testgold, testclas, average=None)  # per-class F1
    Rft2 = Rfp2.mean() * 100         # macro-averaged F1 (%)
    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc2 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La2 = sklearn.metrics.accuracy_score(testgold, testclas) * 100
    precisaoL2 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    Lpp2 = precisaoL2 * 100          # per-class precision (%)
    Lpt2 = precisaoL2.mean() * 100   # macro-averaged precision (%)
    recallL2 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    Lrp2 = recallL2 * 100            # per-class recall (%)
    Lrt2 = recallL2.mean() * 100     # macro-averaged recall (%)
    Lfp2 = sklearn.metrics.f1_score(testgold, testclas, average=None)  # per-class F1
    Lft2 = Lfp2.mean() * 100         # macro-averaged F1 (%)

    ##################### Round 3

    training_set = baseDividida1 + baseDividida2 + baseDividida4
    testing_set = baseDividida3

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa3 = sklearn.metrics.accuracy_score(testgold, testclas) * 100
    precisaoMNB3 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    MNBpp3 = precisaoMNB3 * 100          # per-class precision (%)
    MNBpt3 = precisaoMNB3.mean() * 100   # macro-averaged precision (%)
    recallMNB3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    MNBrp3 = recallMNB3 * 100            # per-class recall (%)
    MNBrt3 = recallMNB3.mean() * 100     # macro-averaged recall (%)
    MNBfp3 = sklearn.metrics.f1_score(testgold, testclas, average=None)  # per-class F1
    MNBft3 = MNBfp3.mean() * 100         # macro-averaged F1 (%)
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada3 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada3*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra3 = sklearn.metrics.accuracy_score(testgold, testclas) * 100
    precisaoR3 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    Rpp3 = precisaoR3 * 100          # per-class precision (%)
    Rpt3 = precisaoR3.mean() * 100   # macro-averaged precision (%)
    recallR3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    Rrp3 = recallR3 * 100            # per-class recall (%)
    Rrt3 = recallR3.mean() * 100     # macro-averaged recall (%)
    Rfp3 = sklearn.metrics.f1_score(testgold, testclas, average=None)  # per-class F1
    Rft3 = Rfp3.mean() * 100         # macro-averaged F1 (%)
    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada3 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada3*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada3 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada3*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc3 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La3 = sklearn.metrics.accuracy_score(testgold, testclas) * 100
    precisaoL3 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    Lpp3 = precisaoL3 * 100          # per-class precision (%)
    Lpt3 = precisaoL3.mean() * 100   # macro-averaged precision (%)
    recallL3 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    Lrp3 = recallL3 * 100            # per-class recall (%)
    Lrt3 = recallL3.mean() * 100     # macro-averaged recall (%)
    Lfp3 = sklearn.metrics.f1_score(testgold, testclas, average=None)  # per-class F1
    Lft3 = Lfp3.mean() * 100         # macro-averaged F1 (%)

    ############################ Round 4

    training_set = baseDividida1 + baseDividida2 + baseDividida3
    testing_set = baseDividida4

    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    MNBmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    MNBa4 = sklearn.metrics.accuracy_score(testgold, testclas) * 100
    precisaoMNB4 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    MNBpp4 = precisaoMNB4 * 100          # per-class precision (%)
    MNBpt4 = precisaoMNB4.mean() * 100   # macro-averaged precision (%)
    recallMNB4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    MNBrp4 = recallMNB4 * 100            # per-class recall (%)
    MNBrt4 = recallMNB4.mean() * 100     # macro-averaged recall (%)
    MNBfp4 = sklearn.metrics.f1_score(testgold, testclas, average=None)  # per-class F1
    MNBft4 = MNBfp4.mean() * 100         # macro-averaged F1 (%)
    '''
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
    BernoulliNB_classifier.train(training_set)
    BernoulliNB_classifierRodada4 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
    print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada4*100)
    '''
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
    LogisticRegression_classifier.train(training_set)
    testclas = LogisticRegression_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Rmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    Ra4 = sklearn.metrics.accuracy_score(testgold, testclas) * 100
    precisaoR4 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    Rpp4 = precisaoR4 * 100          # per-class precision (%)
    Rpt4 = precisaoR4.mean() * 100   # macro-averaged precision (%)
    recallR4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    Rrp4 = recallR4 * 100            # per-class recall (%)
    Rrt4 = recallR4.mean() * 100     # macro-averaged recall (%)
    Rfp4 = sklearn.metrics.f1_score(testgold, testclas, average=None)  # per-class F1
    Rft4 = Rfp4.mean() * 100         # macro-averaged F1 (%)
    '''
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
    SGDClassifier_classifier.train(training_set)
    SGDClassifier_classifierRodada4 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
    print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada4*100)

    SVC_classifier = SklearnClassifier(SVC())
    SVC_classifier.train(training_set)
    SVC_classifierRodada4 = nltk.classify.accuracy(SVC_classifier, testing_set)
    print("SVC_classifier accuracy percent:", SVC_classifierRodada4*100)
    '''
    LinearSVC_classifier = SklearnClassifier(LinearSVC())
    LinearSVC_classifier.train(training_set)
    testclas = LinearSVC_classifier.classify_many(
        [fs for (fs, l) in testing_set])
    testgold = [l for (fs, l) in testing_set]
    Lmc4 = sklearn.metrics.confusion_matrix(testgold, testclas)
    La4 = sklearn.metrics.accuracy_score(testgold, testclas) * 100
    precisaoL4 = sklearn.metrics.precision_score(testgold, testclas, average=None)
    Lpp4 = precisaoL4 * 100          # per-class precision (%)
    Lpt4 = precisaoL4.mean() * 100   # macro-averaged precision (%)
    recallL4 = sklearn.metrics.recall_score(testgold, testclas, average=None)
    Lrp4 = recallL4 * 100            # per-class recall (%)
    Lrt4 = recallL4.mean() * 100     # macro-averaged recall (%)
    Lfp4 = sklearn.metrics.f1_score(testgold, testclas, average=None)  # per-class F1
    Lft4 = Lfp4.mean() * 100         # macro-averaged F1 (%)

    ################# Averages

    # Multinomial Naive Bayes
    MNBmc = (MNBmc1 + MNBmc2 + MNBmc3 + MNBmc4) / 4
    MNBa = (MNBa1 + MNBa2 + MNBa3 + MNBa4) / 4
    MNBamax = max([MNBa1, MNBa2, MNBa3, MNBa4])
    MNBamin = min([MNBa1, MNBa2, MNBa3, MNBa4])
    MNBpp = (MNBpp1 + MNBpp2 + MNBpp3 + MNBpp4) / 4
    MNBpt = (MNBpt1 + MNBpt2 + MNBpt3 + MNBpt4) / 4
    MNBpmax = max([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
    MNBpmin = min([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
    MNBrp = (MNBrp1 + MNBrp2 + MNBrp3 + MNBrp4) / 4
    MNBrt = (MNBrt1 + MNBrt2 + MNBrt3 + MNBrt4) / 4
    MNBrmax = max([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
    MNBrmin = min([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
    MNBfp = (MNBfp1 + MNBfp2 + MNBfp3 + MNBfp4) / 4
    MNBft = (MNBft1 + MNBft2 + MNBft3 + MNBft4) / 4
    MNBfmax = max([MNBft1, MNBft2, MNBft3, MNBft4])
    MNBfmin = min([MNBft1, MNBft2, MNBft3, MNBft4])
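    # A shorter equivalent of the summary above with numpy (a sketch only;
    # it assumes numpy is imported as np, which is not visible in this snippet):
    #
    #   accs = np.array([MNBa1, MNBa2, MNBa3, MNBa4])
    #   MNBa, MNBamax, MNBamin = accs.mean(), accs.max(), accs.min()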
    '''
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.set_aspect('equal')
    plt.imshow(MNBmc, interpolation='nearest', cmap=plt.cm.ocean)
    plt.colorbar()
    plt.show()
    '''

    # Logistic regression
    Rmc = (Rmc1 + Rmc2 + Rmc3 + Rmc4) / 4
    Ra = (Ra1 + Ra2 + Ra3 + Ra4) / 4
    Ramax = max([Ra1, Ra2, Ra3, Ra4])
    Ramin = min([Ra1, Ra2, Ra3, Ra4])
    Rpp = (Rpp1 + Rpp2 + Rpp3 + Rpp4) / 4
    Rpt = (Rpt1 + Rpt2 + Rpt3 + Rpt4) / 4
    Rpmax = max([Rpt1, Rpt2, Rpt3, Rpt4])
    Rpmin = min([Rpt1, Rpt2, Rpt3, Rpt4])
    Rrp = (Rrp1 + Rrp2 + Rrp3 + Rrp4) / 4
    Rrt = (Rrt1 + Rrt2 + Rrt3 + Rrt4) / 4
    Rrmax = max([Rrt1, Rrt2, Rrt3, Rrt4])
    Rrmin = min([Rrt1, Rrt2, Rrt3, Rrt4])
    Rfp = (Rfp1 + Rfp2 + Rfp3 + Rfp4) / 4
    Rft = (Rft1 + Rft2 + Rft3 + Rft4) / 4
    Rfmax = max([Rft1, Rft2, Rft3, Rft4])
    Rfmin = min([Rft1, Rft2, Rft3, Rft4])

    # Linear SVC
    Lmc = (Lmc1 + Lmc2 + Lmc3 + Lmc4) / 4
    La = (La1 + La2 + La3 + La4) / 4
    Lamax = max([La1, La2, La3, La4])
    Lamin = min([La1, La2, La3, La4])
    Lpp = (Lpp1 + Lpp2 + Lpp3 + Lpp4) / 4
    Lpt = (Lpt1 + Lpt2 + Lpt3 + Lpt4) / 4
    Lpmax = max([Lpt1, Lpt2, Lpt3, Lpt4])
    Lpmin = min([Lpt1, Lpt2, Lpt3, Lpt4])
    Lrp = (Lrp1 + Lrp2 + Lrp3 + Lrp4) / 4
    Lrt = (Lrt1 + Lrt2 + Lrt3 + Lrt4) / 4
    Lrmax = max([Lrt1, Lrt2, Lrt3, Lrt4])
    Lrmin = min([Lrt1, Lrt2, Lrt3, Lrt4])
    Lfp = (Lfp1 + Lfp2 + Lfp3 + Lfp4) / 4
    Lft = (Lft1 + Lft2 + Lft3 + Lft4) / 4
    Lfmax = max([Lft1, Lft2, Lft3, Lft4])
    Lfmin = min([Lft1, Lft2, Lft3, Lft4])
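    # For reference, scikit-learn can run this whole procedure directly once the
    # features are vectorized; a sketch only, since this script works on NLTK
    # feature dicts ('X' and 'y' here are hypothetical vectorized inputs):
    #
    #   from sklearn.model_selection import cross_val_score
    #   scores = cross_val_score(LinearSVC(), X, y, cv=4)
    #   print(scores.mean() * 100, scores.max() * 100, scores.min() * 100)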
    '''
    print "Linear SVC"
    print "Confusion matrix: ", Lmc
    print "Accuracy: ", La
    print "Per-class precision: ", Lpp
    print "Total precision: ", Lpt
    print "Per-class recall: ", Lrp
    print "Total recall: ", Lrt
    print "Per-class F-measure: ", Lfp
    print "Total F-measure: ", Lft
    '''

    with open(path, mode='w') as csv_file:
        # Header row shared by the three per-classifier sections of the report
        header = ('Iteração;Acurácia;Precisão parcial;Precisão total;'
                  'revocação parcial;revocação total;'
                  'f-medida parcial;f-medida total\n')

        csv_file.write('Algoritmo;Multinomial Naïve-Bayes\n')
        csv_file.write(header)
        csv_file.write('1;' + str(MNBa1) + ';' + str(MNBpp1) + ';' + str(MNBpt1) +
                       ';' + str(MNBrp1) + ';' + str(MNBrt1) + ';' + str(MNBfp1) +
                       ';' + str(MNBft1) + '\n')
        csv_file.write('2;' + str(MNBa2) + ';' + str(MNBpp2) + ';' + str(MNBpt2) +
                       ';' + str(MNBrp2) + ';' + str(MNBrt2) + ';' + str(MNBfp2) +
                       ';' + str(MNBft2) + '\n')
        csv_file.write('3;' + str(MNBa3) + ';' + str(MNBpp3) + ';' + str(MNBpt3) +
                       ';' + str(MNBrp3) + ';' + str(MNBrt3) + ';' + str(MNBfp3) +
                       ';' + str(MNBft3) + '\n')
        csv_file.write('4;' + str(MNBa4) + ';' + str(MNBpp4) + ';' + str(MNBpt4) +
                       ';' + str(MNBrp4) + ';' + str(MNBrt4) + ';' + str(MNBfp4) +
                       ';' + str(MNBft4) + '\n')
        csv_file.write('==================\n')
        csv_file.write('Total\n')
        csv_file.write('Média;' + str(MNBa) + ';' + str(MNBpp) + ';' + str(MNBpt) +
                       ';' + str(MNBrp) + ';' + str(MNBrt) + ';' + str(MNBfp) +
                       ';' + str(MNBft) + '\n')
        csv_file.write('Máximo;' + str(MNBamax) + ';' + str(MNBpmax) + ';' +
                       str(MNBrmax) + ';' + str(MNBfmax) + '\n')
        csv_file.write('Mínimo;' + str(MNBamin) + ';' + str(MNBpmin) + ';' +
                       str(MNBrmin) + ';' + str(MNBfmin) + '\n')
        csv_file.write('==================\n')

        csv_file.write('Algoritmo;Regressão Logística\n')
        csv_file.write(header)
        csv_file.write('1;' + str(Ra1) + ';' + str(Rpp1) + ';' + str(Rpt1) + ';' +
                       str(Rrp1) + ';' + str(Rrt1) + ';' + str(Rfp1) + ';' +
                       str(Rft1) + '\n')
        csv_file.write('2;' + str(Ra2) + ';' + str(Rpp2) + ';' + str(Rpt2) + ';' +
                       str(Rrp2) + ';' + str(Rrt2) + ';' + str(Rfp2) + ';' +
                       str(Rft2) + '\n')
        csv_file.write('3;' + str(Ra3) + ';' + str(Rpp3) + ';' + str(Rpt3) + ';' +
                       str(Rrp3) + ';' + str(Rrt3) + ';' + str(Rfp3) + ';' +
                       str(Rft3) + '\n')
        csv_file.write('4;' + str(Ra4) + ';' + str(Rpp4) + ';' + str(Rpt4) + ';' +
                       str(Rrp4) + ';' + str(Rrt4) + ';' + str(Rfp4) + ';' +
                       str(Rft4) + '\n')
        csv_file.write('==================\n')
        csv_file.write('Total\n')
        csv_file.write('Média;' + str(Ra) + ';' + str(Rpp) + ';' + str(Rpt) + ';' +
                       str(Rrp) + ';' + str(Rrt) + ';' + str(Rfp) + ';' +
                       str(Rft) + '\n')
        csv_file.write('Máximo;' + str(Ramax) + ';' + str(Rpmax) + ';' +
                       str(Rrmax) + ';' + str(Rfmax) + '\n')
        csv_file.write('Mínimo;' + str(Ramin) + ';' + str(Rpmin) + ';' +
                       str(Rrmin) + ';' + str(Rfmin) + '\n')
        csv_file.write('==================\n')

        csv_file.write('Algoritmo;SVC Linear\n')
        csv_file.write(header)
        csv_file.write('1;' + str(La1) + ';' + str(Lpp1) + ';' + str(Lpt1) + ';' +
                       str(Lrp1) + ';' + str(Lrt1) + ';' + str(Lfp1) + ';' +
                       str(Lft1) + '\n')
        csv_file.write('2;' + str(La2) + ';' + str(Lpp2) + ';' + str(Lpt2) + ';' +
                       str(Lrp2) + ';' + str(Lrt2) + ';' + str(Lfp2) + ';' +
                       str(Lft2) + '\n')
        csv_file.write('3;' + str(La3) + ';' + str(Lpp3) + ';' + str(Lpt3) + ';' +
                       str(Lrp3) + ';' + str(Lrt3) + ';' + str(Lfp3) + ';' +
                       str(Lft3) + '\n')
        csv_file.write('4;' + str(La4) + ';' + str(Lpp4) + ';' + str(Lpt4) + ';' +
                       str(Lrp4) + ';' + str(Lrt4) + ';' + str(Lfp4) + ';' +
                       str(Lft4) + '\n')
        csv_file.write('==================\n')
        csv_file.write('Total\n')
        csv_file.write('Média;' + str(La) + ';' + str(Lpp) + ';' + str(Lpt) + ';' +
                       str(Lrp) + ';' + str(Lrt) + ';' + str(Lfp) + ';' +
                       str(Lft) + '\n')
        csv_file.write('Máximo;' + str(Lamax) + ';' + str(Lpmax) + ';' +
                       str(Lrmax) + ';' + str(Lfmax) + '\n')
        csv_file.write('Mínimo;' + str(Lamin) + ';' + str(Lpmin) + ';' +
                       str(Lrmin) + ';' + str(Lfmin) + '\n')
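        # The same report could be written with the standard-library csv module,
        # which handles delimiters and quoting automatically; a minimal sketch:
        #
        #   import csv
        #   writer = csv.writer(csv_file, delimiter=';')
        #   writer.writerow(['Algoritmo', 'Multinomial Naïve-Bayes'])
        #   writer.writerow(['1', MNBa1, MNBpp1, MNBpt1, MNBrp1, MNBrt1,
        #                    MNBfp1, MNBft1])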