def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # Prepare the data: an array of (condition, observation) tuples,
    # here of the form (tag_(i), tag_(i+1)).
    # Don't forget to add the start symbol <s> and the end symbol </s>.
    data = []
    for s in train_data:
        data.append(("<s>", s[0][1]))
        for i in range(len(s) - 1):
            data.append((s[i][1], s[i + 1][1]))
        data.append((s[len(s) - 1][1], "</s>"))

    # Compute the transition model with Lidstone smoothing (gamma=0.01);
    # f.B() + 1 reserves one extra bin for unseen transitions.
    transition_FD = ConditionalFreqDist(data)
    estimator = lambda f: nltk.LidstoneProbDist(f, 0.01, f.B() + 1)
    self.transition_PD = ConditionalProbDist(transition_FD, estimator)
    return self.transition_PD
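# A minimal usage sketch, assuming a hypothetical HMM wrapper class that
# exposes transition_model() and Brown-style tagged training data:
#
# import nltk
# from nltk.corpus import brown
#
# train = brown.tagged_sents(tagset="universal")[:1000]
# hmm = HMM()  # hypothetical class holding transition_PD
# trans = hmm.transition_model(train)
#
# print(trans["DET"].prob("NOUN"))  # P(next tag is NOUN | current tag DET)
# print(trans["DET"].prob("X"))     # unseen transition: small but nonzero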
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare data as (tag, word) pairs. Lowercase the observation,
    # otherwise it mismatches the test data. Do NOT add <s> or </s>
    # to the input sentences.
    data = []
    for s in train_data:
        for (word, tag) in s:
            data.append((tag, word.lower()))

    # Compute the emission model with Lidstone smoothing.
    emission_FD = nltk.ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.emission_PD = nltk.ConditionalProbDist(emission_FD, lidstone_estimator)
    self.states = list(set(tag for (tag, word) in data))
    return self.emission_PD, self.states
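# A minimal sketch, assuming the same hypothetical HMM wrapper as above:
#
# emis, states = hmm.emission_model(train)
# print(states)                        # e.g. ['NOUN', 'VERB', ...]
# print(emis["NOUN"].prob("dog"))      # P(word | tag), smoothed
# print(emis["NOUN"].prob("zyzzyva"))  # unseen word still gets some mass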
def transition_model(self, train_data):
    """
    Compute a transition model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The transition probability distribution
    :rtype: ConditionalProbDist
    """
    # Prepare the data: an array of (condition, observation) tuples,
    # here of the form (tag_(i), tag_(i+1)).
    # Don't forget to add the start symbol <s> and the end symbol </s>.
    padded_data = []
    for s in train_data:
        padded_data.append([('<s>', '<s>')] + s + [('</s>', '</s>')])

    # Lazily generate the tag bigrams of each padded sentence, then
    # flatten them into one list (requires `import itertools`).
    tagGenerators = (((s[i][1], s[i + 1][1]) for i in range(len(s) - 1))
                     for s in padded_data)
    data = list(itertools.chain.from_iterable(tagGenerators))

    # Compute the transition model with Lidstone smoothing.
    transition_FD = nltk.ConditionalFreqDist(data)
    lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)
    self.transition_PD = nltk.ConditionalProbDist(transition_FD, lidstone_estimator)
    return self.transition_PD
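# A quick check on toy data that padding + bigrams yields the same
# (tag_i, tag_i+1) pairs as the explicit loop in the first variant:
sent = [("the", "DET"), ("dog", "NOUN"), ("barks", "VERB")]
padded = [("<s>", "<s>")] + sent + [("</s>", "</s>")]
pairs = [(padded[i][1], padded[i + 1][1]) for i in range(len(padded) - 1)]
print(pairs)
# [('<s>', 'DET'), ('DET', 'NOUN'), ('NOUN', 'VERB'), ('VERB', '</s>')]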
def train_word_lm_lidstone(dataset, n=2, gamma=0.01):
    # fd.B() + 100 reserves extra bins for out-of-vocabulary words.
    lidstone_estimator = lambda fd: nltk.LidstoneProbDist(fd, gamma, fd.B() + 100)
    model = NgramModel(n, dataset, smoothing=True, estimator=lidstone_estimator)
    return model
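# NgramModel was dropped from NLTK 3; a rough modern equivalent can be
# sketched with the nltk.lm package (the function name below is an
# illustrative stand-in, not from the original code):
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline

def train_word_lm_lidstone_v3(dataset, n=2, gamma=0.01):
    # dataset: list of tokenized sentences (list of lists of words)
    train_ngrams, vocab = padded_everygram_pipeline(n, dataset)
    model = Lidstone(gamma, n)
    model.fit(train_ngrams, vocab)
    return model

lm = train_word_lm_lidstone_v3([["a", "b", "a"], ["b", "a"]])
print(lm.score("a", ["b"]))  # smoothed P(a | b)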
def getLidstoneTag(self, tagged_corpus):
    """
    Convert the tag n-gram frequency distribution built from the training
    corpus into a Lidstone estimate using the nltk.probability module
    (gamma=1 with bins=None is Laplace smoothing). The estimate is used
    to decide which tag to assign given the tag context observed in the
    test corpus.
    """
    fd = self.getNGramTagFD(tagged_corpus)
    td_ngram = nltk.LidstoneProbDist(fd, self._gamma, bins=self._bins)
    return td_ngram
def generate(self, length=100):
    """Generate text from a randomly-ordered n-gram model."""
    # Re-seed the token stream with a randomly chosen word.
    self.tokens = nltk.word_tokenize(
        self.__words[randint(1, len(self.__words)) - 1])
    # A fresh random Lidstone gamma in (0, 1); the bins argument is ignored.
    estimator = lambda fdist, bins: nltk.LidstoneProbDist(
        fdist, self.__random.random())
    #estimator = lambda fdist, bins: nltk.LidstoneProbDist(fdist, 0.2)
    # NB: nltk.NgramModel only exists in old NLTK 2.x releases.
    self._trigram_model = nltk.NgramModel(self.__random.randint(3, 15),
                                          self, estimator)
    #self._trigram_model = nltk.NgramModel(3, self, estimator)
    text = self._trigram_model.generate(length)
    return nltk.tokenwrap(text)
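# On NLTK 3.x the same idea can be sketched with nltk.lm (the sentences
# below are illustrative toy data, not from the original class):
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline

sents = [["the", "cat", "sat"], ["the", "dog", "ran"]]
train, vocab = padded_everygram_pipeline(3, sents)
lm = Lidstone(0.2, 3)
lm.fit(train, vocab)
print(" ".join(lm.generate(10, random_seed=42)))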
def train():
    print('Training HMM...')
    # Use the first 1000 sentences from the 'news' category of the Brown corpus
    labelled_sequences, states, symbols = get_pos_data(1000)

    # Define the estimator to be used for probability computation
    estimator = lambda fd, bins: nltk.LidstoneProbDist(fd, 0.1, bins)

    # Count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state.
    # (The original used the NLTK 2.x FreqDist.inc(); on NLTK 3 the
    # equivalent is `fd[key] += 1`, as below.)
    freq_starts = nltk.FreqDist()
    freq_transitions = nltk.ConditionalFreqDist()
    freq_emissions = nltk.ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts is None:
                freq_starts[state] += 1
            else:
                freq_transitions[lasts][state] += 1
            freq_emissions[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in states:
                states.append(state)
            if symbol not in symbols:
                symbols.append(symbol)

    # Create probability distributions (with smoothing)
    N = len(states)
    starts = estimator(freq_starts, N)
    transitions = nltk.ConditionalProbDist(freq_transitions, estimator, N)
    emissions = nltk.ConditionalProbDist(freq_emissions, estimator, len(symbols))

    # Return the transition and emission probabilities along with
    # the list of all the states and output symbols
    return starts, transitions, emissions, states, symbols
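# These five return values plug straight into NLTK's HMM tagger; a
# minimal sketch (get_pos_data and train() assumed from above):
starts, transitions, emissions, states, symbols = train()
tagger = nltk.tag.HiddenMarkovModelTagger(
    symbols, states, transitions, emissions, starts)
print(tagger.tag(["the", "dog", "barks"]))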
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Prepare data; lowercase the observation, otherwise it mismatches
    # the test data.
    data = []
    for sent in train_data:
        data += [(tag, word.lower()) for (word, tag) in sent]

    # Compute the emission model with Lidstone smoothing.
    emission_FD = ConditionalFreqDist(data)
    estimator = lambda f: nltk.LidstoneProbDist(f, 0.01, f.B() + 1)
    self.emission_PD = ConditionalProbDist(emission_FD, estimator)
    self.states = list(self.emission_PD.keys())
    return self.emission_PD, self.states
def word_suavizacao(all_words):
    # Smooth the word frequency distribution with Lidstone (gamma=0.01).
    # NB: samples() only returns words seen in training, so the smoothing
    # does not change the returned list.
    # all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(all_words))
    all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.01)
    return list(all_words.samples())
def LidProDist(freqDist):
    # Lidstone with gamma=0.01 and one extra bin for unseen events.
    return nltk.LidstoneProbDist(freqDist, 0.01, freqDist.B() + 1)
def LidstoneProbDistFactory(freqdist):
    return nltk.LidstoneProbDist(freqdist, .01, freqdist.B() + 1)
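# All of these factories implement the same estimate: with count c(w),
# total count N, smoothing constant gamma and B bins, Lidstone assigns
# P(w) = (c(w) + gamma) / (N + gamma * B). A quick numeric check on toy data:
import nltk

fd = nltk.FreqDist(["a", "a", "b"])               # c(a)=2, c(b)=1, N=3
pd = nltk.LidstoneProbDist(fd, 0.01, fd.B() + 1)  # B = 2 + 1 = 3

assert abs(pd.prob("a") - (2 + 0.01) / (3 + 0.01 * 3)) < 1e-12
print(pd.prob("a"), pd.prob("c"))  # unseen "c" gets 0.01 / 3.03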
# Simplify the Brown tags sentence-by-sentence.
# (nltk.tag.simplify_tag, FreqDist.inc and nltk.clean_html are NLTK 2.x
# APIs; this script will not run unchanged on NLTK 3.)
for sent in brown_tags:
    sub_temp = []
    for tag in sent:
        try:
            sub_temp.append(nltk.tag.simplify_tag(tag))
        except IndexError:
            pass
    temp.append(sub_temp)

# Build a trigram frequency distribution over the simplified tags.
fd = nltk.FreqDist()
for sent in temp:
    if len(sent) > 2:
        temp_tri = nltk.trigrams(sent)
        for entry in temp_tri:
            fd.inc(entry)

# Laplace-smoothed trigram distribution (Lidstone with gamma=1) and its
# average sample probability.
tfd = nltk.LidstoneProbDist(fd, gamma=1)
tfd_avg = sum(tfd.prob(sample) for sample in tfd.samples()) / float(len(tfd.samples()))

# Fetch and normalise the page text (on NLTK 3, clean_html was removed;
# BeautifulSoup is the usual replacement).
raw = urlopen(url).read()
raw = nltk.clean_html(raw)
raw = raw.replace('\r\n', '')
raw = raw.replace('\n', '')
raw = raw.replace('\\', '')
raw = raw.lower()
for key in contraction_dict.keys():
    raw = raw.replace(key, contraction_dict[key])

sents_tok = nltk.sent_tokenize(raw)
matchVec = []
for sent in sents_tok:
def makeNGrams(self, k=0, print_out=True):
    # Materialise the n-grams so the FreqDist doesn't exhaust a generator.
    self.grams = list(nltk.ngrams(self.corpus, self.n_gram))
    gfreq = nltk.FreqDist(self.grams)
    self.pdf = nltk.LidstoneProbDist(gfreq, k)
    if print_out:
        print("N-grams ready!")
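# A toy usage sketch (class scaffolding assumed): bigrams over a tiny
# corpus, then smoothed probability lookups. Note k=0 above degenerates
# to MLE, so unseen n-grams only get mass when k > 0.
import nltk

corpus = "the cat sat on the mat".split()
grams = list(nltk.ngrams(corpus, 2))
pdf = nltk.LidstoneProbDist(nltk.FreqDist(grams), 0.5)
print(pdf.prob(("the", "cat")))   # seen bigram
print(pdf.prob(("cat", "flew")))  # unseen, still nonzero with k > 0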
def executar(experimento, nome_Base, acento):
    nomeBase = nome_Base
    path = experimento + nomeBase
    # print('running:\n' + path)
    # print('Accents stripped:\n' + ('Yes' if acento else 'No'))
    base = readBase(nomeBase)

    # Tokenise every document, optionally stripping accents, lowercase,
    # and drop punctuation tokens.
    tknzr = nltk.tokenize.TweetTokenizer()
    documents = []
    for texto, classe in base:
        if acento:
            w = [q.lower() for q in remocaoacento(tknzr.tokenize(texto))]
        else:
            w = [q.lower() for q in tknzr.tokenize(texto)]
        documents.append((remocaopontos(w), classe))

    ################################ Preprocessing
    stopwords = nltk.corpus.stopwords.words('portuguese')
    stemmer = nltk.stem.RSLPStemmer()
    # (A whole-corpus stemming pass was kept commented out in the original.)

    random.shuffle(documents)

    # Collect the vocabulary minus stopwords and smooth its frequency
    # distribution with Lidstone (gamma=0.1); Laplace, Good-Turing,
    # Witten-Bell and MLE estimators were tried and left commented out
    # in the original.
    all_words = [w.lower() for doc, _ in documents for w in doc
                 if w not in stopwords]
    all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1)
    word_features = list(all_words.samples())
    # (An earlier word-membership find_features() was kept commented out here.)

    def wordbigram(word_feature):
        # Stemmed bigrams over the kept tokens of one document.
        return [(stemmer.stem(word_feature[i]), stemmer.stem(word_feature[i + 1]))
                for i in range(len(word_feature) - 1)]

    def removerpalavras(todas_palavras, document):
        # Keep only the tokens present in the global vocabulary.
        return [w for w in document if w in todas_palavras]

    def wordFeature(documents):
        # Build the list of unique bigrams over all documents.
        dicionario = []
        for doc, _ in documents:
            for q in doc:
                if q not in dicionario:
                    dicionario.append(q)
        return dicionario

    documents = [[removerpalavras(all_words.samples(), doc), classe]
                 for doc, classe in documents]
    documents = [[wordbigram(doc), classe] for doc, classe in documents]
    word_features = wordFeature(documents)

    def find_features(document):
        # One boolean feature per known bigram, keyed by its index.
        return {str(i): (feat in document)
                for i, feat in enumerate(word_features)}

    featuresets = [(find_features(rev), category) for (rev, category) in documents]

    # 4-fold split: three folds of size tamT // 4, the last takes the remainder.
    kfold = 4
    tamT = len(featuresets)
    divisao = tamT // kfold
    folds = [featuresets[i * divisao:(i + 1) * divisao] for i in range(kfold - 1)]
    folds.append(featuresets[(kfold - 1) * divisao:tamT])

    def avaliar(classificador, training_set, testing_set):
        # Train one classifier and collect the confusion matrix, accuracy
        # and per-class / averaged precision, recall and F1. This helper
        # replaces the copy-pasted while-loops of the original (which also
        # reused the round-2 recall values in round 3 by mistake).
        clf = SklearnClassifier(classificador)
        clf.train(training_set)
        testclas = clf.classify_many([fs for (fs, l) in testing_set])
        testgold = [l for (fs, l) in testing_set]
        precisao = sklearn.metrics.precision_score(testgold, testclas, average=None)
        recall = sklearn.metrics.recall_score(testgold, testclas, average=None)
        f1 = sklearn.metrics.f1_score(testgold, testclas, average=None)
        return {
            'mc': sklearn.metrics.confusion_matrix(testgold, testclas),
            'a': sklearn.metrics.accuracy_score(testgold, testclas) * 100,
            'pp': precisao * 100,
            'pt': (sum(precisao) / len(precisao)) * 100,
            'rp': recall * 100,
            'rt': (sum(recall) / len(recall)) * 100,
            'fp': f1,
            'ft': (sum(f1) / len(f1)) * 100,
        }

    algoritmos = [
        ('Multinominal Naïve-Bayes', MultinomialNB),
        ('Regressão Linear', LogisticRegression),
        ('SVC Linear', LinearSVC),
    ]

    # One round per fold: train on the other three folds, test on the
    # held-out one. (SGDClassifier, SVC and BernoulliNB runs were kept
    # commented out in the original.)
    resultados = {nome: [] for nome, _ in algoritmos}
    for i in range(kfold):
        training_set = [fs for j, fold in enumerate(folds) if j != i for fs in fold]
        testing_set = folds[i]
        for nome, Classificador in algoritmos:
            resultados[nome].append(avaliar(Classificador(), training_set, testing_set))

    def agregados(rodadas):
        # Mean / max / min over the four rounds. (The original averaged
        # the round-4 partial precision four times; fixed here.)
        return {
            'mc': sum(r['mc'] for r in rodadas) / 4,
            'a': sum(r['a'] for r in rodadas) / 4,
            'amax': max(r['a'] for r in rodadas),
            'amin': min(r['a'] for r in rodadas),
            'pp': sum(r['pp'] for r in rodadas) / 4,
            'pt': sum(r['pt'] for r in rodadas) / 4,
            'pmax': max(r['pt'] for r in rodadas),
            'pmin': min(r['pt'] for r in rodadas),
            'rp': sum(r['rp'] for r in rodadas) / 4,
            'rt': sum(r['rt'] for r in rodadas) / 4,
            'rmax': max(r['rt'] for r in rodadas),
            'rmin': min(r['rt'] for r in rodadas),
            'fp': sum(r['fp'] for r in rodadas) / 4,
            'ft': sum(r['ft'] for r in rodadas) / 4,
            'fmax': max(r['ft'] for r in rodadas),
            'fmin': min(r['ft'] for r in rodadas),
        }

    medias = {nome: agregados(rodadas) for nome, rodadas in resultados.items()}
    print(experimento + ':' + '\t'.join(
        str(medias[nome]['a']) for nome, _ in algoritmos))

    # Write one semicolon-separated block per algorithm, preserving the
    # original report layout.
    with open(path, mode='w') as csv_file:
        for idx, (nome, _) in enumerate(algoritmos):
            m = medias[nome]
            csv_file.writelines('Algoritmo' + ';' + nome + '\n')
            csv_file.writelines('Iteração;Acurácia;Precisão parcial;Precisão total;'
                                'revocação parcial;revocação total;'
                                'f-medida parcial;f-medida total\n')
            for r_idx, r in enumerate(resultados[nome], start=1):
                csv_file.writelines('%d;%s;%s;%s;%s;%s;%s;%s\n' % (
                    r_idx, r['a'], r['pp'], r['pt'], r['rp'], r['rt'],
                    r['fp'], r['ft']))
            csv_file.writelines('==================\n')
            csv_file.writelines('Total\n')
            csv_file.writelines('Média;%s;%s;%s;%s;%s;%s;%s\n' % (
                m['a'], m['pp'], m['pt'], m['rp'], m['rt'], m['fp'], m['ft']))
            csv_file.writelines('Máximo;%s;%s;%s;%s\n' % (
                m['amax'], m['pmax'], m['rmax'], m['fmax']))
            csv_file.writelines('Mínimo;%s;%s;%s;%s\n' % (
                m['amin'], m['pmin'], m['rmin'], m['fmin']))
            if idx < len(algoritmos) - 1:
                csv_file.writelines('==================\n')
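# The manual 4-fold loop above can be sketched more compactly with
# scikit-learn's own cross-validation (illustrative only; X and y are
# assumed to be the vectorised feature matrix and labels):
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

scores = cross_validate(
    MultinomialNB(), X, y, cv=4,
    scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'))
print(scores['test_accuracy'].mean() * 100)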
def executar(experimento, nome_Base, acento): ''' nomeBase = nome_Base path = experimento+nomeBase print('executando:\n'+path) print('Sem acento:\n'+('Sim' if(acento) else 'Não')) base = readBase(nomeBase) tamBase = len(base) ''' base = [ (' Vejam bem, maioria dos homens apoiam. maioria de mulheres não. Homens tendem a serem lógicos, mulheres emotivas. Sera que mulheres acham que rodeios é judiação de animais? O que é claramente equivocado da parte delas. Como também isso se repete na PL3722, homens pensam logicamente, e são mais favorável ao cidadão ter menos restrições a armas de fogo para proteger seu patrimônio e sua família. Ja as mães, familiares, namoradas(os), principalmente de bandidos, acham isso um terror, por que qual mãe quer ver o filho bandido morto praticando um assalto? Logo são contra o direito das vitimas de si proteger do seu filho bandido. Vote consciente, vote com razão e não emoção! Brasil melhora rapidinho. ', 1), ('Observando daquí, a debandada dos derrotados. Cadê o Sacoman Keffeyo, a Ana Animais e, o pilantra do Haroldo Girafales? Perderam a coragem de virem aquí na enquete, questionar a sanção do PL? Corvardões perdedores: vão chorar no colinho da Luisa Mell. ', 1), ('Dezenas de debates e ficou mais que provado que animais atletas nao sao animais maus tratados,parabens capitao augusto. ', 1), ('PARABÉNS CAPITÃO AUGUSTO ISSO PROVA QUE QUEM AMA CUIDA,ANIMAIS TRATADOS COM MUITO CARINHO ', 1), ('Parabéns Capitão Augusto,agora é lei.', 1) ] tamBase = len(base) i = 0 documents = [] #print base[0][0].split() tknzr = nltk.tokenize.TweetTokenizer() while (i < tamBase): if (acento): w = remocaoacento(tknzr.tokenize(base[i][0])) else: w = tknzr.tokenize(base[i][0]) w = remocaopontos(w) conteudoLista = (w, base[i][1]) documents.append(conteudoLista) i += 1 ################################ Pre Processamento stopwords = nltk.corpus.stopwords.words('portuguese') # stopwords = ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', # 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', # 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', # 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', # 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', # 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', # 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', # 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', # 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', # 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', # 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', # 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', # 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', # 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', # 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', # 'tinha', 'tínhamos', 
'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', # 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', # 'teremos', 'terão', 'teria', 'teríamos', 'teriam'] # stemmer = nltk.stem.RSLPStemmer() # h=0 # j=len(documents) # while (h<j): # g=len(documents[h][0]) # f=0 # while(f<g): # stemmer.stem(documents[h][0][f]) # f+=1 # h += 1 ################################ random.shuffle(documents) all_words = [] k = 0 l = len(documents) while (k < l): m = len(documents[k][0]) n = 0 while (n < m): all_words.append(documents[k][0][n]) n += 1 k += 1 # all_words = remocaopontos(all_words) all_words = [w.lower() for w in all_words if w not in stopwords] # print(str(all_words)) #all_words = nltk.FreqDist(all_words) #calcula frequencia de palavras, definir o limite de palavras #all_words = nltk.LaplaceProbDist(nltk.FreqDist(all_words)) #all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(all_words)) #all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1) #all_words = nltk.WittenBellProbDist(nltk.FreqDist(all_words)) #nltk.WittenBellProbDist() procurar como mudar o ngram #all_words = nltk.MLEProbDist(nltk.FreqDist(all_words)) #all_words = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(w.lower() for w in all_words if w not in stopwords)) all_words = nltk.LidstoneProbDist(nltk.FreqDist(all_words), 0.1) #all_words = nltk.FreqDist(nltk.FreqDist(w.lower() for w in all_words if w not in stopwords)) word_features = list(all_words.samples( )) #se usando FreqDistlista com palavras que aparecem mais de 3000 #word_features = list(all_words.keys()) '''aqui que modifiquei def find_features(document): words = set(document) features = {} for w in word_features: features[w] = (w in words) return features ''' #aquii def wordbig(word_feature): words = [] i = 0 l = len(word_feature) - 1 while (i < l): words.append(tuple([word_feature[i], word_feature[i + 1]])) i += 1 return words def removerpalavras(todas_palavras, document): #remover as palavras que não estãoem todas as palavras linha = [] for w in document: if (w in todas_palavras): linha.append(w) return linha def wordFeature(documents): #cria um dicionario de dados dicionario = [] for w in documents: for q in w[0]: if (not q in dicionario): dicionario.append(q) return dicionario documents = [[removerpalavras(all_words.samples(), w[0]), w[1]] for w in documents] documents = [[wordbig(w[0]), w[1]] for w in documents] word_features = wordFeature( documents ) #se 0usando FreqDistlista com palavras que aparecem mais de 3000 # print(str(len(word_features))) # exit() # word_features = list(all_words.samples())#se 0usando FreqDistlista com palavras que aparecem mais de 3000 def find_features(document): # words = set(document) features = {} i = 0 l = len(word_features) while (i < l): features[str(word_features[i])] = (word_features[i] in document) i += 1 print(str(document)) print() print(str(features)) exit() return features featuresets = [(find_features(rev), category) for (rev, category) in documents] kfold = 4 baseInteira = featuresets tamT = len(featuresets) divisao = tamT // kfold ###### ajustar divisao baseDividida1 = featuresets[0:divisao] baseDividida2 = featuresets[divisao:(divisao * 2)] baseDividida3 = featuresets[(divisao * 2):(divisao * 3)] baseDividida4 = featuresets[(divisao * 3):tamT] #tamT = len(featuresets) #umQuarto = tamBase/4 #training_set = featuresets[umQuarto:] #testing_set = featuresets[:umQuarto] #training_set = featuresets[100:] #testing_set = featuresets[0:100] 
########################## 1 rodada #print "## RODADA 1 ##" training_set = baseDividida2 + baseDividida3 + baseDividida4 testing_set = baseDividida1 MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] MNBmc1 = sklearn.metrics.confusion_matrix(testgold, testclas) MNBa1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 MNBpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoMNB1 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPMNB1 = 0 while (g < len(precisaoMNB1)): somaPMNB1 = somaPMNB1 + precisaoMNB1[g] g = g + 1 MNBpt1 = (somaPMNB1 / len(precisaoMNB1)) * 100 MNBrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallMNB1 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRMNB1 = 0 while (g < len(recallMNB1)): somaRMNB1 = somaRMNB1 + recallMNB1[g] g = g + 1 MNBrt1 = (somaRMNB1 / len(recallMNB1)) * 100 MNBfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1MNB1 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFMNB1 = 0 while (g < len(f1MNB1)): somaFMNB1 = somaFMNB1 + f1MNB1[g] g = g + 1 MNBft1 = (somaFMNB1 / len(f1MNB1)) * 100 ''' BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100) ''' LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) testclas = LogisticRegression_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Rmc1 = sklearn.metrics.confusion_matrix(testgold, testclas) Ra1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Rpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoR1 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPR1 = 0 while (g < len(precisaoR1)): somaPR1 = somaPR1 + precisaoR1[g] g = g + 1 Rpt1 = (somaPR1 / len(precisaoR1)) * 100 Rrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallR1 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRR1 = 0 while (g < len(recallR1)): somaRR1 = somaRR1 + recallR1[g] g = g + 1 Rrt1 = (somaRR1 / len(recallR1)) * 100 Rfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1R1 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFR1 = 0 while (g < len(f1R1)): somaFR1 = somaFR1 + f1R1[g] g = g + 1 Rft1 = (somaFR1 / len(f1R1)) * 100 ''' SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set) print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set) print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100) ''' LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) testclas = LinearSVC_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for 
(fs, l) in testing_set] Lmc1 = sklearn.metrics.confusion_matrix(testgold, testclas) La1 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Lpp1 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoL1 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPL1 = 0 while (g < len(precisaoL1)): somaPL1 = somaPL1 + precisaoL1[g] g = g + 1 Lpt1 = (somaPL1 / len(precisaoL1)) * 100 Lrp1 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallL1 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRL1 = 0 while (g < len(recallL1)): somaRL1 = somaRL1 + recallL1[g] g = g + 1 Lrt1 = (somaRL1 / len(recallL1)) * 100 Lfp1 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1L1 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFL1 = 0 while (g < len(f1L1)): somaFL1 = somaFL1 + f1L1[g] g = g + 1 Lft1 = (somaFL1 / len(f1L1)) * 100 ######################## Rodada 2 #print "## RODADA 2 ##" training_set = baseDividida1 + baseDividida3 + baseDividida4 testing_set = baseDividida2 MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] MNBmc2 = sklearn.metrics.confusion_matrix(testgold, testclas) MNBa2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 MNBpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoMNB2 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPMNB2 = 0 while (g < len(precisaoMNB2)): somaPMNB2 = somaPMNB2 + precisaoMNB2[g] g = g + 1 MNBpt2 = (somaPMNB2 / len(precisaoMNB2)) * 100 MNBrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallMNB2 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRMNB2 = 0 while (g < len(recallMNB2)): somaRMNB2 = somaRMNB2 + recallMNB2[g] g = g + 1 MNBrt2 = (somaRMNB2 / len(recallMNB2)) * 100 MNBfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1MNB2 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFMNB2 = 0 while (g < len(f1MNB2)): somaFMNB2 = somaFMNB2 + f1MNB2[g] g = g + 1 MNBft2 = (somaFMNB2 / len(f1MNB2)) * 100 ''' BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100) ''' LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) testclas = LogisticRegression_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Rmc2 = sklearn.metrics.confusion_matrix(testgold, testclas) Ra2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Rpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoR2 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPR2 = 0 while (g < len(precisaoR2)): somaPR2 = somaPR2 + precisaoR2[g] g = g + 1 Rpt2 = (somaPR2 / len(precisaoR2)) * 100 Rrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallR2 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRR2 = 0 while (g < len(recallR2)): somaRR2 = somaRR2 + recallR2[g] g = g + 1 Rrt2 = (somaRR2 / len(recallR2)) * 100 
Rfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1R2 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFR2 = 0 while (g < len(f1R2)): somaFR2 = somaFR2 + f1R2[g] g = g + 1 Rft2 = (somaFR2 / len(f1R2)) * 100 ''' SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set) print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set) print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100) ''' LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) testclas = LinearSVC_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Lmc2 = sklearn.metrics.confusion_matrix(testgold, testclas) La2 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Lpp2 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoL2 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPL2 = 0 while (g < len(precisaoL2)): somaPL2 = somaPL2 + precisaoL2[g] g = g + 1 Lpt2 = (somaPL2 / len(precisaoL2)) * 100 Lrp2 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallL2 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRL2 = 0 while (g < len(recallL2)): somaRL2 = somaRL2 + recallL2[g] g = g + 1 Lrt2 = (somaRL2 / len(recallL2)) * 100 Lfp2 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1L2 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFL2 = 0 while (g < len(f1L2)): somaFL2 = somaFL2 + f1L2[g] g = g + 1 Lft2 = (somaFL2 / len(f1L2)) * 100 ##################### rodada 3 #print "## RODADA 3 ##" training_set = baseDividida1 + baseDividida2 + baseDividida4 testing_set = baseDividida3 MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] MNBmc3 = sklearn.metrics.confusion_matrix(testgold, testclas) MNBa3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 MNBpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoMNB3 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPMNB3 = 0 while (g < len(precisaoMNB3)): somaPMNB3 = somaPMNB3 + precisaoMNB3[g] g = g + 1 MNBpt3 = (somaPMNB3 / len(precisaoMNB3)) * 100 MNBrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallMNB3 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRMNB3 = 0 while (g < len(recallMNB3)): somaRMNB3 = somaRMNB3 + recallMNB3[g] g = g + 1 MNBrt3 = (somaRMNB3 / len(recallMNB3)) * 100 MNBfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1MNB3 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFMNB3 = 0 while (g < len(f1MNB3)): somaFMNB3 = somaFMNB3 + f1MNB3[g] g = g + 1 MNBft3 = (somaFMNB3 / len(f1MNB3)) * 100 ''' BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) print("BernoulliNB_classifier accuracy percent:", 
BernoulliNB_classifierRodada2*100) ''' LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) testclas = LogisticRegression_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Rmc3 = sklearn.metrics.confusion_matrix(testgold, testclas) Ra3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Rpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoR3 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPR3 = 0 while (g < len(precisaoR3)): somaPR3 = somaPR3 + precisaoR3[g] g = g + 1 Rpt3 = (somaPR3 / len(precisaoR3)) * 100 Rrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallR3 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRR3 = 0 while (g < len(recallR3)): somaRR3 = somaRR3 + recallR3[g] g = g + 1 Rrt3 = (somaRR3 / len(recallR3)) * 100 Rfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1R3 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFR3 = 0 while (g < len(f1R3)): somaFR3 = somaFR3 + f1R3[g] g = g + 1 Rft3 = (somaFR3 / len(f1R3)) * 100 ''' SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set) print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set) print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100) ''' LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) testclas = LinearSVC_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Lmc3 = sklearn.metrics.confusion_matrix(testgold, testclas) La3 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Lpp3 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoL3 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPL3 = 0 while (g < len(precisaoL3)): somaPL3 = somaPL3 + precisaoL3[g] g = g + 1 Lpt3 = (somaPL3 / len(precisaoL3)) * 100 Lrp3 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallL3 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRL3 = 0 while (g < len(recallL3)): somaRL3 = somaRL3 + recallL3[g] g = g + 1 Lrt3 = (somaRL2 / len(recallL2)) * 100 Lfp3 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1L3 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFL3 = 0 while (g < len(f1L3)): somaFL3 = somaFL3 + f1L3[g] g = g + 1 Lft3 = (somaFL3 / len(f1L3)) * 100 ############################ rodada 4 #print "## RODADA 4 ##" training_set = baseDividida1 + baseDividida2 + baseDividida3 testing_set = baseDividida4 MNB_classifier = SklearnClassifier(MultinomialNB()) MNB_classifier.train(training_set) testclas = MNB_classifier.classify_many([fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] MNBmc4 = sklearn.metrics.confusion_matrix(testgold, testclas) MNBa4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 MNBpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoMNB4 = sklearn.metrics.precision_score(testgold, testclas, 
average=None) g = 0 somaPMNB4 = 0 while (g < len(precisaoMNB4)): somaPMNB4 = somaPMNB4 + precisaoMNB4[g] g = g + 1 MNBpt4 = (somaPMNB4 / len(precisaoMNB4)) * 100 MNBrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallMNB4 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRMNB4 = 0 while (g < len(recallMNB4)): somaRMNB4 = somaRMNB4 + recallMNB4[g] g = g + 1 MNBrt4 = (somaRMNB4 / len(recallMNB4)) * 100 MNBfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1MNB4 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFMNB4 = 0 while (g < len(f1MNB4)): somaFMNB4 = somaFMNB4 + f1MNB4[g] g = g + 1 MNBft4 = (somaFMNB4 / len(f1MNB4)) * 100 ''' BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) BernoulliNB_classifier.train(training_set) BernoulliNB_classifierRodada2 = nltk.classify.accuracy(BernoulliNB_classifier, testing_set) print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierRodada2*100) ''' LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) LogisticRegression_classifier.train(training_set) testclas = LogisticRegression_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Rmc4 = sklearn.metrics.confusion_matrix(testgold, testclas) Ra4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Rpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoR4 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPR4 = 0 while (g < len(precisaoR4)): somaPR4 = somaPR4 + precisaoR4[g] g = g + 1 Rpt4 = (somaPR4 / len(precisaoR4)) * 100 Rrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallR4 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRR4 = 0 while (g < len(recallR4)): somaRR4 = somaRR4 + recallR4[g] g = g + 1 Rrt4 = (somaRR4 / len(recallR4)) * 100 Rfp4 = (sklearn.metrics.f1_score(testgold, testclas, average=None)) f1R4 = sklearn.metrics.f1_score(testgold, testclas, average=None) g = 0 somaFR4 = 0 while (g < len(f1R4)): somaFR4 = somaFR4 + f1R4[g] g = g + 1 Rft4 = (somaFR4 / len(f1R4)) * 100 ''' SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) SGDClassifier_classifier.train(training_set) SGDClassifier_classifierRodada2 = nltk.classify.accuracy(SGDClassifier_classifier, testing_set) print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierRodada2*100) SVC_classifier = SklearnClassifier(SVC()) SVC_classifier.train(training_set) SVC_classifierRodada2 = nltk.classify.accuracy(SVC_classifier, testing_set) print("SVC_classifier accuracy percent:", SVC_classifierRodada2*100) ''' LinearSVC_classifier = SklearnClassifier(LinearSVC()) LinearSVC_classifier.train(training_set) testclas = LinearSVC_classifier.classify_many( [fs for (fs, l) in testing_set]) testgold = [l for (fs, l) in testing_set] Lmc4 = sklearn.metrics.confusion_matrix(testgold, testclas) La4 = (sklearn.metrics.accuracy_score(testgold, testclas)) * 100 Lpp4 = sklearn.metrics.precision_score(testgold, testclas, average=None) * 100 precisaoL4 = sklearn.metrics.precision_score(testgold, testclas, average=None) g = 0 somaPL4 = 0 while (g < len(precisaoL4)): somaPL4 = somaPL4 + precisaoL4[g] g = g + 1 Lpt4 = (somaPL4 / len(precisaoL4)) * 100 Lrp4 = (sklearn.metrics.recall_score(testgold, testclas, average=None)) * 100 recallL4 = sklearn.metrics.recall_score(testgold, testclas, average=None) g = 0 somaRL4 = 0 while (g < 

################# averages
# print("## AVERAGES ##")

# MULTINOMIAL NAIVE BAYES
MNBmc = (MNBmc1 + MNBmc2 + MNBmc3 + MNBmc4) / 4
MNBa = (MNBa1 + MNBa2 + MNBa3 + MNBa4) / 4
MNBamax = max([MNBa1, MNBa2, MNBa3, MNBa4])
MNBamin = min([MNBa1, MNBa2, MNBa3, MNBa4])
MNBpp = (MNBpp1 + MNBpp2 + MNBpp3 + MNBpp4) / 4
MNBpt = (MNBpt1 + MNBpt2 + MNBpt3 + MNBpt4) / 4
MNBpmax = max([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
MNBpmin = min([MNBpt1, MNBpt2, MNBpt3, MNBpt4])
MNBrp = (MNBrp1 + MNBrp2 + MNBrp3 + MNBrp4) / 4
MNBrt = (MNBrt1 + MNBrt2 + MNBrt3 + MNBrt4) / 4
MNBrmax = max([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
MNBrmin = min([MNBrt1, MNBrt2, MNBrt3, MNBrt4])
MNBfp = (MNBfp1 + MNBfp2 + MNBfp3 + MNBfp4) / 4
MNBft = (MNBft1 + MNBft2 + MNBft3 + MNBft4) / 4
MNBfmax = max([MNBft1, MNBft2, MNBft3, MNBft4])
MNBfmin = min([MNBft1, MNBft2, MNBft3, MNBft4])

'''
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.set_aspect('equal')
plt.imshow(MNBmc, interpolation='nearest', cmap=plt.cm.ocean)
plt.colorbar()
plt.show()
'''

# LOGISTIC REGRESSION
Rmc = (Rmc1 + Rmc2 + Rmc3 + Rmc4) / 4
Ra = (Ra1 + Ra2 + Ra3 + Ra4) / 4
Ramax = max([Ra1, Ra2, Ra3, Ra4])
Ramin = min([Ra1, Ra2, Ra3, Ra4])
Rpp = (Rpp1 + Rpp2 + Rpp3 + Rpp4) / 4
Rpt = (Rpt1 + Rpt2 + Rpt3 + Rpt4) / 4
Rpmax = max([Rpt1, Rpt2, Rpt3, Rpt4])
Rpmin = min([Rpt1, Rpt2, Rpt3, Rpt4])
Rrp = (Rrp1 + Rrp2 + Rrp3 + Rrp4) / 4
Rrt = (Rrt1 + Rrt2 + Rrt3 + Rrt4) / 4
Rrmax = max([Rrt1, Rrt2, Rrt3, Rrt4])
Rrmin = min([Rrt1, Rrt2, Rrt3, Rrt4])
Rfp = (Rfp1 + Rfp2 + Rfp3 + Rfp4) / 4
Rft = (Rft1 + Rft2 + Rft3 + Rft4) / 4
Rfmax = max([Rft1, Rft2, Rft3, Rft4])
Rfmin = min([Rft1, Rft2, Rft3, Rft4])

# LINEAR SVC
Lmc = (Lmc1 + Lmc2 + Lmc3 + Lmc4) / 4
La = (La1 + La2 + La3 + La4) / 4
Lamax = max([La1, La2, La3, La4])
Lamin = min([La1, La2, La3, La4])
Lpp = (Lpp1 + Lpp2 + Lpp3 + Lpp4) / 4
Lpt = (Lpt1 + Lpt2 + Lpt3 + Lpt4) / 4
Lpmax = max([Lpt1, Lpt2, Lpt3, Lpt4])
Lpmin = min([Lpt1, Lpt2, Lpt3, Lpt4])
Lrp = (Lrp1 + Lrp2 + Lrp3 + Lrp4) / 4
Lrt = (Lrt1 + Lrt2 + Lrt3 + Lrt4) / 4
Lrmax = max([Lrt1, Lrt2, Lrt3, Lrt4])
Lrmin = min([Lrt1, Lrt2, Lrt3, Lrt4])
Lfp = (Lfp1 + Lfp2 + Lfp3 + Lfp4) / 4
Lft = (Lft1 + Lft2 + Lft3 + Lft4) / 4
Lfmax = max([Lft1, Lft2, Lft3, Lft4])
Lfmin = min([Lft1, Lft2, Lft3, Lft4])

'''
print("SVC Linear")
print("Confusion matrix: ", Lmc)
print("Accuracy: ", La)
print("Partial precision: ", Lpp)
print("Total precision: ", Lpt)
print("Partial recall: ", Lrp)
print("Total recall: ", Lrt)
print("Partial F-measure: ", Lfp)
print("Total F-measure: ", Lft)
'''
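
# The mean/max/min boilerplate above repeats the same pattern per metric. A
# small helper could compute all three at once (hypothetical sketch; 'resumo'
# is an illustrative name, not part of the original script):
def resumo(valores_por_rodada):
    """Return (mean, max, min) for a list of per-round scores."""
    return (sum(valores_por_rodada) / len(valores_por_rodada),
            max(valores_por_rodada),
            min(valores_por_rodada))

# e.g. MNBa, MNBamax, MNBamin = resumo([MNBa1, MNBa2, MNBa3, MNBa4])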

with open(path, mode='w') as csv_file:
    # writer = csv.writer(csv_file)
    csv_file.write('Algoritmo;Multinomial Naïve-Bayes\n')
    csv_file.write('Iteração;Acurácia;Precisão parcial;Precisão total;'
                   'revocação parcial;revocação total;f-medida parcial;f-medida total\n')
    csv_file.write('1;' + str(MNBa1) + ';' + str(MNBpp1) + ';' + str(MNBpt1) + ';' + str(MNBrp1) + ';' + str(MNBrt1) + ';' + str(MNBfp1) + ';' + str(MNBft1) + '\n')
    csv_file.write('2;' + str(MNBa2) + ';' + str(MNBpp2) + ';' + str(MNBpt2) + ';' + str(MNBrp2) + ';' + str(MNBrt2) + ';' + str(MNBfp2) + ';' + str(MNBft2) + '\n')
    csv_file.write('3;' + str(MNBa3) + ';' + str(MNBpp3) + ';' + str(MNBpt3) + ';' + str(MNBrp3) + ';' + str(MNBrt3) + ';' + str(MNBfp3) + ';' + str(MNBft3) + '\n')
    csv_file.write('4;' + str(MNBa4) + ';' + str(MNBpp4) + ';' + str(MNBpt4) + ';' + str(MNBrp4) + ';' + str(MNBrt4) + ';' + str(MNBfp4) + ';' + str(MNBft4) + '\n')
    csv_file.write('==================\n')
    csv_file.write('Total\n')
    csv_file.write('Média;' + str(MNBa) + ';' + str(MNBpp) + ';' + str(MNBpt) + ';' + str(MNBrp) + ';' + str(MNBrt) + ';' + str(MNBfp) + ';' + str(MNBft) + '\n')
    # Max/min exist only for accuracy and the "total" metrics, so the
    # "parcial" columns are left empty to keep the fields aligned
    csv_file.write('Máximo;' + str(MNBamax) + ';;' + str(MNBpmax) + ';;' + str(MNBrmax) + ';;' + str(MNBfmax) + '\n')
    csv_file.write('Mínimo;' + str(MNBamin) + ';;' + str(MNBpmin) + ';;' + str(MNBrmin) + ';;' + str(MNBfmin) + '\n')
    csv_file.write('==================\n')

    csv_file.write('Algoritmo;Regressão Logística\n')
    csv_file.write('Iteração;Acurácia;Precisão parcial;Precisão total;'
                   'revocação parcial;revocação total;f-medida parcial;f-medida total\n')
    csv_file.write('1;' + str(Ra1) + ';' + str(Rpp1) + ';' + str(Rpt1) + ';' + str(Rrp1) + ';' + str(Rrt1) + ';' + str(Rfp1) + ';' + str(Rft1) + '\n')
    csv_file.write('2;' + str(Ra2) + ';' + str(Rpp2) + ';' + str(Rpt2) + ';' + str(Rrp2) + ';' + str(Rrt2) + ';' + str(Rfp2) + ';' + str(Rft2) + '\n')
    csv_file.write('3;' + str(Ra3) + ';' + str(Rpp3) + ';' + str(Rpt3) + ';' + str(Rrp3) + ';' + str(Rrt3) + ';' + str(Rfp3) + ';' + str(Rft3) + '\n')
    csv_file.write('4;' + str(Ra4) + ';' + str(Rpp4) + ';' + str(Rpt4) + ';' + str(Rrp4) + ';' + str(Rrt4) + ';' + str(Rfp4) + ';' + str(Rft4) + '\n')
    csv_file.write('==================\n')
    csv_file.write('Total\n')
    csv_file.write('Média;' + str(Ra) + ';' + str(Rpp) + ';' + str(Rpt) + ';' + str(Rrp) + ';' + str(Rrt) + ';' + str(Rfp) + ';' + str(Rft) + '\n')
    csv_file.write('Máximo;' + str(Ramax) + ';;' + str(Rpmax) + ';;' + str(Rrmax) + ';;' + str(Rfmax) + '\n')
    csv_file.write('Mínimo;' + str(Ramin) + ';;' + str(Rpmin) + ';;' + str(Rrmin) + ';;' + str(Rfmin) + '\n')
    csv_file.write('==================\n')

    csv_file.write('Algoritmo;SVC Linear\n')
    csv_file.write('Iteração;Acurácia;Precisão parcial;Precisão total;'
                   'revocação parcial;revocação total;f-medida parcial;f-medida total\n')
    csv_file.write('1;' + str(La1) + ';' + str(Lpp1) + ';' + str(Lpt1) + ';' + str(Lrp1) + ';' + str(Lrt1) + ';' + str(Lfp1) + ';' + str(Lft1) + '\n')
    csv_file.write('2;' + str(La2) + ';' + str(Lpp2) + ';' + str(Lpt2) + ';' + str(Lrp2) + ';' + str(Lrt2) + ';' + str(Lfp2) + ';' + str(Lft2) + '\n')
    csv_file.write('3;' + str(La3) + ';' + str(Lpp3) + ';' + str(Lpt3) + ';' + str(Lrp3) + ';' + str(Lrt3) + ';' + str(Lfp3) + ';' + str(Lft3) + '\n')
    csv_file.write('4;' + str(La4) + ';' + str(Lpp4) + ';' + str(Lpt4) + ';' + str(Lrp4) + ';' + str(Lrt4) + ';' + str(Lfp4) + ';' + str(Lft4) + '\n')
    csv_file.write('==================\n')
    csv_file.write('Total\n')
    csv_file.write('Média;' + str(La) + ';' + str(Lpp) + ';' + str(Lpt) + ';' + str(Lrp) + ';' + str(Lrt) + ';' + str(Lfp) + ';' + str(Lft) + '\n')
    csv_file.write('Máximo;' + str(Lamax) + ';;' + str(Lpmax) + ';;' + str(Lrmax) + ';;' + str(Lfmax) + '\n')
    csv_file.write('Mínimo;' + str(Lamin) + ';;' + str(Lpmin) + ';;' + str(Lrmin) + ';;' + str(Lfmin) + '\n')
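
# The ';'-separated lines above are assembled by hand; the csv module hinted
# at in the commented-out writer would handle the delimiter and quoting. A
# minimal sketch under that assumption (escreve_bloco is a hypothetical name;
# note it opens the file in 'w' mode, so it would write one block per file):
import csv

def escreve_bloco(caminho, algoritmo, linhas_por_iteracao):
    """Write one algorithm's block of results as ';'-delimited rows."""
    with open(caminho, mode='w', newline='') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(['Algoritmo', algoritmo])
        writer.writerow(['Iteração', 'Acurácia', 'Precisão parcial', 'Precisão total',
                         'revocação parcial', 'revocação total',
                         'f-medida parcial', 'f-medida total'])
        writer.writerows(linhas_por_iteracao)

# e.g. escreve_bloco(path, 'Multinomial Naïve-Bayes',
#                    [[1, MNBa1, MNBpp1, MNBpt1, MNBrp1, MNBrt1, MNBfp1, MNBft1]])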