-
Notifications
You must be signed in to change notification settings - Fork 0
/
training_selecao.py
82 lines (65 loc) · 2.99 KB
/
training_selecao.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import numpy as np
import gensim
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
import gc
import scipy.sparse as sp
import time
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from representations import Representations
from Models import Models
from obter_dados import dados
from tratar_texto import limpe_texto
from sklearn.model_selection import train_test_split
from pickle import dump, load
def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards.

    Unpickling can execute arbitrary code, so only point this at trusted files.
    """
    with open(path, 'rb') as handle:
        return load(handle)


def Classify(X, Y, cls, rep, k=5000):
    """Train one classifier on one representation and print its test report.

    Args:
        X: Input samples. Only used when ``rep`` does not contain ``'w2v'``.
        Y: Labels aligned with ``X``; also used to stratify the split.
        cls: Classifier identifier passed to ``Models().get_classifier``.
        rep: Representation identifier. When it contains ``'w2v'`` the
            features are read from ``w2v_rep/<rep>_{train,test}_{x,y}.pkl``;
            otherwise ``X``/``Y`` are split 80/20 (stratified, seed 123) and
            passed to ``Representations().get_representation``.
        k: Forwarded to ``get_representation`` as ``k``. Defaults to 5000.

    Returns:
        None. The header line, the classification report (classes labelled
        ``'no'`` and ``'yes'``) and the elapsed time are printed to stdout.
    """
    # Start moment
    Start_moment = time.time()
    print('Classificando com {} e {} k={}'.format(cls, rep, k))

    if 'w2v' in rep:
        # Cached features: X and Y are ignored on this path.
        prefix = 'w2v_rep/{}'.format(rep)
        train_x = _load_pickle(prefix + '_train_x.pkl')
        train_y = _load_pickle(prefix + '_train_y.pkl')
        test_x = _load_pickle(prefix + '_test_x.pkl')
        test_y = _load_pickle(prefix + '_test_y.pkl')
    else:
        # Single stratified hold-out split (no cross-validation is performed).
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.2, random_state=123, stratify=Y)
        train_x, train_y, test_x, test_y = Representations().get_representation(
            rep=rep, train_x=X_train, train_y=y_train,
            test_x=X_test, test_y=y_test, k=k, cat=None)

    # Oversample the minority class of the training set only.
    # NOTE(review): the source this was recovered from had lost its
    # indentation, so it is unclear whether this step originally ran for the
    # cached 'w2v' path as well - confirm against the original file.
    sm = SMOTE(sampling_strategy='minority', random_state=None)
    # `fit_sample` was removed in imbalanced-learn 0.8; prefer `fit_resample`
    # and fall back to the old name on releases that predate it.
    resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
    train_x, train_y = resample(train_x, train_y)

    # To rebuild the w2v_rep cache, pickle train_x, train_y, test_x and test_y
    # at this point (same file names as read above) and return early.

    classifier = Models().get_classifier(cls)
    classifier.fit(train_x, train_y)
    pred = classifier.predict(test_x)

    report = classification_report(test_y, pred, target_names=['no', 'yes'])
    print(report)

    Finish_moment = time.time()
    tm = "It took " + str((Finish_moment - Start_moment)) + " seconds"
    print(tm)
# Experiment grid: each classifier name is paired with each representation name.
classificadores = ['MLP']
representacao = ['w2v_mean', 'w2v']

# Fetch the corpus and run it through limpe_texto, printing the number of
# texts before and after so the effect of that step is visible.
textos, tags = dados()
print(len(textos))
textos, tags = limpe_texto(textos, tags)
print(len(textos))

# Number of samples whose tag equals 0.
print(sum(1 for tag in tags if tag == 0))

# range(5000, 6000, 1000) currently yields the single value k=5000.
for k in range(5000, 6000, 1000):
    for clf_name in classificadores:
        for rep_name in representacao:
            Classify(textos, tags, clf_name, rep_name, k)