/
classifier.py
103 lines (72 loc) · 2.86 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
__author__ = 'rwechsler'
from sklearn import svm
from sklearn import naive_bayes
from corpora import get_utterances_from_file
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sknn import mlp
import numpy as np
def NB_classifier(X_train, Y_train, X_test, Y_test):
    """Train a Bernoulli naive Bayes model on the training split and
    return its mean accuracy on the held-out test split."""
    nb = naive_bayes.BernoulliNB()
    nb.fit(X_train, Y_train)
    return nb.score(X_test, Y_test)
def SVM_classifier(X_train, Y_train, X_test, Y_test):
    """Fit an RBF-kernel support vector machine (C=1.0, no probability
    estimates) and return its accuracy on the test split."""
    clf = svm.SVC(probability=False, kernel='rbf', C=1.0)
    clf.fit(X_train, Y_train)
    return clf.score(X_test, Y_test)
def KNN_classifier(X_train, Y_train, X_test, Y_test):
    """Fit a 5-nearest-neighbours classifier and return its accuracy on
    the test split."""
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, Y_train)
    return knn.score(X_test, Y_test)
def MLP_classifier(X_train, Y_train, X_test, Y_test, n_iter=25):
    """Train a one-hidden-layer MLP (100 sigmoid units, softmax output)
    and return its accuracy on the test split.

    n_iter: number of training iterations (default 25).
    """
    layers = [mlp.Layer("Sigmoid", units=100), mlp.Layer("Softmax")]
    net = mlp.Classifier(layers=layers, learning_rate=0.001, n_iter=n_iter)
    # sknn expects dense arrays, hence the explicit np.array conversions
    net.fit(np.array(X_train), np.array(Y_train))
    return net.score(np.array(X_test), np.array(Y_test))
def get_swda_labeled_utterances():
    """Load the SWDA corpus file and return two parallel lists:
    utterance strings (tokens joined by spaces) and their dialogue-act
    tags."""
    utterances = []
    labels = []
    for tag, tokens in get_utterances_from_file("data/swda_file.txt"):
        utterances.append(" ".join(tokens))
        # the raw tag carries a trailing "/<id>" suffix; keep only the label
        labels.append(tag.split("/")[0])
    return utterances, labels
def get_BOW_from_utterances(X_tokens):
    """Turn a list of utterance strings into a sparse bag-of-words
    count matrix (scipy sparse, one row per utterance)."""
    # min_df=1 keeps every term that appears at least once
    vectorizer = CountVectorizer(min_df=1)
    return vectorizer.fit_transform(X_tokens)
def encode_tags(tags):
    """Map string dialogue-act tags onto integer class labels."""
    encoder = preprocessing.LabelEncoder()
    # fit_transform = fit on the tag vocabulary, then encode it
    return encoder.fit_transform(tags)
def split_datasets(X, Y, test_size=0.1, random_state=None):
    """Split features and labels into train/test sets.

    Parameters
    ----------
    X, Y : feature matrix and label vector (parallel, same length).
    test_size : fraction held out for testing; default 0.1 preserves
        the original hard-coded 90/10 split.
    random_state : optional int seed for a reproducible split
        (default None keeps the original non-deterministic behavior).

    Returns (X_train, X_test, Y_train, Y_test).

    NOTE(review): ``sklearn.cross_validation`` is deprecated; move the
    file-level import to ``sklearn.model_selection`` when upgrading.
    """
    return train_test_split(X, Y, test_size=test_size,
                            random_state=random_state)
if __name__=='__main__':
# get utterance tokens and tags from swda corpus
X_tokens, Y_tags = get_swda_labeled_utterances()
# get BOW for utterances
X = get_BOW_from_utterances(X_tokens)
# encode tags as ints
Y = encode_tags(Y_tags)
# get training and validation sets
X_train, X_test, Y_train, Y_test = split_datasets(X,Y)
# run classifiers
NB_score = NB_classifier(X_train, Y_train, X_test, Y_test)
# SVM_score = SVM_classifier(X_train, Y_train, X_test, Y_test)
KNN_score = KNN_classifier(X_train, Y_train, X_test, Y_test)
print 'NB test set score ', NB_score
print 'KNN test set score: ', KNN_score
# predicted = model.predict(X_test)
# true = le.inverse_transform(Y_test)
# for i, pred in enumerate(le.inverse_transform(predicted)):
# print pred, true[i]