/
run_kde_crossval.py
executable file
·165 lines (142 loc) · 7.02 KB
/
run_kde_crossval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import tools.data
from tools.data import *
import tools.kde
import tools.util
from tools.util import *
from tools.kde import *
import classifiers.PCC
from classifiers.PCC import *
import pdb
import numpy
import traceback
import sys
import sklearn
from sklearn import cross_validation
from tools.StratifiedShuffleSplit import *
# Startup banner listing the third-party packages this script depends on.
# Each call passes a single parenthesised string, which prints the same text
# whether `print` is the Python 2 statement or the Python 3 function.
print("NOTE: Numpy, Scipy and Scikit are needed to run this file.")
print(" Installation instructions are in 'setup_commands.txt'")
print("")
def predictkde(X, W, P, ytrain):
    """Compute prediction scores and hard 0/1 labels for the rows of ``X``.

    ``X`` is multiplied by ``W``, the result by ``P.T``, and the product is
    passed through ``normalize_scale`` together with ``ytrain``.  The
    normalised values are then thresholded at zero to obtain binary labels.

    Args:
        X: Feature rows to predict for; converted with ``array``.
        W: Matrix the features are multiplied by first.
        P: Matrix whose transpose the intermediate product is multiplied by.
        ytrain: Training labels handed to ``normalize_scale`` as its second
            argument; converted with ``array``.

    Returns:
        A two-element list ``[scores, labels]``.  ``scores`` is the
        normalised prediction transformed by ``(value + 1) / 2.0``.
        ``labels`` is a float array of the same shape holding ``1.0`` where
        the normalised prediction is ``>= 0.0`` and ``0.0`` everywhere else.
    """
    projected = dot(dot(array(X), W), P.T)
    # NOTE(review): assumes normalize_scale returns a 2-D numpy array (the
    # arithmetic below already requires array semantics) -- confirm.
    normalised = normalize_scale(projected, array(ytrain))

    # Threshold the whole array in one vectorised comparison instead of
    # visiting every element in a nested Python loop.
    labels = (normalised >= 0.0).astype(float)

    scores = (normalised + 1) / 2.0
    return [scores, labels]
def _select_kde_c(x_train, y_train):
    """Return the candidate C with the lowest mean held-out MSE.

    Every candidate in ``[2**i for i in range(-14, 14, 4)]`` is evaluated on
    two stratified shuffle splits of the given data (20% held out,
    ``random_state=16``).  For each split a model is fitted with ``kde`` on
    the fitting part and scored with ``mse`` on the held-out part.

    Args:
        x_train: Feature array of the current training fold.
        y_train: Label array of the current training fold.

    Returns:
        The candidate value with the smallest mean error; ``2**-14`` when no
        candidate compares lower than the initial sentinel.
    """
    strata = stratifier(y_train, 2)
    best_c = 2 ** -14
    # Infinity rather than a large magic number, so that any finite error,
    # however big, can replace the initial choice.
    best_mse = float("inf")
    for candidate in [2 ** i for i in range(-14, 14, 4)]:
        # Built anew for every candidate, exactly as before.
        # NOTE(review): could probably be hoisted if the splitter can be
        # iterated more than once -- confirm against tools.StratifiedShuffleSplit.
        splits = StratifiedShuffleSplit(strata, 2, test_size=0.2, random_state=16)
        errors = []
        for fit_idx, held_out_idx in splits:
            y_fit = y_train[fit_idx]
            W, P, _ = kde(x_train[fit_idx], y_fit, candidate)
            scores, _ = predictkde(x_train[held_out_idx], W, P, y_fit)
            errors.append(mse(scores, y_train[held_out_idx]))
        mean_error = mean(errors)
        if mean_error < best_mse:
            best_mse = mean_error
            best_c = candidate
    return best_c


def processDataset(datafile_train, datafile_test, resultfile_train, resultfile_test, p, k):
    """Run 5-fold stratified cross-validation of the KDE model on one dataset.

    The data is read from ``datafile_train``, the labels are reordered, and
    the features are normalised.  For every fold the value of C is chosen on
    the training part, a model is fitted with it, and the test part is
    scored with ``computeMetrics``.  The mean and standard deviation of each
    metric over the folds are printed to standard output.

    Args:
        datafile_train: Path of the data file handed to ``readData``.
        datafile_test: Not read by this function; kept so existing call
            sites keep working.
        resultfile_train: Not read by this function; kept for the same reason.
        resultfile_test: Not read by this function; kept for the same reason.
        p: Forwarded to ``readData``.  Presumably the number of feature
            columns -- confirm against ``readData``.
        k: Forwarded to ``readData``.  Presumably the number of label
            columns -- confirm against ``readData``.

    Returns:
        None.
    """
    x_full, y_full = readData(datafile_train, p, k)
    # NOTE(review): this is a fixed 12-entry ordering, so it presumably only
    # fits datasets with 12 labels (the movie-genre call passes k=12) --
    # confirm before enabling other datasets.
    y = reorder(array(y_full), [0, 6, 4, 9, 1, 7, 2, 11, 8, 5, 3, 10])
    x = array(normalize_scale(x_full))

    folds = StratifiedKFold(stratifier(y, 5), 5)

    # One list of per-fold values for each metric, in the order in which
    # computeMetrics returns them.
    metric_names = ["HL", "SL", "RL", "NRL", "OE", "AVPREC"]
    per_fold = [[] for _ in metric_names]

    for train, test in folds:
        xx = x[train]
        yy = y[train]

        # Fit on the whole training fold with the selected C.
        W, P, _ = kde(xx, yy, _select_kde_c(xx, yy))

        print("predicting...")
        yp_p, yp = predictkde(x[test], W, P, yy)
        # Explicit six-way unpacking keeps the check that exactly six
        # metrics come back.
        hl, sl, rl, nrl, oe, avprec = computeMetrics(yp, yp_p, y[test])
        for values, value in zip(per_fold, (hl, sl, rl, nrl, oe, avprec)):
            values.append(value)

    print("After training, average performance over 5 folds:")
    for name, values in zip(metric_names, per_fold):
        values = array(values)
        # Joining with single spaces reproduces the output of the former
        # multi-item Python 2 print statement.
        print(" ".join(["\t%s: " % name, str(values.mean()), " +- ", str(values.std())]))
if __name__ == "__main__":
    # Turn every numpy floating-point warning into an exception.
    numpy.seterr(all='raise')

    # The output directory has to exist BEFORE logging is configured:
    # basicConfig opens the log file straight away and fails when the
    # directory is missing.
    if not os.path.exists("./output"): os.makedirs("./output")
    logging.basicConfig(filename="./output/KDECrossVal.log", level=logging.DEBUG)
    logf = logging.getLogger(__name__)

    try:
        #logf.info("Started processing the scene dataset")
        #processDataset("../../data/scene/scene-train.csv", "../../data/scene/scene-test.csv", "./output/scene-train-fixedpcc.txt", "./output/scene-test-fixedpcc.txt", 294, 6)
        #logf.info("Finished processing the scene dataset")
        #logf.info("Started processing the yeast dataset")
        #processDataset("../../data/yeast/yeast-train.csv", "../../data/yeast/yeast-test.csv", "./output/yeast-train-fixedpcc.txt", "./output/yeast-test-fixedpcc.txt", 103, 14)
        #logf.info("Finished processing the yeast dataset")
        #logf.info("Started processing the medical dataset")
        #processDataset("../../data/medical/medical-train.csv", "../../data/medical/medical-test.csv", "./output/medical-train-fixedpcc.txt", "./output/medical-test-fixedpcc.txt", 1449, 45)
        #logf.info("Finished processing the medical dataset")
        #logf.info("Started processing the genbase dataset")
        #processDataset("../../data/genbase/genbase-train.csv", "../../data/genbase/genbase-test.csv", "./output/genbase-train-fixedpcc.txt", "./output/genbase-test-fixedpcc.txt", 1186,26)
        #logf.info("Finished processing the genbase dataset")
        #logf.info("Started processing the enron dataset")
        #processDataset("../../data/enron/enron-train.csv", "../../data/enron/enron-test.csv", "./output/enron-train-fixedpcc.txt", "./output/enron-test-fixedpcc.txt", 1001,53)
        #logf.info("Finished processing the enron dataset")
        logf.info("Started processing the movie dataset")
        processDataset("../../data/moviegenre/moviegenre-all.csv", "../../data/moviegenre/moviegenre-all.csv", "./output/movie-train-fixedpcc.txt", "./output/movie-test-fixedpcc.txt", 4904,12)
        logf.info("Finished processing the movie dataset")
        #logf.info("Started processing the emotions dataset")
        #processDataset("../../data/emotions/emotions-train.csv", "../../data/emotions/emotions-test.csv", "./output/emotions-train-fixedpcc.txt", "./output/emotions-test-fixedpcc.txt", 72,6)
        #logf.info("Finished processing the emotion dataset")
    except Exception:
        # Show the failure on stdout, then drop into the debugger so the
        # state at the point of failure can be inspected.
        traceback.print_exc(file=sys.stdout)
        pdb.set_trace()