def processDataset(datafile_train, datafile_test, resultfile_train, resultfile_test, p, k):
    logf = logging.getLogger(__name__)
    # load the training data, reorder the label columns, and scale the features
    [x_full, y_full] = readData(datafile_train, p, k)
    yt = array(y_full)
    y = reorder(yt, [0, 6, 4, 9, 1, 7, 2, 11, 8, 5, 3, 10])
    x = array(normalize_scale(x_full))
    #[xtest_f, ytest_f] = readData(datafile_test, p, k)
    #ytest = ytest_f
    #xtest = normalize_scale(xtest_f, x_full)
    #outf_train = open(resultfile_train, "w")
    #outf_test = open(resultfile_test, "w")
    # outer 5-fold stratified CV for performance estimation
    ystrat = stratifier(y, 5)
    skfold = StratifiedKFold(ystrat, 5)
    lossHl, lossSl, lossRl, lossNrl, lossOe, lossAvprec = [], [], [], [], [], []
    for train, test in skfold:
        xx = x[train]
        yy = y[train]
        ystrat2 = stratifier(yy, 2)
        # inner grid search over the regularization constant C
        bestC = 2**-14
        bestMSE = float("inf")
        for C in [2**i for i in range(-14, 14, 4)]:
            sss = StratifiedShuffleSplit(ystrat2, 2, test_size=0.2, random_state=16)
            squaredErrors = []
            for train_index, test_index in sss:  # 2 times
                xtr = xx[train_index]
                ytr = yy[train_index]
                W, P, _meanY = kde(xtr, ytr, C)
                [yp_p, yp] = predictkde(xx[test_index], W, P, ytr)
                squaredError = mse(yp_p, yy[test_index])
                squaredErrors.append(squaredError)
            meanSquaredError = mean(squaredErrors)
            if meanSquaredError < bestMSE:
                bestMSE = meanSquaredError
                bestC = C
        # retrain on the whole outer training split with the best C
        W, P, _meanY = kde(xx, yy, bestC)
        # predict on the held-out outer fold
        print "predicting..."
        [yp_p, yp] = predictkde(x[test], W, P, yy)
        [hl, sl, rl, nrl, oe, avprec] = computeMetrics(yp, yp_p, y[test])
        lossHl.append(hl)
        lossSl.append(sl)
        lossRl.append(rl)
        lossNrl.append(nrl)
        lossOe.append(oe)
        lossAvprec.append(avprec)
    print "After training, average performance over 5 folds:"
    print "\tHL: ", array(lossHl).mean(), " +- ", array(lossHl).std()
    print "\tSL: ", array(lossSl).mean(), " +- ", array(lossSl).std()
    print "\tRL: ", array(lossRl).mean(), " +- ", array(lossRl).std()
    print "\tNRL: ", array(lossNrl).mean(), " +- ", array(lossNrl).std()
    print "\tOE: ", array(lossOe).mean(), " +- ", array(lossOe).std()
    print "\tAVPREC: ", array(lossAvprec).mean(), " +- ", array(lossAvprec).std()
'''
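# The commented-out variant above estimates performance with 5-fold outer
# cross-validation on the training file alone; the active version below tunes
# C the same way but trains once on the full training file and evaluates on
# the held-out test file.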
def processDataset(datafile_train, datafile_test, resultfile_train, resultfile_test, p, k):
    logf = logging.getLogger(__name__)
    # load and scale the training data
    [x_full, y_full] = readData(datafile_train, p, k)
    y = array(y_full)
    x = array(normalize_scale(x_full))
    # load the test data, scaling it with the training-set statistics
    [xtest_f, ytest_f] = readData(datafile_test, p, k)
    ytest = array(ytest_f)
    xtest = array(normalize_scale(xtest_f, x_full))
    outf_test = open(resultfile_test, "w")

    logf.info("Training KDE...")
    ystrat = stratifier(y, 5)
    # grid search over the regularization constant C, scored by MSE on a
    # stratified shuffle split (the pre-0.18 scikit-learn CV API, which takes
    # the labels in the splitter's constructor)
    bestC = 2**-14
    bestMSE = float("inf")
    for C in [2**i for i in range(-14, 14, 1)]:
        sss = StratifiedShuffleSplit(ystrat, 5, test_size=0.2, random_state=16)
        squaredErrors = []
        for train_index, test_index in sss:  # 5 times
            xtr = x[train_index]
            ytr = y[train_index]
            W, P, _meanY = kde(xtr, ytr, C)
            [yp_p, yp] = predictkde(x[test_index], W, P, ytr)
            squaredError = mse(yp_p, y[test_index])
            squaredErrors.append(squaredError)
        meanSquaredError = mean(squaredErrors)
        if meanSquaredError < bestMSE:
            bestMSE = meanSquaredError
            bestC = C
    # retrain on the full training set with the best C
    W, P, _meanY = kde(x, y, bestC)
    logf.info("Training complete. Best C: " + str(bestC) + "\tAverage MSE using CV: " + str(bestMSE))

    # predict on the final test set and report the multi-label metrics
    [yp_p, yp] = predictkde(xtest, W, P, y)
    [hl, sl, rl, nrl, oe, avprec] = computeMetrics(yp, yp_p, ytest)
    #ll = logLoss(yp_p, ytest)
    logf.info("KDE Test Set Hamming Loss: " + str(hl))
    logf.info("KDE Test Set 0-1 Loss: " + str(sl))
    logf.info("KDE Test Set Rank Loss: " + str(rl))
    #logf.info("KDE Test Set Log Loss: " + str(ll))
    logf.info("KDE Test Set Normalized Rank Loss: " + str(nrl))
    logf.info("KDE Test Set One-Error: " + str(oe))
    logf.info("KDE Test Set Avg Prec: " + str(avprec))
    print >>outf_test, "KDE\t" + str(hl) + "\t" + str(sl) + "\t" + str(rl)
    outf_test.close()
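# A minimal usage sketch, not part of the original script: the file paths
# below are hypothetical placeholders, and the helpers it relies on
# (readData, normalize_scale, stratifier, kde, predictkde, mse,
# computeMetrics) are assumed to be defined or imported elsewhere in this
# module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # p and k are passed straight through to readData; the values here are
    # placeholders only
    processDataset("train.data", "test.data", "train.result", "test.result", 1, 1)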