def UnBiasedHSIC(self, x, y, kernelx=None, kernely=None):
    """
    Compute the UNbiased estimator of HSIC.

    Args:
        x: The data (n data points along axis 0).
        y: The labels (same number of data points as x).
        kernelx: The kernel on the data, default to linear kernel.
        kernely: The kernel on the labels, default to linear kernel.

    Returns:
        HSIC score. Requires n >= 4, otherwise a denominator is zero.
    """
    # Build default kernels lazily: the original def-time defaults
    # (kernelx=vector.CLinearKernel()) created a single shared instance at
    # import time -- the classic mutable-default-argument pitfall.
    if kernelx is None:
        kernelx = vector.CLinearKernel()
    if kernely is None:
        kernely = vector.CLinearKernel()

    nx = x.shape
    ny = y.shape
    assert nx[0] == ny[0], \
        "Argument 1 and 2 have different number of data points"
    n = nx[0]

    # Kernel matrices with their diagonals zeroed, as the unbiased
    # estimator requires. numpy.fill_diagonal replaces the setdiag0 helper.
    kMat = kernelx.Dot(x, x)
    numpy.fill_diagonal(kMat, 0)
    lMat = kernely.Dot(y, y)
    numpy.fill_diagonal(lMat, 0)

    sK = kMat.sum(axis=1)
    ssK = sK.sum()
    sL = lMat.sum(axis=1)
    ssL = sL.sum()

    # HSIC_1 = [ sum(K*L) + (1'K1)(1'L1)/((n-1)(n-2))
    #            - 2 (sK . sL)/(n-2) ] / (n(n-3))
    kMat *= lMat  # in-place elementwise product (same as the original __imul__)
    sK *= sL
    return (kMat.sum()
            + (ssK * ssL) / ((n - 1) * (n - 2))
            - 2 * sK.sum() / (n - 2)) / (n * (n - 3))
def BiasedHSIC(self, x, y, kernelx=None, kernely=None):
    """
    Compute the biased estimator of HSIC.

    Args:
        x: The data; 1-D input uses a plain outer product, 2-D input goes
            through the kernel.
        y: The labels.
        kernelx: The kernel on the data, default to linear kernel.
        kernely: The kernel on the labels, default to linear kernel.

    Returns:
        Biased HSIC score.
    """
    nx = x.shape
    ny = y.shape
    assert nx[0] == ny[0], \
        "Argument 1 and 2 have different number of data points"
    if len(nx) > 1:
        # Create the default kernel lazily so each call gets a fresh
        # instance (a def-time default is shared across all calls).
        if kernelx is None:
            kernelx = vector.CLinearKernel()
        kMat = kernelx.Dot(x, x)
    else:
        # numpy.outerproduct was the old Numeric spelling and no longer
        # exists in numpy; numpy.outer is the supported equivalent.
        kMat = numpy.outer(x, x)
    # BUG FIX: the original called ComputeHLH(...) without `self.`, which
    # raises NameError at runtime -- it is a method on this class.
    if kernely is None:
        # Omit the argument so ComputeHLH applies its own default kernel.
        hlhMat = self.ComputeHLH(y)
    else:
        hlhMat = self.ComputeHLH(y, kernely)
    # A single numpy.sum over the matrix replaces the redundant nested sums.
    return numpy.sum(kMat * hlhMat) / ((nx[0] - 1) * (nx[0] - 1))
def ComputeHLH(self, y, kernely=None):
    """
    Compute the centered label kernel matrix H L H.

    H = I - (1/n) 11' is the centering matrix and L is the kernel matrix
    on the labels.

    Args:
        y: The labels; 1-D input uses a plain outer product, 2-D input
            goes through the kernel.
        kernely: The kernel on the labels, default to linear kernel.

    Returns:
        The n x n matrix H L H.
    """
    ny = y.shape
    if len(ny) > 1:
        # Construct the default kernel lazily (and only on the path that
        # needs it) instead of a shared def-time default instance.
        if kernely is None:
            kernely = vector.CLinearKernel()
        lMat = kernely.Dot(y, y)
    else:
        # numpy.outerproduct was the old Numeric spelling and no longer
        # exists in numpy; numpy.outer is the supported equivalent.
        lMat = numpy.outer(y, y)
    sL = numpy.sum(lMat, axis=1)
    ssL = numpy.sum(sL)
    # H L H expanded: L - (sL 1' + 1 sL')/n + (1'L1)/n^2
    return lMat - numpy.add.outer(sL, sL) / ny[0] + ssL / (ny[0] * ny[0])
def BiasedHSIC(self, x, y, kernelx=None, kernely=None):
    """
    Compute the biased estimator of HSIC.

    @param x The data; 1-D input uses a plain outer product, 2-D input
             goes through the kernel.
    @param y The labels.
    @param kernelx The kernel on the data, default to linear kernel.
    @param kernely The kernel on the labels, default to linear kernel.
    """
    nx = x.shape
    ny = y.shape
    assert nx[0] == ny[0], \
        "Argument 1 and 2 have different number of data points"
    if len(nx) > 1:
        # Create the default kernel lazily so each call gets a fresh
        # instance (a def-time default is shared across all calls).
        if kernelx is None:
            kernelx = vector.CLinearKernel()
        kMat = kernelx.Dot(x, x)
    else:
        # numpy.outerproduct was the old Numeric spelling and no longer
        # exists in numpy; numpy.outer is the supported equivalent.
        kMat = numpy.outer(x, x)
    if kernely is None:
        # Omit the argument so ComputeHLH applies its own default kernel.
        hlhMat = self.ComputeHLH(y)
    else:
        hlhMat = self.ComputeHLH(y, kernely)
    # A single numpy.sum over the matrix replaces the redundant nested sums.
    return numpy.sum(kMat * hlhMat) / ((nx[0] - 1) * (nx[0] - 1))
def UnBiasedHSIC(self, x, y, kernelx=None, kernely=None):
    """
    Compute the UNbiased estimator of HSIC.

    Args:
        x: The data (n data points along axis 0).
        y: The labels (same number of data points as x).
        kernelx: The kernel on the data, default to linear kernel.
        kernely: The kernel on the labels, default to linear kernel.

    Returns:
        HSIC score. Requires n >= 4, otherwise a denominator is zero.
    """
    # Lazy defaults: the original def-time defaults created one shared
    # kernel instance at import time (mutable-default-argument pitfall).
    if kernelx is None:
        kernelx = vector.CLinearKernel()
    if kernely is None:
        kernely = vector.CLinearKernel()

    nx = x.shape
    ny = y.shape
    assert nx[0] == ny[0], \
        "Argument 1 and 2 have different number of data points"
    n = nx[0]

    # Zero the diagonals, as the unbiased estimator requires;
    # numpy.fill_diagonal replaces the setdiag0 helper.
    kMat = kernelx.Dot(x, x)
    numpy.fill_diagonal(kMat, 0)
    lMat = kernely.Dot(y, y)
    numpy.fill_diagonal(lMat, 0)

    sK = kMat.sum(axis=1)
    ssK = sK.sum()
    sL = lMat.sum(axis=1)
    ssL = sL.sum()

    # HSIC_1 = [ sum(K*L) + (1'K1)(1'L1)/((n-1)(n-2))
    #            - 2 (sK . sL)/(n-2) ] / (n(n-3))
    kMat *= lMat  # in-place elementwise product (same as the original __imul__)
    sK *= sL
    return (kMat.sum()
            + (ssK * ssL) / ((n - 1) * (n - 2))
            - 2 * sK.sum() / (n - 2)) / (n * (n - 3))
def ComputeHLH(self, y, kernely=None):
    """
    Compute HLH given the labels.

    H = I - (1/n) 11' is the centering matrix and L is the kernel matrix
    on the labels; the product H L H is the centered label kernel.

    @param y The labels; 1-D input uses a plain outer product, 2-D input
             goes through the kernel.
    @param kernely The kernel on the labels, default to linear kernel.
    """
    ny = y.shape
    if len(ny) > 1:
        # Construct the default kernel lazily (and only on the path that
        # needs it) instead of a shared def-time default instance.
        if kernely is None:
            kernely = vector.CLinearKernel()
        lMat = kernely.Dot(y, y)
    else:
        # numpy.outerproduct was the old Numeric spelling and no longer
        # exists in numpy; numpy.outer is the supported equivalent.
        lMat = numpy.outer(y, y)
    sL = numpy.sum(lMat, axis=1)
    ssL = numpy.sum(sL)
    # H L H expanded: L - (sL 1' + 1 sL')/n + (1'L1)/n^2
    return lMat - numpy.add.outer(sL, sL) / ny[0] + ssL / (ny[0] * ny[0])
def main():
    """
    Experiment driver.

    Loads the data matrix and four label vectors, standardizes the data,
    then for a growing number of BAHSIC-selected features trains a linear
    regressor per label set and plots accuracy against the number of
    features kept.
    """
    # load / normalize / CBAHSIC / accuracyclassification are project
    # helpers defined elsewhere -- presumably returning numpy arrays;
    # verify against their definitions.
    X = load('xdata.h5')
    y1 = load('y1data.h5')
    y2 = load('y2data.h5')
    y3 = load('y3data.h5')
    y4 = load('y4data.h5')
    # print X.shape
    # print y1.shape
    data_no = X.shape[0]
    y1 = normalize(y1, data_no)
    y2 = normalize(y2, data_no)
    y3 = normalize(y3, data_no)
    y4 = normalize(y4, data_no)
    # print y
    # Normalize the data.
    m = X.mean(0)
    s = X.std(0)
    # print m,s
    # In-place standardization: X = (X - m) / s. NOTE(review): __idiv__ is
    # Python-2-only; under Python 3 this would need __itruediv__ / /=.
    X.__isub__(m).__idiv__(s)
    # foo=np.copy(X)
    xlabels = []
    ylabels = []
    # Keep a pristine copy so each iteration reselects from the full matrix.
    foo = np.copy(X)
    for features_tokeep in range(5000, 10001, 1000):
        X = np.copy(foo)
        xlabels.append(features_tokeep)
        print xlabels
        # Rank features by BAHSIC against y1 with linear kernels.
        bahsic = CBAHSIC()
        bhs = bahsic.BAHSICRaw(X, y1, vector.CLinearKernel(), vector.CLinearKernel(), features_tokeep, 0.1)
        # Copy the selected columns into a dense (data_no, features_tokeep)
        # matrix. NOTE(review): indexing bhs[features_tokeep + j] presumably
        # takes the tail of the ranking -- confirm against BAHSICRaw's output.
        hsicfeatures = np.zeros(shape=(data_no, features_tokeep))
        for i in range(0, data_no):
            for j in range(0, features_tokeep):
                hsicfeatures[i][j] = X[i][bhs[features_tokeep + j]]
        X = hsicfeatures
        print X.shape
        # y1=y1.reshape((y1.shape[0],))
        # print y1.shape
        #feature extraction
        # test train split
        # xlabels=[]
        # ylabels=[]
        # y1labels=[]
        # y2labels=[]
        # y3labels=[]
        # y4labels=[]
        # pca = PCA(n_components=pcanum)
        # X=pca.fit_transform(foo)
        # print X.shape
        # Split once, keeping the four label sets aligned row-by-row.
        labels = list(zip(y1, y2, y3, y4))
        xtrain, xtest, ytrain, ytest = train_test_split(X, labels, test_size=0.33, random_state=42)
        # NOTE(review): subscripting zip(...) is Python-2-only; Python 3
        # returns an iterator and would need list(zip(*ytest)).
        tmp = zip(*ytest)
        ytest1 = tmp[0]
        ytest2 = tmp[1]
        ytest3 = tmp[2]
        ytest4 = tmp[3]
        ytest1 = np.array(ytest1)
        ytest2 = np.array(ytest2)
        ytest3 = np.array(ytest3)
        ytest4 = np.array(ytest4)
        tmp = zip(*ytrain)
        ytrain1 = tmp[0]
        ytrain2 = tmp[1]
        ytrain3 = tmp[2]
        ytrain4 = tmp[3]
        ytrain1 = np.array(ytrain1)
        ytrain2 = np.array(ytrain2)
        ytrain3 = np.array(ytrain3)
        ytrain4 = np.array(ytrain4)
        # One regressor refit per label set; predictions collected per set.
        # clf=SVR(kernel='rbf')
        clf = LinearRegression()
        clf.fit(xtrain, ytrain1)
        vals1 = clf.predict(xtest)
        # print vals1
        # print accuracyregression(ytest1,vals1)
        # clf=SVR(kernel='linear',C=1e3)
        clf.fit(xtrain, ytrain2)
        vals2 = clf.predict(xtest)
        # print accuracyregression(ytest2,vals2)
        # clf=SVR(kernel='linear',C=1e3)
        clf.fit(xtrain, ytrain3)
        vals3 = clf.predict(xtest)
        # print accuracyregression(ytest3,vals3)
        # clf=SVR(kernel='linear',C=1e3)
        clf.fit(xtrain, ytrain4)
        vals4 = clf.predict(xtest)
        # print accuracyregression(ytest4,vals4)
        # Stack truth/predictions as 4 x n_test matrices for scoring.
        truth = np.zeros((4, len(ytest1)))
        predicted = np.zeros((4, len(vals1)))
        for j in range(len(ytest1)):
            truth[0][j] = ytest1[j]
            predicted[0][j] = vals1[j]
            truth[1][j] = ytest2[j]
            predicted[1][j] = vals2[j]
            truth[2][j] = ytest3[j]
            predicted[2][j] = vals3[j]
            truth[3][j] = ytest4[j]
            predicted[3][j] = vals4[j]
        ylabels.append(accuracyclassification(truth, predicted))
        # print accuracyclassification(truth,predicted)
    print xlabels
    print ylabels
    # Accuracy as a function of the number of kept features.
    plt.plot(xlabels, ylabels)
    plt.savefig("classificationbasic1.png")
# Normalize the labels. y = 1.0 * y tmp_no = numpy.sum(y) pno = (data_no + tmp_no) / 2 nno = (data_no - tmp_no) / 2 y[y > 0] = y[y > 0] / pno y[y < 0] = y[y < 0] / nno # Normalize the data. m = X.mean(0) s = X.std(0) X.__isub__(m).__idiv__(s) t1 = time.clock() tmp = bahsic.BAHSICRaw(X, y, vector.CLinearKernel(), vector.CLinearKernel(), features_tokeep, 0.1) t2 = time.clock() print "time taken: " + str(t2 - t1) print '--rank of the features' print '--better features towards the end of the list:' print tmp hsicfeatures = numpy.zeros(shape=(data_no, features_tokeep)) for i in range(0, data_no): for j in range(0, features_tokeep): hsicfeatures[i][j] = X[i][tmp[features_tokeep + j]] numpy.savetxt(file_out, hsicfeatures) if (sys.argv == 5): numpy.savetxt('original.csv', X)