import numpy
import load_data
import representation
from sklearn import svm
from scipy.spatial.distance import correlation
from scipy.spatial.distance import cosine
from scipy.spatial.distance import euclidean
# NOTE(review): svm / correlation / cosine / euclidean are imported but not used
# in the visible portion of this script — presumably used by code past this
# chunk boundary (the for-loop below is cut off mid-body). Do not remove.

#### input paths
# Relative paths to the train/dev/test tweet files; load_data is a
# project-local module (not visible here) that parses them.
trainFilePath = '../Data/train-data.txt'
devFilePath = '../Data/dev-data.txt'
testFilePath = '../Data/task_2_test_set_to_release.txt'

#### loading data
# Each loader appears to return (labels, tweet texts) — TODO confirm against
# load_data; get_test returns (tweet IDs, tweet texts) instead of labels.
trainClassID, trainTweetTxt = load_data.get_train(trainFilePath)
devClassID, devTweetTxt = load_data.get_dev(devFilePath)
# NOTE(review): "testTweedID" looks like a typo for "testTweetID"; kept as-is
# because later (unseen) code may reference this exact name.
testTweedID, testTweetTxt = load_data.get_test(testFilePath)
# Merge dev labels into the training labels so the model can be trained on
# train + dev combined (the feature matrices are concatenated below to match).
trainClassID = trainClassID + devClassID

#### representing as a matrix
# NOTE(review): mini_df is never used in the visible span — presumably a
# min-document-frequency setting consumed further down; verify before removing.
mini_df = 1
# Sweep over n-gram sizes k; range(1, 2) currently yields only k = 1.
for k in range (1, 2):
    # get_dtm builds document-term matrices for all three splits with the same
    # vocabulary (project-local `representation` module — semantics assumed,
    # TODO confirm). Returns scipy sparse matrices (they support .todense()).
    trainDTMatirix, devDTMatirix, testDTMatirix = representation.get_dtm(trainTweetTxt, devTweetTxt, testTweetTxt, k)
    # Alternative TF-IDF representation, currently disabled:
##    trainDTMatirix, devDTMatirix, testDTMatirix = representation.get_tdidf(trainTweetTxt, devTweetTxt, testTweetTxt, k)
    # Densify the sparse matrices (memory-heavy for large vocabularies).
    trainDTMatirix = trainDTMatirix.todense()
    devDTMatirix = devDTMatirix.todense()
    testDTMatirix = testDTMatirix.todense()
    # Stack dev rows under train rows so features line up with the merged
    # trainClassID labels above.
    trainDTMatirix = numpy.concatenate((trainDTMatirix, devDTMatirix))
    # NOTE(review): loop body is cut off at this chunk boundary — training /
    # evaluation (likely the svm import above) continues beyond this view.
We also apply a nonlinear transform (x1, x2) -> (x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|). We compare our error with and without regularization '''
# NOTE(review): the line above is the tail of a module docstring whose opening
# ''' lies before this chunk — it is a string literal, left byte-identical.
# NOTE(review): this fragment uses Python 2 print statements; it will not run
# under Python 3 without modification.


def nonlinear_transform(x):
    """Map each 2-D sample to the 7-D feature vector
    (x1, x2, x1^2, x2^2, x1*x2, |x1 - x2|, |x1 + x2|).

    x: iterable of 2-element samples (indexable by [0] and [1]).
    Returns a numpy array of shape (len(x), 7).
    """
    # Per-sample feature map; the lambda parameter shadows the outer `x`.
    f = lambda x: [ x[0], x[1], x[0]**2, x[1]**2, x[0] * x[1], abs(x[0] - x[1]), abs(x[0] + x[1]) ]
    # `np` is presumably `import numpy as np` from above this chunk — TODO confirm.
    return np.array([f(x_i) for x_i in x])


if __name__ == '__main__':
    # get_train/get_test and the `h2` module are defined/imported outside this
    # chunk (h2 appears to be a sibling homework module providing
    # LinearRegressionClassifier, calc_error_rate, boldprint — assumed, verify).
    x_train, y_train = get_train()
    x_test, y_test = get_test()
    # Lift both splits into the 7-D transformed feature space.
    x_train = nonlinear_transform(x_train)
    x_test = nonlinear_transform(x_test)
    # Unregularized linear-regression classifier on the 7 transformed features.
    lrc = h2.LinearRegressionClassifier(7, x_train, y_train)
    in_sample_err = h2.calc_error_rate(y_train, lrc.classify(x_train))
    out_sample_err = h2.calc_error_rate(y_test, lrc.classify(x_test))
    h2.boldprint("Without Regularization:")
    print "In sample err", in_sample_err
    print "Out of sample err", out_sample_err
    # Regularized variant (weight-decay strength lambda = 0.5).
    _lambda = .5
    # NOTE(review): statement is cut off at this chunk boundary; the remaining
    # constructor arguments (presumably passing _lambda) lie beyond this view.
    lrc = h2.LinearRegressionClassifier(7, x_train, y_train,