# Script: fit a default-parameter SVC on the bag-of-words training data.
from sklearn import svm

from clean_data import CleanData

__author__ = "Yacine Sibous"

cd = CleanData()

# Load the training rows produced by the project's CleanData helper.
print("Getting the training data.")
training_data = cd.bag_of_words(in_file="data/clean_train_input.csv")
print("Done collecting data.")

# Each row is indexed positionally: element 1 is used as the feature
# value and element 2 as the target label.
X = [row[1] for row in training_data]
y = [row[2] for row in training_data]

# Quick sanity check of the first few samples and labels.
print(X[:5])
print(y[:5])

# Fit a support vector classifier with scikit-learn's default settings.
clf = svm.SVC()
clf.fit(X, y)
def plot_feature_size(num_iter):
    """Tests various feature sizes and plots the error.

    For every feature count in a fixed list, the training data is rebuilt
    with ``CleanData(max_features=point)``, split so that 40% is held out for
    validation, and a ``NaiveBayes`` model is trained. The validation and
    training errors are averaged over the runs that completed and plotted
    against the number of features (validation in red, training in blue).

    A run that raises ``MemoryError`` is skipped and excluded from the
    average. If no run completes for a point, NaN is recorded so the point
    shows up as a gap in the plot instead of a misleading zero error.

    Args:
        num_iter: Number of times to test for each point.
    """
    points = [100, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000,
              9000, 10000]
    errors = []
    train_errors = []

    # Iterate over all points defined.
    for point in points:
        print("Testing for point %s features." % point)
        # Float accumulators so the average is never floor-divided.
        error = 0.0
        train_error = 0.0
        completed = 0

        # Repeat the test the desired number of times.
        # NOTE(review): random_state=0 fixes the split, so repeated runs only
        # differ if CleanData itself samples randomly -- confirm.
        for _ in range(num_iter):
            cd = CleanData(tfidf=True, max_train_size=25000,
                           max_features=point)
            try:
                # Get and train data.
                training_data = cd.bag_of_words(
                    in_file="data/clean_train_input.csv")
                ids, X, y = get_numpy_matrices(training_data)
                del training_data

                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.4, random_state=0)
                del X, y, ids

                nb = NaiveBayes()
                nb.train(X_train, y_train)

                # Calculate training and validation errors.
                run_error = nb.compute_error(nb.classify(X_test), y_test)
                run_train_error = nb.compute_error(nb.classify(X_train),
                                                   y_train)
            except MemoryError:
                print("Memory error. Continuing.")
                continue

            # Accumulate only after the whole run succeeded, so both sums
            # always cover exactly the same set of runs.
            error += run_error
            train_error += run_train_error
            completed += 1

            # Release the large matrices before the next run allocates.
            del X_train, X_test, y_train, y_test

        if completed:
            errors.append(error / completed)
            train_errors.append(train_error / completed)
        else:
            # No successful run for this point: record a gap, not 0 error.
            errors.append(float("nan"))
            train_errors.append(float("nan"))

    # PLOT.
    plt.figure(2)
    plt.title("Error vs Features")
    plt.xlabel("Number of features")
    plt.ylabel("Error")
    plt.plot(points, errors, '-ro')
    plt.plot(points, train_errors, '-bo')
    plt.show()
# Script: train a LinearSVC on tf-idf n-gram features, report the 3-fold
# cross-validation accuracy, and load the final test inputs.
import csv

import numpy as np
from sklearn import svm
try:
    # scikit-learn >= 0.18 exposes cross_val_score here.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Older releases only ship the since-removed cross_validation module.
    from sklearn.cross_validation import cross_val_score

from clean_data import CleanData

__author__ = "Yacine Sibous, Jana Pavlasek"

# Initialize data for final submission.
cd = CleanData(tfidf=True, max_features=2500000, n_grams=3)

# Get features and output.
print('Getting Training data.')
X, y = cd.bag_of_words(in_file="data/clean_train_input.csv", sparse=True)
print('Done collecting data.')

# Train.
print('Training the model.')
lin_clf = svm.LinearSVC()
lin_clf.fit(X, y)
print('Done training.')

# 3-fold cross validation. cv is passed explicitly because the library
# default changed from 3 to 5 folds in newer scikit-learn releases.
print('Cross Validation')
c_validation = cross_val_score(lin_clf, X, y, scoring='accuracy', cv=3)
print(c_validation.mean())

# Get and predict on the final test data.
print('Collecting test data.')
test = cd.get_x_in(sparse=True)
print('Done collecting data.')