def dec_or_reg_tree(df_train, df_test, Y):
    """Fit a decision tree (binary truth) or regression tree (continuous truth).

    :param df_train: training DataFrame
    :param df_test: test DataFrame
    :param Y: name of the truth column present in both frames
    :return: [train_error, test_error]
    """
    binary = utils.check_binary(df_train[Y])
    if binary:
        newtree = treeHW4.TreeOptimal(max_depth=1)
        y = list(df_train[Y])
        nondf_train = utils.pandas_to_data(df_train)
        nondf_test = utils.pandas_to_data(df_test)
        newtree.fit(nondf_train, y)
        predict = newtree.predict(nondf_train)
        error_train = mystats.get_error(predict, y, binary)
        y = utils.pandas_to_data(df_test[Y])
        predict = newtree.predict(nondf_test)
        # FIX: pass `binary` like every other get_error call in this function;
        # it was dropped here, so the test error fell back to get_error's default.
        error_test = mystats.get_error(predict, y, binary)
    else:
        # Regression path: grow a depth-5 tree, then re-route the test rows
        # through the same nodes to score held-out error.
        node = mytree.Node(np.ones(len(df_train)))
        hw1.branch_node(node, df_train, 5, Y)
        model = mytree.Tree(node)
        predict = model.predict_obj()
        error_train = mystats.get_error(predict, df_train[Y], binary)
        node.presence = np.ones(len(df_test))
        hw1.test_node(node, df_test, Y)
        test_tree = mytree.Tree(node)
        predict = test_tree.predict_obj()
        error_test = mystats.get_error(predict, df_test[Y], binary)
    return [error_train, error_test]
def runDigits(n, skclf, myclf): mnsize = n df = hw6u.load_mnist_features(mnsize) data = utils.pandas_to_data(df) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False) y, X = np.asarray(y, dtype=np.float), np.asarray(X) y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False) y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float) print 'my fit' clf = OneVsRestClassifier(myclf).fit(X, y) print 'scikit fit' skclf = skclf.fit(X, y) print 'my predict' y_pred = clf.predict(X_test) myacc = accuracy_score(y_test, y_pred) print '({})'.format(myacc) print 'scikit predict' sk_pred = skclf.predict(X_test) print sk_pred print y_test print y_pred print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
def SpamClassifier(features, skclassifier, myclassifier): data = utils.pandas_to_data(utils.load_and_normalize_spam_data()) k = 10 if features != 'all': # Only use the features passed in the features array new = [] t = utils.transpose_array(data) for i in xrange(len(t)): if i in features: new.append(t[i]) data = utils.transpose_array(t) all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print 'start MyKNN' knn = hw7u.KNN(classifier=myclassifier) print 'start scikit' knnsci = hw7u.KNN(classifier=skclassifier) print 'start my pred' y_pred = knn.predict(X_test, X, y) print 'My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred))) print 'start sk pred' y_sci = knnsci.predict(X_test, X, y) print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred)))
def runDigitsDensity(n,_i, j): metric = ['minkowski', 'cosine', 'gaussian', 'poly2'] ma = hw7u.Kernel(ktype=metric[j]+'_sci').compute #skclf = KernelDensity(metric=ma) myclf = hw7u.MyKNN(metric=metric[j], density=True) mnsize = n df = hw6u.load_mnist_features(mnsize) data = utils.pandas_to_data(df) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False) y, X = np.asarray(y, dtype=np.float), np.asarray(X) y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False) y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float) print 'my fit' clf = OneVsRestClassifier(myclf).fit(X, y) print 'scikit fit' #skclf = skclf.fit(X, y) print 'my predict' y_pred = clf.predict(X_test) myacc = accuracy_score(y_test, y_pred) print '({})'.format(myacc) #print 'scikit predict' #sk_pred = skclf.predict(X_test) #print sk_pred print y_test print y_pred #print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc) print 'My Accuracy: {}'.format(myacc)
def q1a():
    """SVM on Spam Data (scikit SVC, polynomial kernel).

    length train: 4140 length test 461
    train acc: 0.806763285024 test acc: 0.819956616052
    """
    spam_rows = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    svm_q1(spam_rows, svm.SVC(kernel='poly'))
def multiclassSVC(classifier, sz=2000): mnsize = sz df = hw6u.load_mnist_features(mnsize) data = utils.pandas_to_data(df) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False) y, X = np.asarray(y), np.asarray(X) y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False) y_test, X_test = np.asarray(y_test), np.asarray(X_test) print 'Beginning analysis: {}'.format(X.shape) #clf = OneVsRestClassifier(classifier, n_jobs=4).fit(X, y) clf = OneVsOneClassifier(classifier).fit(X, y) #clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=10, random_state=0).fit(np.asarray(X), y) y_pred = clf.predict(X) print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(y_pred), fix_y(y)), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))) print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(clf.predict(X)), fix_y(y)), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test))))
def relief(n): max_iters = 1 j = 0 i = 1 n_neighbors = [1, 3, 7] metric = ['minkowski', 'cosine', 'gaussian', 'poly2'] ma = hw7u.Kernel(ktype=metric[j]).compute data = utils.pandas_to_data(utils.load_and_normalize_spam_data()) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) loops = 0 weights = np.zeros(len(X[0])) loops += 1 n_features = len(X[0]) n_samples = len(X) for j in range(n_features): #feature for i in range(n_samples): # data closest_same = None closest_opp = None for z_i in range(n_samples): if z_i == i: continue diff = (X[z_i][j] - X[i][j]) ** 2 if y[z_i] == y[i]: # same if closest_same is None or diff < closest_same: closest_same = diff else: # opp if closest_opp is None or diff < closest_opp: closest_opp = diff weights[j] += (-closest_same + closest_opp) if i % 1000 == 0: print 'feature {} of {}, sample {} of {}'.format(j, n_features, i, n_samples) print weights return sorted(zip(weights, range(len(weights))), reverse=True)[:n][1]
def runSpamDensity(_i, j, features='all'): metric = ['gaussian', 'poly2', 'cosine_similarity', 'gaussian_density'] data = utils.pandas_to_data(utils.load_and_normalize_spam_data()) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print(len(X)) print(len(X_test)) myclassifier = hw7u.MyKNN(metric=metric[j], density=True) print 'start MyKNN' myclassifier.fit(X, y) #print 'start scikit' #knnsci = skclassifier.fit(X, y) print 'start my pred' y_pred = myclassifier.predict(X_test) print(y_pred) #print 'start sk pred' #y_sci = knnsci.score(X_test) #print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred))) print '2b: My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred)))
def test_mnist_load(): data = pd.read_csv('df_save_img_everything.csv') print len(data) X = utils.pandas_to_data(data) print X[0]
def q2_kernel_poly():
    """Run my SMO-based SVC with a polynomial kernel on the spam data.

    c = 1, tol = 1-e2, passes = 1
    """
    # FIX: the `data = ...` line was commented out, so `svm_q1(data, ...)`
    # raised NameError; restore it exactly as the sibling q2() does.
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    svm_q1(data, mysvm.SVC(mysvm.SMO, Kernel('poly')))
def q2():
    """Run my SMO-based SVC with a linear kernel on the spam data.

    c = 1, tol = 1-e2, passes = 1
    train acc: 0.904830917874 test acc: 0.917570498915
    """
    spam_rows = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    svm_q1(spam_rows, mysvm.SVC(mysvm.SMO, Kernel('linear')))