def SpamClassifier(features, skclassifier, myclassifier): data = utils.pandas_to_data(utils.load_and_normalize_spam_data()) k = 10 if features != 'all': # Only use the features passed in the features array new = [] t = utils.transpose_array(data) for i in xrange(len(t)): if i in features: new.append(t[i]) data = utils.transpose_array(t) all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print 'start MyKNN' knn = hw7u.KNN(classifier=myclassifier) print 'start scikit' knnsci = hw7u.KNN(classifier=skclassifier) print 'start my pred' y_pred = knn.predict(X_test, X, y) print 'My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred))) print 'start sk pred' y_sci = knnsci.predict(X_test, X, y) print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred)))
def split_truth_from_data(data):
    """Split row-major *data* into (truth, features).

    Assumes the truth column is the last column. Labels equal to 0 are
    remapped to -1 so they suit +/-1 style classifiers.
    """
    columns = utils.transpose_array(data)
    truth_rows = columns[-1]                         # last column: the labels
    data_rows = utils.transpose_array(columns[:-1])  # remaining columns, back to rows
    # Remap 0 -> -1 in place.
    for idx, label in enumerate(truth_rows):
        if label == 0:
            truth_rows[idx] = -1
    return truth_rows, data_rows
def normalize_data(X, skip=None):
    """Normalize every column of X except the one indexed by `skip`.

    skip: optional column index to exclude; a negative index is converted
        to its positive equivalent.

    NOTE(review): the skipped column is *dropped* from the output rather
    than passed through un-normalized, so the result has one fewer column
    whenever `skip` is given -- confirm with callers that this is intended.
    """
    # Translate a negative skip index (e.g. -1 for the last column).
    if skip is not None and skip < 0:
        skip += len(X[0])
    by_col = utils.transpose_array(X)
    normalized = []
    for j in range(len(by_col)):
        if skip != j:
            # is_singular is ignored here: constant columns are kept as-is.
            new_col, is_singular = normalize_col(by_col[j])
            normalized.append(new_col)
    return utils.transpose_array(normalized)
def clean_data(X, remove_constant=False):
    """Return X with invalid rows removed.

    Each column is run through check_type(), which yields the cleaned
    column, the indices of rows with bad values, and whether the column
    is constant. Any row flagged by any column is dropped; constant
    columns are also dropped when remove_constant is True.
    """
    bad_rows_seen = []
    kept_cols = []
    for col_data in utils.transpose_array(X):
        cleaned, bad_rows, is_singular = check_type(col_data)
        for row_idx in bad_rows:
            if row_idx not in bad_rows_seen:
                bad_rows_seen.append(row_idx)
        # Drop the column only when it is constant AND removal was requested.
        if not (is_singular and remove_constant):
            kept_cols.append(cleaned)
    rows = utils.transpose_array(kept_cols)
    return [row for i, row in enumerate(rows) if i not in bad_rows_seen]
def check_cols(X):
    """Return the indices of constant columns (standard deviation 0) in X."""
    return [idx for idx, column in enumerate(utils.transpose_array(X))
            if np.std(column) == 0]
def get_initial_thresholds(self, data):
    """Build candidate split thresholds for every feature column.

    For each column the candidates are: one value just below the minimum,
    the midpoint between each pair of consecutive distinct values, and one
    value just above the maximum.

    Returns a list with one sorted threshold list per column.

    Changes from original: removed the unused local `start = 100`, and
    replaced the list(set(...)) + .sort() pair with sorted(set(...)).
    """
    by_col = utils.transpose_array(data)
    thresholds = []
    for j in range(len(by_col)):
        # Distinct feature values, ascending.
        values = sorted(set(float(v) for v in np.array(by_col[j])))
        col_thresholds = [values[0] - .01]
        for i in range(1, len(values)):
            # Midpoint between consecutive distinct values.
            mid = (values[i] - values[i - 1]) / 2
            col_thresholds.append(values[i - 1] + mid)
        col_thresholds.append(values[-1] + .01)
        thresholds.append(col_thresholds)
    return thresholds
def split_presence_array(self, X, column, threshold):
    """Split the current presence mask on a single feature threshold.

    Rows active in self.presence_array go right (1 in the right mask) when
    their value in `column` exceeds `threshold`, otherwise left; inactive
    rows get 0 in both masks.

    Returns (left_mask, right_mask), each the same length as the column.
    """
    left_mask = []
    right_mask = []
    col_values = utils.transpose_array(X)[column]
    for i in range(len(col_values)):
        if self.presence_array[i] != 1:
            # Row is not part of this branch: absent from both sides.
            left_mask.append(0)
            right_mask.append(0)
        elif col_values[i] > threshold:
            left_mask.append(0)
            right_mask.append(1)
        else:
            left_mask.append(1)
            right_mask.append(0)
    return left_mask, right_mask
def choose_best_feature(self): by_col = utils.transpose_array(self.data_subset) max_info_gain = -1 min_weighted_error = 1.5 best_col = None col_threshold = None for j in range(len(by_col)): info_gain, threshold, weighted_error = self.compute_info_gain(by_col[j], self.truth_subset) #TODO - fix objective function so it is organized #if info_gain > max_info_gain: # best_ig_col = j # max_info_gain = info_gain # col_threshold = threshold if weighted_error < min_weighted_error: best_col = j min_weighted_error = weighted_error max_info_gain = info_gain col_threshold = threshold if best_col is None: print "BEST COL is NONE" self.print_branch(False) return best_col, max_info_gain, col_threshold