Esempio n. 1
0
def SpamClassifier(features, skclassifier, myclassifier):
    """Compare two KNN classifiers on the spam data set and print accuracies.

    The spam data is loaded and normalized, optionally reduced to the
    columns named in ``features``, partitioned into 10 folds, and split
    into train/test sets via ``dl.get_train_and_test(all_folds, 0)``.
    Both classifiers are wrapped in ``hw7u.KNN`` and their predictions on
    the test set are scored with ``accuracy_score``.

    Args:
        features: The string ``'all'`` to keep every column, or a
            collection of column indices to keep.
        skclassifier: Passed as ``classifier=`` to ``hw7u.KNN`` for the
            run reported as "SciKit".
        myclassifier: Passed as ``classifier=`` to ``hw7u.KNN`` for the
            run reported as "My".

    Returns:
        None. Progress messages and accuracies are printed.

    NOTE(review): ``hw4u.split_truth_from_data`` presumably expects the
    label in the last column, so ``features`` likely has to include that
    column's index - confirm against callers.
    """
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    if features != 'all':
        # Only use the features passed in the features array.  The filtered
        # columns (not the full set) are transposed back into rows, once,
        # after the selection is complete.
        by_col = utils.transpose_array(data)
        selected = [col for i, col in enumerate(by_col) if i in features]
        data = utils.transpose_array(selected)
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    # Single-argument print(...) behaves the same as the print statement.
    print('start MyKNN')
    knn = hw7u.KNN(classifier=myclassifier)
    print('start scikit')
    knnsci = hw7u.KNN(classifier=skclassifier)
    print('start my pred')
    y_pred = knn.predict(X_test, X, y)
    # Scored once and reused in the summary line below.
    my_accuracy = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('My Accuracy: {}'.format(my_accuracy))
    print('start sk pred')
    y_sci = knnsci.predict(X_test, X, y)
    sk_accuracy = accuracy_score(fix_y(y_test), fix_y(y_sci))
    print('SciKit Accuracy: {}  My Accuracy: {}'.format(sk_accuracy, my_accuracy))
def split_truth_from_data(data):
    """Separate the truth column from the feature columns.

    Assumes that the truth column is the last column.  Truth values equal
    to 0 are rewritten to -1 in the returned truth sequence.

    Args:
        data: Row-major table whose last column holds the truth values.

    Returns:
        A ``(truth, features)`` pair: the last column as a single sequence,
        and the remaining columns transposed back into rows.
    """
    # Truth values come from the last column of the transposed table.
    labels = utils.transpose_array(data)[-1]
    # Everything except the last column, turned back into rows.
    feature_cols = utils.transpose_array(data)[:-1]
    features = utils.transpose_array(feature_cols)
    for idx, label in enumerate(labels):
        if label == 0:
            labels[idx] = -1
    return labels, features
def normalize_data(X, skip=None):
    """Normalize each column of ``X`` with ``normalize_col``.

    Args:
        X: Row-major table of values.
        skip: Optional index of a column to leave out.  A negative index
            is converted to its positive equivalent using the width of
            the first row.

    Returns:
        The normalized columns transposed back into rows.  The column at
        ``skip`` is omitted from the result entirely; it is not passed
        through unnormalized.
    """
    if skip is not None and skip < 0:
        skip += len(X[0])
    columns = utils.transpose_array(X)
    scaled_columns = []
    for idx, column in enumerate(columns):
        if idx == skip:
            continue
        # The second element of normalize_col's result is not used here.
        scaled, _ = normalize_col(column)
        scaled_columns.append(scaled)
    return utils.transpose_array(scaled_columns)
def clean_data(X, remove_constant=False):
    """Drop rows flagged by ``check_type`` and, optionally, singular columns.

    Each column of ``X`` is passed through ``check_type``, which returns the
    (possibly converted) column, the indices of rows it considers bad, and
    a flag marking the column as singular.  A row is removed if any column
    flagged it.

    Args:
        X: Row-major table of values.
        remove_constant: When true, columns flagged as singular by
            ``check_type`` are left out of the result.

    Returns:
        A list of the surviving rows, in their original order.
    """
    # A set gives constant-time membership for both the de-duplication here
    # and the row filter below (row indices are hashable).
    nan_rows = set()
    new_by_col = []
    for x in utils.transpose_array(X):
        col, bad_rows, is_singular = check_type(x)
        nan_rows.update(bad_rows)
        # Keep the column unless it is singular AND removal was requested.
        if not (is_singular and remove_constant):
            new_by_col.append(col)
    upright = utils.transpose_array(new_by_col)
    return [row for i, row in enumerate(upright) if i not in nan_rows]
Esempio n. 5
0
def check_cols(X):
    """Return the indices of columns in ``X`` whose standard deviation is 0.

    Args:
        X: Row-major table of numeric values.

    Returns:
        A list of column indices, in ascending order, for which
        ``np.std`` of the column equals 0.
    """
    columns = utils.transpose_array(X)
    return [idx for idx, column in enumerate(columns) if np.std(column) == 0]
 def get_initial_thresholds(self, data):
     """Build the candidate split thresholds for every column of ``data``.

     For each column the distinct values are converted to float and
     sorted.  The candidates are: the smallest value minus 0.01, the
     midpoint between each pair of neighbouring distinct values, and the
     largest value plus 0.01.

     Args:
         data: Row-major table of numeric values.

     Returns:
         A list with one ascending list of thresholds per column.
     """
     thresholds = []
     for column in utils.transpose_array(data):
         distinct = sorted(set(float(v) for v in np.array(column)))
         # Sentinel just below the smallest observed value.
         cuts = [distinct[0] - .01]
         for lower, upper in zip(distinct, distinct[1:]):
             half_gap = (upper - lower)/2
             cuts.append(lower + half_gap)
         # Sentinel just above the largest observed value.
         cuts.append(distinct[-1] + .01)
         thresholds.append(cuts)
     return thresholds
 def split_presence_array(self, X, column, threshold):
     """Split the current presence mask in two around a column threshold.

     Rows whose entry in ``self.presence_array`` is not 1 get 0 in both
     outputs.  For the remaining rows, a value greater than ``threshold``
     sets the second mask; any other value sets the first mask.

     Args:
         X: Row-major table of values.
         column: Index of the column to compare against ``threshold``.
         threshold: Split point for the chosen column.

     Returns:
         A pair of 0/1 lists ``(array_l, array_r)``, one entry per row.
     """
     feature = utils.transpose_array(X)[column]
     left_mask = []
     right_mask = []
     for row, value in enumerate(feature):
         if self.presence_array[row] != 1:
             # Row is not part of the current subset: absent on both sides.
             left_mask.append(0)
             right_mask.append(0)
             continue
         above = 1 if value > threshold else 0
         left_mask.append(1 - above)
         right_mask.append(above)
     return left_mask, right_mask
    def choose_best_feature(self):
        """Pick the column of ``self.data_subset`` with the lowest weighted error.

        Every column is scored with ``self.compute_info_gain`` against
        ``self.truth_subset``.  The first column whose weighted error is
        strictly lower than every earlier one (starting from 1.5) wins.

        Returns:
            ``(best_col, max_info_gain, col_threshold)``: the winning column
            index with the info gain and threshold reported for it.  If no
            column beats the initial error, these are ``None``, ``-1`` and
            ``None``; a message is printed and ``self.print_branch(False)``
            is called.
        """
        columns = utils.transpose_array(self.data_subset)
        best_col = None
        col_threshold = None
        max_info_gain = -1
        min_weighted_error = 1.5
        for j, column in enumerate(columns):
            info_gain, threshold, weighted_error = self.compute_info_gain(column, self.truth_subset)
            #TODO - fix objective function so it is organized
            # Selection is driven by weighted error only; info gain is just
            # carried along for the winning column.
            if not (weighted_error < min_weighted_error):
                continue
            best_col = j
            min_weighted_error = weighted_error
            max_info_gain = info_gain
            col_threshold = threshold

        if best_col is None:
            print("BEST COL is NONE")
            self.print_branch(False)

        return best_col, max_info_gain, col_threshold