Beispiel #1
0
# Remove columns 4 and 14 along axis 1 (assuming `delete` is numpy.delete).
# The original note describes this as dropping the redundant
# education-number feature; what the second column holds is not visible
# here -- TODO confirm.
x = delete(x, (4, 14), 1)

# Enumerate parameters and instantiate the Imputer.
imp = Imputer()


def missing_data_cond(value):
    """Flag entries equal to the '?' missing-data marker."""
    return value == '?'


# Column indices handed to the imputer as `cat_cols` (presumably the
# categorical features -- verify against the Imputer API).
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
n_neighbors = 5  # not referenced in the part of the script visible here

# NOTE: the drop strategy (imp.drop(x, missing_data_cond)) is disabled in
# this script.

# Replace missing values with random existing values.
# print(...) with one parenthesised argument behaves the same on
# Python 2 and Python 3, unlike the statement form.
print('imputing with random replacement')
data_replace = imp.replace(x, missing_data_cond)

# Replace missing values with a per-feature summary (the mode).
print('imputing with feature summarization (mode)')


def summ_func(values):
    """Return the first field of ``mode``'s result for *values*."""
    return mode(values)[0]


data_mode = imp.summarize(x, summ_func, missing_data_cond)

# Replace categorical features with one-hot rows.
print('imputing with one-hot')
data_onehot = imp.binarize_data(x, cat_cols)

# Replace missing data with predictions from a random forest.
print('imputing with predicted values from random forest')
clf = RandomForestClassifier(n_estimators=100, criterion='gini')
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)
Beispiel #2
0
                                              missing_data_symbol,
                                              monotone=monotone)

    # The keys of feat_imp_ids are reported as the columns with missing data.
    miss_data_cols = feat_imp_ids.keys()
    print 'Missing data cols {}'.format(miss_data_cols)

    # One entry per imputation strategy, keyed by strategy name.
    # 'RawData' stores pert_data itself (same object, not a copy).
    data_dict = {}
    data_dict['RawData'] = pert_data

    # drop observations with missing variables
    print 'imputing with drop'
    data_dict['Drop'] = imp.drop(pert_data, miss_data_cond)

    # replace missing values with random existing values
    print 'imputing with random replacement'
    data_dict['RandomReplace'] = imp.replace(pert_data, miss_data_cond)

    # replace missing values with feature summary
    # summ_func keeps only the first field of mode's result (presumably
    # scipy.stats.mode, whose first field is the modal value -- confirm
    # against the file's imports).
    print 'imputing with feature summarization (mode)'
    summ_func = lambda x: mode(x)[0]
    data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

    # replace missing data with predictions using random forest
    # No classifier is passed here, so imp.predict presumably falls back on
    # its own default -- verify against the Imputer API.
    print 'imputing with Random Forest'
    data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond)

    # replace missing data with values obtained after factor analysis
    # NOTE(review): the message says "PCA" while the call is
    # imp.factor_analysis -- confirm which technique is actually applied.
    print 'imputing with PCA'
    data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

    # replace missing data with knn
Beispiel #3
0
# Load the training set from CSV as an array of Python objects.
# NOTE(review): depending on the NumPy version and its `encoding` default,
# entries may be loaded as bytes rather than str under Python 3; confirm
# the "?" comparison below still matches before relying on it there.
x = np.genfromtxt("data/votes_train.csv", delimiter=",", dtype=object)  # use training set

# Enumerate parameters and instantiate the Imputer.
imp = Imputer()


def missing_data_cond(value):
    """Flag entries equal to the "?" missing-data marker."""
    return value == "?"


# Column indices handed to the imputer as `cat_cols` (presumably the
# categorical features -- verify against the Imputer API).
cat_cols = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
n_neighbors = 3  # lower for votes; not referenced in the part visible here

# NOTE: the drop strategy (imp.drop(x, missing_data_cond)) is disabled in
# this script.

# Replace missing values with random existing values.
# print(...) with one parenthesised argument behaves the same on
# Python 2 and Python 3, unlike the statement form.
print("imputing with random replacement")
data_replace = imp.replace(x, missing_data_cond)

# Replace missing values with a per-feature summary (the mode).
print("imputing with feature summarization (mode)")


def summ_func(values):
    """Return the first field of ``mode``'s result for *values*."""
    return mode(values)[0]


data_mode = imp.summarize(x, summ_func, missing_data_cond)

# Replace categorical features with one-hot rows.
print("imputing with one-hot")
data_onehot = imp.binarize_data(x, cat_cols)

# Replace missing data with predictions from a random forest.
print("imputing with predicted values from random forest")
clf = RandomForestClassifier(n_estimators=100, criterion="gini")
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)
Beispiel #4
0
                                              missing_data_symbol,
                                              monotone=monotone)

    # The keys of feat_imp_ids are reported as the columns with missing data.
    miss_data_cols = feat_imp_ids.keys()
    print 'Missing data cols {}'.format(miss_data_cols)

    # One entry per imputation strategy, keyed by strategy name.
    # 'RawData' stores pert_data itself (same object, not a copy).
    data_dict = {}
    data_dict['RawData'] = pert_data

    # drop observations with missing variables
    print 'imputing with drop'
    data_dict['Drop'] = imp.drop(pert_data, miss_data_cond)

    # replace missing values with random existing values
    print 'imputing with random replacement'
    data_dict['RandomReplace'] = imp.replace(pert_data, miss_data_cond)

    # replace missing values with feature summary
    # summ_func keeps only the first field of mode's result (presumably
    # scipy.stats.mode, whose first field is the modal value -- confirm
    # against the file's imports).
    print 'imputing with feature summarization (mode)'
    summ_func = lambda x: mode(x)[0]
    data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

    # replace missing data with predictions using random forest
    # No classifier is passed here, so imp.predict presumably falls back on
    # its own default -- verify against the Imputer API.
    print 'imputing with Random Forest'
    data_dict['RandomForest'] = imp.predict(pert_data, cat_cols,
                                            miss_data_cond)

    # replace missing data with values obtained after factor analysis
    # NOTE(review): the message says "PCA" while the call is
    # imp.factor_analysis -- confirm which technique is actually applied.
    print 'imputing with PCA'
    data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)