Beispiel #1
0
# Predicate handed to the imputers below: an entry counts as missing when it
# equals the '?' placeholder.
missing_data_cond = lambda x: x == '?'
# Columns passed to binarize_data/predict as the categorical features
# (presumably zero-based column indices -- confirm against the data loader).
cat_cols = (1, 3, 4, 5, 6, 7, 8, 12)
# Not referenced in the visible part of this script; presumably consumed by a
# KNN imputation step elsewhere -- TODO confirm.
n_neighbors = 5

# NOTE: print is called with one parenthesised argument throughout so the
# script parses on Python 3 and still prints the same text on Python 2.

# Disabled: drop observations with missing variables
# print('imputing with drop')
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values
print('imputing with random replacement')
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with feature summary
print('imputing with feature summarization (mode)')
# The lambda's `x` is its own parameter, not the module-level data `x`.
# Element 0 of mode()'s result is used as the summary value (presumably the
# modal value itself, as with scipy.stats.mode -- confirm which mode() this is).
summ_func = lambda x: mode(x)[0]
data_mode = imp.summarize(x, summ_func, missing_data_cond)

# replace categorical features with one hot row
print('imputing with one-hot')
data_onehot = imp.binarize_data(x, cat_cols)

# replace missing data with predictions using random forest
print('imputing with predicted values from random forest')
# 100 trees, Gini impurity as the split criterion.
clf = RandomForestClassifier(n_estimators=100, criterion='gini')
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with predictions using SVM
print('imputing with predicted values usng SVM')
clf = SVM(penalty='l2',
          loss='squared_hinge',
          dual=True,
Beispiel #2
0
    # One entry per imputation strategy, keyed by strategy name; 'RawData'
    # holds the input exactly as it is handed to the imputers.
    data_dict = {'RawData': pert_data}

    # NOTE: print is called with one parenthesised argument throughout so the
    # code parses on Python 3 and still prints the same text on Python 2.

    # drop observations with missing variables
    print('imputing with drop')
    data_dict['Drop'] = imp.drop(pert_data, miss_data_cond)

    # replace missing values with random existing values
    print('imputing with random replacement')
    data_dict['RandomReplace'] = imp.replace(pert_data, miss_data_cond)

    # replace missing values with feature summary
    print('imputing with feature summarization (mode)')
    # Element 0 of mode()'s result is used as the summary value (presumably
    # the modal value itself, as with scipy.stats.mode -- confirm).
    summ_func = lambda x: mode(x)[0]
    data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

    # replace missing data with predictions using random forest
    print('imputing with Random Forest')
    # No classifier is passed here, so imp.predict presumably falls back to
    # its own default model -- TODO confirm it is a random forest.
    data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond)

    # replace missing data with values obtained after factor analysis
    # NOTE(review): the message and key say "PCA" while the call is
    # imp.factor_analysis -- confirm which technique is actually applied.
    print('imputing with PCA')
    data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

    # replace missing data with knn
    print('imputing with K-Nearest Neighbors')
    # np.mean is handed to imp.knn as the third argument (presumably the
    # function that aggregates neighbour values -- confirm).
    data_dict['KNN'] = imp.knn(pert_data, n_neighbors, np.mean, miss_data_cond,
                               cat_cols)

    # Every strategy key above except 'RawData' and 'Drop'; how this list is
    # consumed lies outside the visible code.
    conf_methods = ['RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN']
Beispiel #3
0
# Predicate handed to the imputers below: an entry counts as missing when it
# equals the "?" placeholder.
missing_data_cond = lambda x: x == "?"
# Columns passed to binarize_data/predict as the categorical features
# (presumably zero-based column indices -- confirm against the data loader).
cat_cols = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
# Not referenced in the visible part of this script; presumably consumed by a
# KNN imputation step elsewhere -- TODO confirm.
n_neighbors = 3  # lower for votes

# NOTE: print is called with one parenthesised argument throughout so the
# script parses on Python 3 and still prints the same text on Python 2.

# Disabled: drop observations with missing variables
# print("imputing with drop")
# data_drop = imp.drop(x, missing_data_cond)

# replace missing values with random existing values
print("imputing with random replacement")
data_replace = imp.replace(x, missing_data_cond)

# replace missing values with feature summary
print("imputing with feature summarization (mode)")
# The lambda's `x` is its own parameter, not the module-level data `x`.
# Element 0 of mode()'s result is used as the summary value (presumably the
# modal value itself, as with scipy.stats.mode -- confirm which mode() this is).
summ_func = lambda x: mode(x)[0]
data_mode = imp.summarize(x, summ_func, missing_data_cond)

# replace categorical features with one hot row
print("imputing with one-hot")
data_onehot = imp.binarize_data(x, cat_cols)

# replace missing data with predictions using random forest
print("imputing with predicted values from random forest")
# 100 trees, Gini impurity as the split criterion.
clf = RandomForestClassifier(n_estimators=100, criterion="gini")
data_rf = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with predictions using SVM
print("imputing with predicted values usng SVM")
clf = clf = SVM(
    penalty="l2",
    loss="squared_hinge",
Beispiel #4
0
    # One entry per imputation strategy, keyed by strategy name; 'RawData'
    # holds the input exactly as it is handed to the imputers.
    data_dict = {'RawData': pert_data}

    # NOTE: print is called with one parenthesised argument throughout so the
    # code parses on Python 3 and still prints the same text on Python 2.

    # drop observations with missing variables
    print('imputing with drop')
    data_dict['Drop'] = imp.drop(pert_data, miss_data_cond)

    # replace missing values with random existing values
    print('imputing with random replacement')
    data_dict['RandomReplace'] = imp.replace(pert_data, miss_data_cond)

    # replace missing values with feature summary
    print('imputing with feature summarization (mode)')
    # Element 0 of mode()'s result is used as the summary value (presumably
    # the modal value itself, as with scipy.stats.mode -- confirm).
    summ_func = lambda x: mode(x)[0]
    data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

    # replace missing data with predictions using random forest
    print('imputing with Random Forest')
    # No classifier is passed here, so imp.predict presumably falls back to
    # its own default model -- TODO confirm it is a random forest.
    data_dict['RandomForest'] = imp.predict(pert_data, cat_cols,
                                            miss_data_cond)

    # replace missing data with values obtained after factor analysis
    # NOTE(review): the message and key say "PCA" while the call is
    # imp.factor_analysis -- confirm which technique is actually applied.
    print('imputing with PCA')
    data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

    # replace missing data with knn
    print('imputing with K-Nearest Neighbors')
    # np.mean is handed to imp.knn as the third argument (presumably the
    # function that aggregates neighbour values -- confirm).
    data_dict['KNN'] = imp.knn(pert_data, n_neighbors, np.mean, miss_data_cond,
                               cat_cols)