Beispiel #1
0
# replace missing data with predictions using logistic regression
# Each print below is written as a call with one parenthesised string: that
# prints exactly the same text on Python 2 and is also valid Python 3 syntax
# (the bare print statement is a SyntaxError there).
# NOTE(review): the message contains the typo "usng"; it is left unchanged so
# the program's output stays byte-identical.
print('imputing with predicted values usng logistic regression')
clf = LogisticRegression(penalty='l2',
                         dual=False,
                         tol=0.0001,
                         C=1.0,
                         fit_intercept=True,
                         intercept_scaling=1)
# The classifier configured above is handed to imp.predict together with the
# data, the categorical columns and the missing-data condition.
data_logistic = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with values obtained after factor analysis
print('imputing with factor analysis')
data_facanal = imp.factor_analysis(x, cat_cols, missing_data_cond)

# replace missing data with knn
# np.mean is passed as a callable; presumably it aggregates the neighbours'
# values - confirm against imp.knn.
print('imputing with K-Nearest Neighbors')
data_knn = imp.knn(x, n_neighbors, np.mean, missing_data_cond, cat_cols)


def compute_histogram(data, labels):
    """Count how often each value occurs in *data*, covering every label.

    Args:
        data: Iterable of values to count.
        labels: Iterable of values that must each appear in the result; a
            label that does not occur in ``data`` gets a count of 0.

    Returns:
        A 2-D ``numpy`` object array with one ``[value, count]`` row per
        distinct value, ordered by value. Counts are integers.
    """
    # np.unique(..., return_counts=True) replaces scipy.stats.itemfreq, which
    # was deprecated in SciPy 1.0 and later removed. np.unique sorts its
    # input itself, so no explicit pre-sort is needed.
    values, counts = np.unique(list(data), return_counts=True)
    values = values.tolist()
    rows = list(zip(values, counts.tolist()))

    # Track known values in a set so each label is added at most once.
    seen = set(values)
    for label in labels:
        if label not in seen:
            rows.append((label, 0))
            seen.add(label)

    rows.sort(key=lambda row: row[0])

    # Fill an object array cell by cell so that counts stay integers even
    # when the values are strings (a mixed array would coerce them to text).
    histogram = np.empty((len(rows), 2), dtype=object)
    for index, (value, count) in enumerate(rows):
        histogram[index, 0] = value
        histogram[index, 1] = count
    return histogram


# compute histograms
# freq_data starts out empty (how it is filled is not visible in this
# excerpt); labels holds the distinct values found in column 1 of x.
freq_data = dict()
labels = np.unique(x[:, 1])
Beispiel #2
0
    # replace missing values with feature summary
    print 'imputing with feature summarization (mode)'
    summ_func = lambda x: mode(x)[0]
    data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

    # replace missing data with predictions using random forest
    print 'imputing with Random Forest'
    data_dict['RandomForest'] = imp.predict(pert_data, cat_cols, miss_data_cond)

    # replace missing data with values obtained after factor analysis
    print 'imputing with PCA'
    data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

    # replace missing data with knn
    print 'imputing with K-Nearest Neighbors'
    data_dict['KNN'] = imp.knn(pert_data, n_neighbors, np.mean, miss_data_cond,
                               cat_cols)

    conf_methods = ['RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN']
    methods = ['RawData', 'Drop', 'RandomReplace', 'Mode', 'RandomForest',
               'PCA', 'KNN']

    color_mapping = {}
    for i in xrange(len(methods)):
        color_mapping[methods[i]] = (i+1) / float(len(methods))


    ###########################
    # plot confusion matrices #
    ###########################
    fig, axes = plt.subplots(len(miss_data_cols), len(conf_methods),
                             figsize=(8, 8))
Beispiel #3
0
    max_iter=1000,
)
# clf here is whatever estimator the statement ending just above built; the
# start of that statement lies outside this excerpt.
data_svm = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with predictions using logistic regression
# Each print below is written as a call with one parenthesised string: that
# prints exactly the same text on Python 2 and is also valid Python 3 syntax
# (the bare print statement is a SyntaxError there).
# NOTE(review): the message contains the typo "usng"; it is left unchanged so
# the program's output stays byte-identical.
print("imputing with predicted values usng logistic regression")
clf = LogisticRegression(
    penalty="l2",
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
)
data_logistic = imp.predict(x, cat_cols, missing_data_cond, clf)

# replace missing data with values obtained after factor analysis
print("imputing with factor analysis")
data_facanal = imp.factor_analysis(x, cat_cols, missing_data_cond)

# replace missing data with knn
# np.mean is passed as a callable; presumably it aggregates the neighbours'
# values - confirm against imp.knn.
print("imputing with K-Nearest Neighbors")
data_knn = imp.knn(x, n_neighbors, np.mean, missing_data_cond, cat_cols)


def compute_histogram(data, labels):
    """Count how often each value occurs in *data*, covering every label.

    Args:
        data: Iterable of values to count.
        labels: Iterable of values that must each appear in the result; a
            label that does not occur in ``data`` gets a count of 0.

    Returns:
        A 2-D ``numpy`` object array with one ``[value, count]`` row per
        distinct value, ordered by value. Counts are integers.
    """
    # np.unique(..., return_counts=True) replaces scipy.stats.itemfreq, which
    # was deprecated in SciPy 1.0 and later removed. np.unique sorts its
    # input itself, so no explicit pre-sort is needed.
    values, counts = np.unique(list(data), return_counts=True)
    values = values.tolist()
    rows = list(zip(values, counts.tolist()))

    # Track known values in a set so each label is added at most once.
    seen = set(values)
    for label in labels:
        if label not in seen:
            rows.append((label, 0))
            seen.add(label)

    rows.sort(key=lambda row: row[0])

    # Fill an object array cell by cell so that counts stay integers even
    # when the values are strings (a mixed array would coerce them to text).
    histogram = np.empty((len(rows), 2), dtype=object)
    for index, (value, count) in enumerate(rows):
        histogram[index, 0] = value
        histogram[index, 1] = count
    return histogram


# compute histograms
# labels holds the distinct values found in column 1 of x; the histogram of
# that same column is stored under the "Raw data" key.
labels = np.unique(x[:, 1])
freq_data = {"Raw data": compute_histogram(x[:, 1], labels)}
Beispiel #4
0
    print 'imputing with feature summarization (mode)'
    summ_func = lambda x: mode(x)[0]
    data_dict['Mode'] = imp.summarize(pert_data, summ_func, miss_data_cond)

    # replace missing data with predictions using random forest
    print 'imputing with Random Forest'
    data_dict['RandomForest'] = imp.predict(pert_data, cat_cols,
                                            miss_data_cond)

    # replace missing data with values obtained after factor analysis
    print 'imputing with PCA'
    data_dict['PCA'] = imp.factor_analysis(pert_data, cat_cols, miss_data_cond)

    # replace missing data with knn
    print 'imputing with K-Nearest Neighbors'
    data_dict['KNN'] = imp.knn(pert_data, n_neighbors, np.mean, miss_data_cond,
                               cat_cols)

    conf_methods = ['RandomReplace', 'Mode', 'RandomForest', 'PCA', 'KNN']
    methods = [
        'RawData', 'Drop', 'RandomReplace', 'Mode', 'RandomForest', 'PCA',
        'KNN'
    ]

    color_mapping = {}
    for i in xrange(len(methods)):
        color_mapping[methods[i]] = (i + 1) / float(len(methods))

    ###########################
    # plot confusion matrices #
    ###########################
    fig, axes = plt.subplots(len(miss_data_cols),