Code Example #1
def impute_missing_vals(df, num_cols, cat_cols):
    df_full = pd.DataFrame(columns=df.columns)
    for j in np.unique(df.ID):
        df_n = df[df.ID == j].reset_index(drop=True)
        # KNN imputation needs neighbors, so only fill groups with more than two rows.
        if len(df_n) > 2:
            if df_n[num_cols].isnull().sum().sum() > 0:
                df_n[num_cols] = pd.DataFrame(
                    fancyimpute.KNN(k=5).complete(df_n[num_cols]),
                    columns=num_cols)


        # Alternative: fill numeric NaNs with the per-column mean.
        for i in cat_cols:
            if df_n[i].isnull().any():
                modes = df_n[i].mode()
                if len(modes) > 1:
                    # No unique mode: carry the last observation forward.
                    df_n[i] = df_n[i].fillna(method='ffill').astype(object)
                else:
                    df_n.loc[df_n[i].isnull(), i] = modes[0]
        df_full = pd.concat([df_full, df_n], ignore_index=True)
    return df_full
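A note on the API used throughout these examples: KNN(k).complete(X) is the fancyimpute interface prior to version 0.4; from 0.4 onward the library follows the scikit-learn convention and the same call is spelled fit_transform. A minimal sketch of the numeric fill above against the newer API (the helper name knn_fill_numeric is ours):

import fancyimpute
import pandas as pd

def knn_fill_numeric(df_n, num_cols, k=5):
    # fancyimpute >= 0.4 renamed complete() to fit_transform().
    filled = fancyimpute.KNN(k=k).fit_transform(df_n[num_cols])
    return pd.DataFrame(filled, columns=num_cols, index=df_n.index)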
Code Example #2
def outlier_imputer(df_o, num_cols):
    # Outlier analysis: flag values outside the Tukey fences (1.5 * IQR) as
    # missing, then KNN-impute them, repeating until nothing is flagged.
    while True:
        for i in num_cols:
            iqr = df_o[i].quantile(0.75) - df_o[i].quantile(0.25)
            lower = df_o[i].quantile(0.25) - 1.5 * iqr
            upper = df_o[i].quantile(0.75) + 1.5 * iqr
            df_o.loc[df_o[i] < lower, i] = np.nan
            df_o.loc[df_o[i] > upper, i] = np.nan
        if df_o.isnull().sum().sum() > 0:
            df_o[num_cols] = pd.DataFrame(
                fancyimpute.KNN(k=3).complete(df_o[num_cols]),
                columns=num_cols)


            # Alternative: fill the remaining NaNs with the column mean.
        else:
            break
    # Dataset-specific recodes for invalid zero entries.
    df_o.loc[df_o['Reason for absence'] == 0, 'Reason for absence'] = 20
    df_o.loc[df_o['Month of absence'] == 0,
             'Month of absence'] = stat.mode(df_o['Month of absence'])
    return df_o
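The same Tukey-fence outlier rule recurs in Code Examples #9 and #17 below; factored into a small helper (ours, not from any of the listed projects) it reads:

def tukey_fences(s, k=1.5):
    # Lower and upper outlier fences for a pandas Series: Q1 - k*IQR, Q3 + k*IQR.
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr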
Code Example #3
def perform_knn_imputation(dfs):
    knn_imputed_datasets = [
        fancyimpute.KNN(k=100, verbose=True).fit_transform(df) for df in dfs
    ]
    return [pd.DataFrame(data=imputed) for imputed in knn_imputed_datasets]
Code Example #4
def knn_fill_conf(perc, c):
    """KNN fill followed by a Bayesian ridge fit; counts test points whose
    predictive density at the true value falls below 0.05."""
    clf = BayesianRidge()
    df, y = glm_testing.create_missing(perc=perc, c=c)
    drug_vals, drug_true = glm_testing.test_drug(c=c)
    design = fancyimpute.KNN(k=3).fit_transform(df)
    clf.fit(design, y)
    drug_preds, std = clf.predict(drug_vals, return_std=True)
    return sum(scipy.stats.norm(drug_preds, std).pdf(drug_true) < 0.05)
Code Example #5
File: missvalueinjector.py Project: tadeze/ADMV
    def impute_value(self, df, method="MICE"):
        """
        Impute missing values with MICE, KNN, or a simple fill.
        """
        if method == "MICE":
            return fi.MICE(verbose=False).complete(df)
        elif method == "KNN":
            return fi.KNN(k=4, verbose=False).complete(df)
        else:
            return fi.SimpleFill().complete(df)
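fi.MICE only exists in fancyimpute up to the 0.3.x releases; version 0.4 replaced it with IterativeImputer and renamed complete() to fit_transform(). A sketch of the same dispatcher under that assumption (the name impute_value_modern is ours):

import fancyimpute as fi

def impute_value_modern(df, method="MICE"):
    # fancyimpute >= 0.4: MICE became IterativeImputer, complete() became fit_transform().
    if method == "MICE":
        return fi.IterativeImputer().fit_transform(df)
    elif method == "KNN":
        return fi.KNN(k=4, verbose=False).fit_transform(df)
    return fi.SimpleFill().fit_transform(df)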
Code Example #6
    def score_rent(self):
        X_train, X_test, y_train, y_test, contin_cols = self.preprocessing()
        # separate continuous and categorical columns

        con_train = X_train[contin_cols]
        cat_train = X_train[X_train.columns.difference(contin_cols)]

        # fancyimpute MICE on the continuous training data
        mice = fancyimpute.MICE(verbose=0)
        con_train = np.asarray(con_train)
        con_train_mice = mice.complete(con_train)

        # fancyimpute KNN on the categorical training data
        cat_train = np.asarray(cat_train)
        cat_train_fancyknn = fancyimpute.KNN().complete(cat_train)
        cat_train_fancyknn = np.round(cat_train_fancyknn).astype(int)

        # apply a Box-Cox transformation to columns of the continuous train
        # data that contain no negative values
        con_train_mice_bc = np.empty(con_train_mice.shape)
        from scipy import stats
        for i in range(len(contin_cols)):
            if np.argwhere(con_train_mice[:, i] < 0).size == 0:
                # Small offset keeps Box-Cox's strict-positivity requirement.
                con_train_mice_bc[:, i] = stats.boxcox(
                    con_train_mice[:, i] + 1e-5)[0]
            else:
                con_train_mice_bc[:, i] = con_train_mice[:, i]

        # apply onehot to categorical train data
        enc = OneHotEncoder()
        enc = enc.fit(cat_train_fancyknn)
        oh = enc.transform(cat_train_fancyknn).toarray()
        cat_train_fancyknn_onehot = np.round(oh).astype(int)

        #concatenate imputed train data
        X_train_imp = np.concatenate(
            (cat_train_fancyknn_onehot, con_train_mice_bc), axis=1)

        # Feature selection using Lasso
        select_lassocv = SelectFromModel(LassoCV())
        select_lassocv = select_lassocv.fit(X_train_imp, y_train)

        # grid search over the Lasso regularization strength
        param_grid = {'alpha': np.logspace(-3, 0, 14)}
        print(param_grid)
        grid = GridSearchCV(Lasso(normalize=True, max_iter=1000000),
                            param_grid,
                            cv=10)

        # build a pipeline so scaling and feature selection are fit inside
        # each CV fold, preventing information leakage
        pipe_lassocv = make_pipeline(MinMaxScaler(), select_lassocv, grid)
        pipe_lassocv = pipe_lassocv.fit(X_train_imp, y_train)
        train_r2 = np.mean(
            cross_val_score(pipe_lassocv, X_train_imp, y_train, cv=5))
        return contin_cols, enc, pipe_lassocv, train_r2, X_test, y_test
Code Example #7
def impute_parameter_adjustment(method, param_grid, impute_radio, x_init,
                                y_init, reference_x, reference_y):
    model = joblib.load('..\\models\\vote_model_hard.joblib')
    markers = ['o', '*', '1', 's', '2']
    I = 20  # Monte Carlo corruption rounds per missing ratio
    for radio, marker in zip(impute_radio, markers):
        # accumulated classification error rates, keyed by parameter value
        acc_1 = {i: 0 for i in param_grid}
        acc_2 = {i: 0 for i in param_grid}
        for m in range(I):
            corruptor = Corruptor(x_init, radio)
            x_miss = getattr(corruptor, "mcar")()
            for n in param_grid:
                if method == 'knn':
                    x_impute = fancyimpute.KNN(k=n).fit_transform(
                        np.vstack(
                            (x_miss, reference_x)))[range(x_init.shape[0])]
                if method == 'mice':
                    data_impute_list = []
                    for i in range(n):
                        imputer = fancyimpute.IterativeImputer(
                            n_iter=13, sample_posterior=True, random_state=i)
                        data_impute_list.append(
                            imputer.fit_transform(
                                np.vstack(
                                    (x_miss,
                                     reference_x)))[range(x_init.shape[0])])
                    x_impute = np.mean(data_impute_list, 0)
                    print(radio, m, n)
                if method == 'em':
                    x_impute = em(np.vstack((x_miss, reference_x)),
                                  loops=n)[range(x_init.shape[0])]
                if method == 'som':
                    x_impute = impute_SOM(x_miss, n)[range(x_init.shape[0])]
                y_pred1 = model.predict(x_impute)
                y_pred2 = model.predict(x_init)
                # error rates: imputed vs. clean predictions, and imputed
                # predictions vs. the true labels
                acc_1[n] += 1 - accuracy_score(y_pred1, y_pred2)
                acc_2[n] += 1 - accuracy_score(y_pred1, y_init)
        acc_1 = {i: (j / I) for i, j in acc_1.items()}
        acc_2 = {i: (j / I) for i, j in acc_2.items()}
        plt.subplot(121)
        plt.plot(list(acc_1.keys()),
                 list(acc_1.values()),
                 marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and prediction')
        plt.subplot(122)
        plt.plot(list(acc_2.keys()),
                 list(acc_2.values()),
                 marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and real label')
        plt.legend(loc=0, bbox_to_anchor=(0.3, -0.05), ncol=5)
    plt.show()
Code Example #8
def impute(data, method='knn', n=5):
    if method == 'knn':
        data_impute = fancyimpute.KNN(k=n).fit_transform(data)
    elif method == 'mice':
        # Average several posterior draws from IterativeImputer to get a
        # MICE-style multiple-imputation point estimate.
        data_impute_list = []
        for i in range(11):
            imputer = fancyimpute.IterativeImputer(n_iter=13,
                                                   sample_posterior=True,
                                                   random_state=i)
            data_impute_list.append(imputer.fit_transform(data))
        data_impute = np.mean(data_impute_list, 0)
    elif method == 'em':
        data_impute = em(data)
    elif method == 'mean':
        data_impute = fancyimpute.SimpleFill(fill_method='mean').fit_transform(data)
    else:
        raise ValueError("method must be one of 'knn', 'mice', 'em', 'mean'")
    return data_impute
Code Example #9
def outlier_imputer(df_o, num_cols):
    # Outlier analysis: flag values outside the Tukey fences (1.5 * IQR) as
    # missing, impute them with KNN, and repeat until nothing is flagged.
    while True:
        for i in num_cols:
            iqr = df_o[i].quantile(0.75) - df_o[i].quantile(0.25)
            lower = df_o[i].quantile(0.25) - 1.5 * iqr
            upper = df_o[i].quantile(0.75) + 1.5 * iqr
            df_o.loc[df_o[i] < lower, i] = np.nan
            df_o.loc[df_o[i] > upper, i] = np.nan
        missing_val = df_o.isnull().sum()
        print(missing_val)
        if missing_val.sum() > 0:
            df_o_knn = pd.DataFrame(fancyimpute.KNN(k=3).complete(
                df_o[num_cols]),
                                    columns=num_cols)
            # Write the imputed numeric block back into its column positions.
            df_o.iloc[:, 9:15] = df_o_knn.iloc[:, :]
        else:
            break
    return df_o
Code Example #10
    def predict_rent(self):
        contin_cols, enc, pipe_lassocv, train_r2, X_test, y_test = self.score_rent(
        )
        con_test = X_test[contin_cols]
        cat_test = X_test[X_test.columns.difference(contin_cols)]

        # impute the test data the same way (continuous columns)
        mice = fancyimpute.MICE(verbose=0)
        con_test = np.asarray(con_test)
        con_test_mice = mice.complete(con_test)

        #categorical data
        cat_test = np.asarray(cat_test)
        cat_test_fancyknn = fancyimpute.KNN().complete(cat_test)
        cat_test_fancyknn = np.round(cat_test_fancyknn).astype(int)

        # apply a Box-Cox transformation to columns of the continuous test
        # data that contain no negative values
        con_test_mice_bc = np.empty(con_test_mice.shape)
        from scipy import stats
        for i in range(len(contin_cols)):
            if np.argwhere(con_test_mice[:, i] < 0).size == 0:
                # Small offset keeps Box-Cox's strict-positivity requirement.
                con_test_mice_bc[:, i] = stats.boxcox(
                    con_test_mice[:, i] + 1e-5)[0]
            else:
                con_test_mice_bc[:, i] = con_test_mice[:, i]

        # one-hot encode the categorical test data with the encoder fit on train
        oh = enc.transform(cat_test_fancyknn).toarray()
        cat_test_fancyknn_onehot = np.round(oh).astype(int)
        print("Finished onehot")
        #concatenate imputed test data
        X_test_imp = np.concatenate(
            (cat_test_fancyknn_onehot, con_test_mice_bc), axis=1)

        # make prediction based on training model
        y_pred = pipe_lassocv.predict(X_test_imp)
        test_r2 = r2_score(y_test, y_pred)
        print(test_r2)
        return X_test, y_test, y_pred
Code Example #11
            missing_matrix[i] = np.nan
        complete_matrix = method(verbose=False).fit_transform(missing_matrix)
        imputed = [complete_matrix[i] for i in indices]
        correlations[method].append(
            pd.DataFrame([imputed, originals]).T.corr().iloc[0, 1])
        # mean relative deviation; `o == o` filters out NaN originals
        deviation = np.mean([abs((o - i) / o)
                             for o, i in zip(originals, imputed)
                             if o == o and o > .01])
        percent_off[method].append(deviation)

# try different K values for KNN
for k in range(4, 15):
    print('using k=%s' % k)
    key = 'KNN:K=%s' % k
    correlations[key] = []
    percent_off[key] = []
    for simulation in range(100):
        indices = get_rand_index(base_matrix, 5000)
        originals = [base_matrix[i] for i in indices]
        missing_matrix = base_matrix.copy()
        for i in indices:
            missing_matrix[i] = np.nan
        complete_matrix = fancyimpute.KNN(
            k=k, verbose=False).fit_transform(missing_matrix)
        imputed = [complete_matrix[i] for i in indices]
        correlations[key].append(
            pd.DataFrame([imputed, originals]).T.corr().iloc[0, 1])
        # mean relative deviation; `o == o` filters out NaN originals
        deviation = np.mean([abs((o - i) / o)
                             for o, i in zip(originals, imputed)
                             if o == o and o > .01])
        percent_off[key].append(deviation)
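Code Example #11 assumes a project-local helper get_rand_index that samples positions to blank out. A minimal stand-in (ours), assuming base_matrix is a 2-D NumPy array:

import numpy as np

def get_rand_index(matrix, n):
    # Sample n random (row, col) positions from a 2-D array.
    rng = np.random.default_rng()
    rows = rng.integers(0, matrix.shape[0], size=n)
    cols = rng.integers(0, matrix.shape[1], size=n)
    return list(zip(rows, cols))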
Code Example #12
    colnamesX = ['PTS_x', 'DRPG_x', 'TOPG_x', 'PF_x', 'RPG_x']
    #colnamesX = ['APG_y', 'RPG_y','PPG_y', 'FG%_y',  'STPG_y', 'BLKPG_y', 'GP_y', 'MPG_y', '2P%_y', '3P%_y']
    #colnamesX = ['PTS_x','PF_x','TOPG_x']
    colnamesY = ['Class']  # class label
    allCols = colnamesX + colnamesY
    # read in data
    bigDF = shuffle(pd.read_csv(
        args.path_to_dataframe))  # shuffle data as we read it in

    # dataX.to_csv("./csv/data_x.csv")
    # dataY.to_csv("./csv/data_y.csv")

    focusDF = bigDF[allCols].dropna(thresh=2)
    dataY = focusDF[colnamesY]  # labels
    dataX = focusDF[colnamesX]
    dataX = fi.KNN(k=3).fit_transform(dataX)
    print("Feature vector table shape:", dataX.shape)
    print("Label table shape:", dataY.shape)

    # dividing X, y into train and test data
    X_train, X_test, y_train, y_test = train_test_split(dataX,
                                                        dataY.values.ravel(),
                                                        test_size=0.25)
    scalerX = MinMaxScaler()
    # fit() returns the fitted scaler itself, not transformed data
    scalerX.fit(X_train)
    # newDataX = dataX

    # Apply the scaler to the X training data
    X_train_std = scalerX.transform(X_train)

    # # Apply the SAME scaler to the X test data
Code Example #13
    def impute(self):
        return fi.KNN(verbose=False).complete(self.missing_data)
Code Example #14
    csv_reader = csv.reader(file_csv_input)
    list_movie_id = list(csv_reader)
    list_movie_id = [int(each[0]) for each in list_movie_id]

np_data = np.zeros((number_of_user, number_of_movie))
np_data.fill(np.nan)
for each in list_data:
    user_id = int(each[0]) - 1
    movie_id = list_movie_id.index(int(each[1]))
    rating = float(each[2])
    np_data[user_id, movie_id] = rating

print(np_data)

time_start = time.time()
model = fancyimpute.KNN(K_of_knn + 1)
np_prediction = model.fit_transform(np_data)

error = []
for each in list_test:
    predict_user_id = int(each[0]) - 1
    predict_movie_id = list_movie_id.index(int(each[1]))
    real_rating = float(each[2])

    predict_rating = np_prediction[predict_user_id][predict_movie_id]

    error.append(abs(real_rating - predict_rating))

time_end = time.time()
print('Finish, used %.3fs' % (time_end - time_start))
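The loop above collects per-rating absolute errors but never reports them; a one-line summary (our addition, not in the original script) would be:

print('MAE: %.3f' % np.mean(error))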
Code Example #15
File: baseline_methods.py Project: bailingnan/BRITS
print('Mean imputation:')
print(get_loss(X_c, X_mean, Y_c))

# save mean imputation results
print(X_c.shape, Y_c.shape, Z_c.shape)
# raw_input()
np.save('./result/mean_data.npy', X_mean)
np.save('./result/mean_label.npy', Z_c)

# Algo2: KNN imputation

X_knn = []

for x in X:
    X_knn.append(fancyimpute.KNN(k=10, verbose=False).fit_transform(x))

X_c = np.concatenate(X, axis=0)
Y_c = np.concatenate(Y, axis=0)
X_knn = np.concatenate(X_knn, axis=0)

print('KNN imputation')
print(get_loss(X_c, X_knn, Y_c))

# raw_input()

# ### Matrix Factorization
# since MF is extremely slow, we evaluate the imputation result every 100 iterations

X_mf = []
Code Example #16
def preprocess(trainfile,
               testfile,
               outputdir,
               useless_attr,
               miss_threshold,
               xstrategy,
               ymin,
               ymax,
               ystrategy,
               fill_method="MICE",
               normal01=True):
    """Preprocess X and Y: matrix completion, normalization, standardization, etc.

    :param trainfile: string, path to the training set (d_train_20180102.csv)
    :param testfile: string, path to the test set (d_test_A_20180102.csv)
    :param outputdir: string, directory in which to save the preprocessed files
    :param useless_attr: list, useless attribute columns to drop, e.g. [0, 1, 2, 3]
    :param miss_threshold: float, fraction of missing values above which a column is dropped, e.g. 0.7
    :param xstrategy: string, how to handle outliers in X, one of {"replace", "nothing"}
    :param ymin: float, minimum allowed Y value; anything smaller is an outlier
    :param ymax: float, maximum allowed Y value; anything larger is an outlier
    :param ystrategy: string, how to handle outliers in Y, one of {"delete", "replace", "nothing"}
    :param fill_method: string, matrix-completion strategy, one of {"KNN", "SoftI", "MF", "MICE"}
    :param normal01: bool, if True, rescale the results to [0, 1]; otherwise leave them unscaled
    :return: list, the normalized trainX, trainY, testX
    """
    # 0. Read in the training and test sets.
    train_XY = convert(trainfile)
    test_X = convert(testfile)
    print("Datasets loaded; starting preprocessing")

    # 1. Drop the useless attribute columns.
    train_id = train_XY[:, 0:1]
    test_id = test_X[:, 0:1]
    train_XY = np.delete(train_XY, useless_attr, axis=1)
    test_X = np.delete(test_X, useless_attr, axis=1)
    n_test = test_X.shape[0]
    info1 = "1. Dropped useless attributes %s from train_XY and test_X; train_X.shape=%s, test_X.shape=%s"\
          % (str(useless_attr), str(train_XY.shape), str(test_X.shape))
    print(info1)

    # 2. Drop columns with severe missingness.
    miss_mask = np.isnan(train_XY)
    n = miss_mask.shape[0]
    column_del = []  # list of columns to delete
    for i in range(miss_mask.shape[1]):
        miss_n = miss_mask[:, i].sum()
        if miss_n / n >= miss_threshold:
            column_del.append(i)
    train_XY = np.delete(train_XY, column_del, axis=1)
    test_X = np.delete(test_X, column_del, axis=1)
    info2 = "2. Dropped attributes with more than %f%% missing from train_XY and test_X: %s" % (
        miss_threshold * 100, str(column_del))
    print(info2)

    # 3. Denoise Y using the manually chosen thresholds.
    train_Y = train_XY[:, -1:]
    upper_mask = train_Y > ymax
    lower_mask = train_Y < ymin
    if ystrategy == "replace":
        train_Y[upper_mask] = ymax
        train_Y[lower_mask] = ymin
    elif ystrategy == "delete":
        index = np.array(np.arange(0, train_Y.shape[0], 1), ndmin=2).T
        chsn_mask = upper_mask | lower_mask
        train_XY = np.delete(train_XY, index[chsn_mask], axis=0)
        train_id = np.delete(train_id, index[chsn_mask], axis=0)
    elif ystrategy == "nothing":
        pass
    else:
        raise ValueError("'ystrategy' must be one of {nothing, replace, delete}")
    train_Y = train_XY[:, -1:]
    print("3. Denoised trainY (%s); trainXY.shape=%s" % (ystrategy, train_XY.shape))

    # 4. Treat X, with thresholds computed from a boxplot.
    train_X = train_XY[:, :-1]
    all_X = np.concatenate([train_X, test_X], axis=0)
    attr_n = train_XY.shape[1] - 1
    attr_min_max = np.zeros(
        (attr_n, 2), dtype=np.float64)  # per-attribute (min, max) thresholds from the boxplot
    if xstrategy == "nothing":
        pass
    elif xstrategy == "replace":
        # Replace outliers in X with the boxplot whisker extremes.
        for i in range(attr_n):
            # Slice each column by reference, so editing crt_attr edits all_X.
            crt_attr = all_X[:, i:i + 1]
            miss = np.isnan(crt_attr)
            box_dic = plt.boxplot(crt_attr[~miss])
            crt_max = box_dic["caps"][0].get_ydata()[0]
            crt_min = box_dic["caps"][1].get_ydata()[0]
            if crt_max < crt_min:
                crt_min, crt_max = crt_max, crt_min
            attr_min_max[i, 0] = crt_min
            attr_min_max[i, 1] = crt_max
            crt_attr[miss] = 0
            upper_mask = crt_attr > crt_max
            lower_mask = crt_attr < crt_min
            upper_mask &= ~miss
            lower_mask &= ~miss

            crt_attr[upper_mask] = crt_max
            crt_attr[lower_mask] = crt_min
            crt_attr[miss] = np.nan
    else:
        raise ValueError("'xstrategy' must be one of {nothing, replace}")
    print("4. Denoised all of X (%s)." % xstrategy)

    # 5. Matrix completion.
    if fill_method == "KNN":
        completer = fi.KNN(verbose=False)
    elif fill_method == "SoftI":
        completer = fi.SoftImpute(verbose=False)
    elif fill_method == "MF":
        completer = fi.MatrixFactorization(verbose=False)
    elif fill_method == "MICE":
        completer = fi.MICE(verbose=False)
    else:
        raise ValueError("'fill_method' must be one of {'KNN','SoftI','MF','MICE'}.")
    all_X_complete = completer.complete(all_X)
    print("5. Completed the matrix all_X with %s." % fill_method)

    # train_X = all_X_complete[:-1000, :]
    # test_X = all_X_complete[-1000:, :]
    # 6. Standardize, then rescale to [0, 1].
    if normal01:
        X_nmler = StandardScaler()
        X_01 = MinMaxScaler()
        Y_nmler = StandardScaler()
        Y_01 = MinMaxScaler()

        X_nmler.fit(all_X_complete)
        Y_nmler.fit(train_Y)
        all_X_nml = X_nmler.transform(all_X_complete)
        train_Y_nml = Y_nmler.transform(train_Y)
        X_01.fit(all_X_nml)
        Y_01.fit(train_Y_nml)
        all_X_nml01 = X_01.transform(all_X_nml)
        train_Y_nml01 = Y_01.transform(train_Y_nml)
        final_train_X = all_X_nml01[:-n_test, :]
        final_test_X = all_X_nml01[-n_test:, :]
        final_train_Y = np.concatenate([train_Y_nml01, train_Y], axis=1)
    else:
        final_train_X = all_X_complete[:-n_test, :]
        final_test_X = all_X_complete[-n_test:, :]
        final_train_Y = train_Y
    print("6. Normalized all_X and train_Y to [0, 1] (%s)." % normal01)

    # 7. Save the data.
    print("7. Saving data as set_kind_date.csv files under %s." % outputdir)
    # timestamp = datetime.now().strftime("%Y%m%d%H%M")
    timestamp = "0000"
    np.savetxt(outputdir + r"\train_X_" + timestamp + ".csv",
               final_train_X,
               delimiter=",")
    np.savetxt(outputdir + r"\test_X_" + timestamp + ".csv",
               final_test_X,
               delimiter=",")
    np.savetxt(outputdir + r"\train_Y_" + timestamp + ".csv",
               final_train_Y,
               delimiter=",")
    np.savetxt(outputdir + r"\train_id_" + timestamp + ".csv",
               train_id.astype(np.int64),
               delimiter=",")
    np.savetxt(outputdir + r"\test_id_" + timestamp + ".csv",
               test_id.astype(np.int64),
               delimiter=",")
    # Return the completed, normalized splits described in the docstring.
    return final_train_X, final_train_Y, final_test_X, train_id
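A hypothetical invocation; the CSV names come from the docstring, while the directory, thresholds, and dropped columns here are purely illustrative:

train_X, train_Y, test_X, train_id = preprocess(
    trainfile="./data/d_train_20180102.csv",
    testfile="./data/d_test_A_20180102.csv",
    outputdir="./output",
    useless_attr=[0, 1, 2, 3],
    miss_threshold=0.7,
    xstrategy="replace",
    ymin=3.0,
    ymax=18.0,
    ystrategy="replace",
    fill_method="MICE",
    normal01=True)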
Code Example #17
    def load_and_process_data(self, data_path):
        df = pd.read_csv(data_path, sep=';')
        df = df[df['Month of absence'] != 0]
        self.preprocessed_data = df.copy()
        cat_cols = [
            'ID', 'Reason for absence', 'Month of absence', 'Day of the week',
            'Seasons', 'Education', 'Son', 'Pet'
        ]
        num_cols = [
            'Transportation expense', 'Distance from Residence to Work',
            'Service time', 'Age', 'Work load Average/day ', 'Hit target',
            'Disciplinary failure', 'Social drinker', 'Social smoker',
            'Weight', 'Height', 'Body mass index', 'Absenteeism time in hours'
        ]
        df.loc[df['Reason for absence'].isin(range(1, 15)),
               'Reason for absence'] = 1
        df.loc[df['Reason for absence'].isin(range(15, 19)),
               'Reason for absence'] = 2
        df.loc[df['Reason for absence'].isin(range(19, 22)),
               'Reason for absence'] = 3
        df.loc[df['Reason for absence'].isin(range(22, 29)),
               'Reason for absence'] = 4
        df.Education = df.Education.map({1: 0, 2: 1, 3: 1, 4: 1})
        df.Pet = df.Pet.map({0: 0, 1: 1, 2: 1, 4: 1, 5: 1, 8: 1})
        df.Son = df.Son.map({0: 0, 1: 0, 2: 1, 3: 1, 4: 1})
        while True:
            for i in num_cols:
                iqr = df[i].quantile(0.75) - df[i].quantile(0.25)
                lower = df[i].quantile(0.25) - 1.5 * iqr
                upper = df[i].quantile(0.75) + 1.5 * iqr
                df.loc[df[i] < lower, i] = np.nan
                df.loc[df[i] > upper, i] = np.nan
            missing_val = df.isnull().sum()
            if missing_val.sum() > 0:
                df[num_cols] = pd.DataFrame(fancyimpute.KNN(k=3).complete(
                    df[num_cols]),
                                            columns=num_cols)

            # Alternative: fill the remaining NaNs with the column mean.
            else:
                break
        df.drop([
            'ID', 'Weight', 'Age', 'Social smoker', 'Disciplinary failure',
            'Education', 'Pet', 'Absenteeism time in hours'
        ],
                axis=1,
                inplace=True)
        df = pd.get_dummies(df,
                            columns=['Reason for absence'],
                            drop_first=True)
        df = df[[
            'Reason for absence_1', 'Reason for absence_2',
            'Reason for absence_3', 'Reason for absence_4', 'Month of absence',
            'Day of the week', 'Seasons', 'Transportation expense',
            'Distance from Residence to Work', 'Service time',
            'Work load Average/day ', 'Hit target', 'Son', 'Social drinker',
            'Height', 'Body mass index'
        ]]
        num_cols = [
            'Transportation expense', 'Distance from Residence to Work',
            'Service time', 'Work load Average/day ', 'Hit target', 'Height',
            'Body mass index'
        ]
        self.data = df
        self.data[num_cols] = self.scaler.transform(self.data[num_cols])
        print(self.data[num_cols].head())
Code Example #18
rows = all_matches[pd.isnull(all_matches[betting_odds_combined]).any(axis=1)]
print(len(rows))

for index, row in rows.iterrows():
    if index % 1000 == 0:
        print(index)
    for betting_odds in betting_odds_all:
        # fill this row's missing odds within each bookmaker group with the
        # group's mean odds
        mean = np.mean(row[betting_odds])
        missing_cols = row[betting_odds].index[row[betting_odds].isnull()]
        all_matches.loc[index, missing_cols] = mean

home = all_matches.filter(regex='__home_')
away = all_matches.filter(regex='__away_')

home_filled = fi.KNN().fit_transform(home)
home_filled = pd.DataFrame(data=home_filled,
                           columns=home.columns,
                           index=home.index)

away_filled = fi.KNN().fit_transform(away)
away_filled = pd.DataFrame(data=away_filled,
                           columns=away.columns,
                           index=away.index)

all_matches_filled = all_matches.copy()
all_matches_filled[home.columns] = home_filled
all_matches_filled[away.columns] = away_filled

cols_to_normalize = [
    '__home_buildUpPlaySpeed', '__home_buildUpPlayDribbling',
Code Example #19
mecanismo = "MAR"
robjects.r.assign('meca', mecanismo)
# robjects.r('print(meca)')
proporción = 0.4
robjects.r.assign('propor', proporción)
robjects.r("""
A=matrix(nrow=52,ncol=2, data,byrow=TRUE)
""")
robjects.r("""
require(mice)
result <- ampute(data=A, prop = propor,mech=meca, bycases = FALSE)
""")
y = robjects.r('result$amp')
# print('the value of y:', y)
print(y[0])
archivo = rd.lista_imputada(y)
print(archivo)
wt.write(archivo, 'datos_amput.csv')
# robjects.r('print(propor)')
# robjects.r('print(x)')
vecinos = 3
var = fy.KNN(k=vecinos).fit_transform(archivo)
var2 = my.KNNImputer(n_neighbors=vecinos).fit_transform(archivo)
print(len(var))
archivo_imputado = rd.reconvertir(Archivo[0], var)
archivo_imputado2 = rd.reconvertir(Archivo[0], var2)
print(archivo_imputado)
wt.write(archivo_imputado, 'datos_imputados.csv')
wt.write(archivo_imputado2, 'datos_imputados2.csv')
# print(pi[0])
Code Example #20
accepts_x.head()

# In[5]:

accepts_y = accepts['bad_ind']

# In[ ]:

rejects.head()

# In[6]:

rejects_x = rejects[[
    "tot_derog", "age_oldest_tr", "rev_util", "fico_score", "ltv"
]]

# print rejects_x.head()
rejects_x.info()

# In[ ]:

accepts_x.info()
import fancyimpute as fimp

accepts_x_filled = pd.DataFrame(fimp.KNN(3).complete(accepts_x.as_matrix()))

accepts_x_filled.columns = accepts_x.columns

rejects_x_filled = pd.DataFrame(fimp.KNN(3).complete(rejects_x.as_matrix()))

rejects_x_filled.columns = rejects_x.columns
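DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0, and fancyimpute dropped complete() around version 0.4; under current versions of both libraries the two fills above would read (a sketch, not the notebook author's code):

accepts_x_filled = pd.DataFrame(fimp.KNN(3).fit_transform(accepts_x.to_numpy()),
                                columns=accepts_x.columns)
rejects_x_filled = pd.DataFrame(fimp.KNN(3).fit_transform(rejects_x.to_numpy()),
                                columns=rejects_x.columns)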
Code Example #21
from selfregulation.utils.utils import get_behav_data, get_recent_dataset, get_demographics
from selfregulation.utils.result_utils import load_results
from selfregulation.utils.get_balanced_folds import BalancedKFold

# load data

dataset = get_recent_dataset()
items = get_behav_data(dataset=dataset, file='items.csv.gz')
subject_items = get_behav_data(dataset=dataset, file='subject_x_items.csv')
survey_ontology = load_results(dataset)['survey']
demographics = survey_ontology.DA.data
demo_factors = survey_ontology.DA.get_scores()

# set up prediction
imputer = fancyimpute.KNN()
predictors = imputer.fit_transform(subject_items)
targets = demo_factors.values

# set up cross-validation
for i, name in enumerate(demo_factors.columns):
    CV = BalancedKFold(nfolds=10)
    CV_iter = list(CV.split(predictors, targets[:, 0]))
    clf = RidgeCV(cv=5)
    score = cross_val_score(clf,
                            survey_ontology.EFA.get_scores(),
                            targets[:, i],
                            cv=CV_iter,
                            scoring=make_scorer(r2_score)).mean()
    print('%s Score: %.2f' % (name, score))
Code Example #22
def fancy_impute(df, method='mice'):
    if method == 'knn':
        df = pd.DataFrame(data=fancyimpute.KNN(3).complete(df),
                          columns=df.columns, index=df.index)
    else:
        df = pd.DataFrame(data=fancyimpute.MICE().complete(df),
                          columns=df.columns, index=df.index)
    return df
Code Example #23
def impute(df, method, verbose=False):
    """
    Impute missing data using specified imputation method.
    
    Parameters
    ----------
    df: pd.DataFrame
        Stat DataFrame with source columns and player/team multi-index.
    method: str/bool
        Imputation method for missing data.
            - False: Do not impute missing data.
            - None: Do not impute missing data.
            - 'BiScaler'
            - 'IterativeImpute'
            - 'IterativeSVD'
            - 'KNN': Impute with nearest neighbors.
            - 'MatrixFactorization'
            - 'Mean': Impute missing with average of other sources.
            - 'Median': Impute missing with median of other sources.
            - 'NuclearNorm'
            - 'SoftImpute'
    verbose: bool, default=False
        If True, print debugging information.
        
    Returns
    -------
    df: pd.DataFrame
        Imputed DataFrame with no NaNs.
    """
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Subset the DataFrame to the projection columns only.
    ignored_cols = ['Player', 'Team', 'Pos', 'Week', 'STATS']
    impute_cols = [col for col in list(df) if col not in ignored_cols]
    X = df[impute_cols].copy().T

    # Impute DataFrame.
    v = verbose
    if method in [None, False]:
        imputed_vals = X.values
    elif np.sum(np.sum(X.isnull())) == 0:
        # No missing values.
        imputed_vals = X.values
    elif method == 'BiScaler':
        imputed_vals = fi.BiScaler(verbose=v).fit_transform(X)
    elif method == 'IterativeImpute':
        imputed_vals = fi.IterativeImputer(verbose=v).fit_transform(X)
    elif method == 'IterativeSVD':
        imputed_vals = fi.IterativeSVD(verbose=v).fit_transform(X)
    elif method == 'KNN':
        imputed_vals = fi.KNN(k=3, verbose=v).fit_transform(X)
    elif method == 'MatrixFactorization':
        imputed_vals = fi.MatrixFactorization(verbose=v).fit_transform(X)
    elif method == 'Mean':
        imputed_vals = fi.SimpleFill('mean').fit_transform(X)
    elif method == 'Median':
        imputed_vals = fi.SimpleFill('median').fit_transform(X)
    elif method == 'NuclearNorm':
        imputed_vals = fi.NuclearNormMinimization(verbose=v).fit_transform(X)
    elif method == 'SoftImpute':
        imputed_vals = fi.SoftImpute(verbose=v).fit_transform(X)
    else:
        raise ValueError(f"Unknown imputation method: {method}.")

    # Recombine ignored columns with imputed data.
    imputed_df = pd.DataFrame(imputed_vals.T, columns=X.index)
    for col in impute_cols:
        if len(imputed_df[col]) != len(df[col]):
            print(f'df: {len(df[col])}\nimp: {len(imputed_df[col])}')
        df[col] = imputed_df[col].values

    return df