def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
    """ Return a randomly generated dataset of numbers with uniformly
    distributed values between bound[0] and bound[1]

    Parameters
    ----------
    bound: tuple (start, stop)
        Determines the range of values in the matrix. Index 0 for start
        value and index 1 for stop value. Start is inclusive, stop is
        exclusive.
    shape: tuple (optional)
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0, 1]
        Percentage of missing data in generated data
    dtype: string in ('int', 'float')
        Type of data

    Returns
    -------
    numpy.ndarray
    """
    if dtype == "int":
        data = np.random.randint(bound[0], bound[1], size=shape).astype(float)
    elif dtype == "float":
        data = np.random.uniform(bound[0], bound[1], size=shape)
    corruptor = Corruptor(data, thr=thr)
    raw_data = getattr(corruptor, missingness)()
    return raw_data
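# A minimal usage sketch (illustrative only, not part of the library API):
# generate a 4x4 float matrix on [0, 1) with ~20% MCAR missingness and count
# the NaNs. Assumes numpy is imported as np at module level, as the function
# above already requires.
def _demo_randu():
    data = randu(bound=(0, 1), shape=(4, 4), thr=0.2, dtype="float")
    print(data)
    print("missing cells:", np.isnan(data).sum())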
def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"):
    """ Return a randomly generated dataset of numbers with normally
    distributed values with the given mu and sigma

    Parameters
    ----------
    theta: tuple (mu, sigma)
        Determines the mean and standard deviation of the values in the matrix
    shape: tuple (optional)
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0, 1]
        Percentage of missing data in generated data
    dtype: string in ('int', 'float')
        Type of data

    Returns
    -------
    numpy.ndarray
    """
    mean, sigma = theta
    data = np.random.normal(mean, sigma, size=shape)
    if dtype == "int":
        data = np.round(data)
    corruptor = Corruptor(data, thr=thr)
    raw_data = getattr(corruptor, missingness)()
    return raw_data
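# A minimal usage sketch (illustrative only): draw a 3x3 matrix from N(5, 2),
# round to integers, and corrupt ~30% of the cells completely at random.
def _demo_randn():
    data = randn(theta=(5, 2), shape=(3, 3), missingness="mcar", thr=0.3,
                 dtype="int")
    print(data)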
def impute_parameter_adjustment(method, param_grid, impute_ratio, x_init,
                                y_init, reference_x, reference_y):
    """Plot the classification error rate (CER) of a pre-trained voting model
    against each imputation parameter in param_grid, for every missingness
    ratio in impute_ratio."""
    model = joblib.load('..\\models\\vote_model_hard.joblib')
    markers = ['o', '*', '1', 's', '2']
    I = 20  # number of Monte Carlo repetitions per ratio
    for ratio, marker in zip(impute_ratio, markers):
        acc_1 = {i: 0 for i in param_grid}  # CER: predictions on imputed vs. original data
        acc_2 = {i: 0 for i in param_grid}  # CER: predictions on imputed data vs. true labels
        for m in range(I):
            # Corrupt the data with MCAR missingness at the current ratio.
            corruptor = Corruptor(x_init, ratio)
            x_miss = getattr(corruptor, "mcar")()
            for n in param_grid:
                if method == 'knn':
                    x_impute = fancyimpute.KNN(k=n).fit_transform(
                        np.vstack((x_miss, reference_x)))[range(x_init.shape[0])]
                elif method == 'mice':
                    # Average n posterior draws from the iterative imputer.
                    data_impute_list = []
                    for i in range(n):
                        imputer = fancyimpute.IterativeImputer(
                            n_iter=13, sample_posterior=True, random_state=i)
                        data_impute_list.append(
                            imputer.fit_transform(
                                np.vstack((x_miss, reference_x)))[range(x_init.shape[0])])
                    x_impute = np.mean(data_impute_list, 0)
                    print(ratio, m, n)
                elif method == 'em':
                    x_impute = em(np.vstack((x_miss, reference_x)),
                                  loops=n)[range(x_init.shape[0])]
                elif method == 'som':
                    x_impute = impute_SOM(x_miss, n)[range(x_init.shape[0])]
                y_pred1 = model.predict(x_impute)
                y_pred2 = model.predict(x_init)
                acc_1[n] += 1 - accuracy_score(y_pred2, y_pred1)
                acc_2[n] += 1 - accuracy_score(y_init, y_pred1)
        # Average the error rates over the I repetitions.
        acc_1 = {i: (j / I) for i, j in acc_1.items()}
        acc_2 = {i: (j / I) for i, j in acc_2.items()}
        plt.subplot(121)
        plt.plot(list(acc_1.keys()), list(acc_1.values()), marker=marker,
                 label='%.1f%%' % (ratio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and prediction')
        plt.subplot(122)
        plt.plot(list(acc_2.keys()), list(acc_2.values()), marker=marker,
                 label='%.1f%%' % (ratio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and real label')
    plt.legend(loc=0, bbox_to_anchor=(0.3, -0.05), ncol=5)
    plt.show()
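# A hedged usage sketch: the joblib data files are the ones loaded in the
# __main__ block below and are assumed to exist; the param_grid values are
# illustrative only.
def _demo_parameter_sweep():
    x_init = joblib.load('imputation_x.joblib')
    y_init = joblib.load('imputation_y.joblib')
    reference_x = joblib.load('reference_x.joblib')
    reference_y = joblib.load('reference_y.joblib')
    # Sweep the KNN neighbour count k over 1..15 at three missingness ratios.
    impute_parameter_adjustment('knn', list(range(1, 16)), [0.05, 0.1, 0.2],
                                x_init, y_init, reference_x, reference_y)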
def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
    """ Return a randomly generated dataset with uniformly distributed
    categorical data (alphabetic characters)

    Parameters
    ----------
    nlevels: int
        Specify the number of different categories in the dataset
    shape: tuple (optional)
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0, 1]
        Percentage of missing data in generated data

    Returns
    -------
    numpy.ndarray
    """
    if shape[0] * shape[1] < nlevels:
        raise error.BadInputError(
            "nlevels exceeds the size of the desired dataset. Please "
            "decrease nlevels or increase the shape")
    length = len(string.ascii_lowercase)
    n_fold = int(math.floor(math.log(nlevels, length)))
    cat_pool = list(string.ascii_lowercase)
    # When nlevels > 26 the single alphabetic characters are used up, so
    # generate extra multi-character strings as categorical labels.
    if n_fold > 0:
        for i in range(2, n_fold + 2):
            pool_candidate = list(
                itertools.product(string.ascii_lowercase, repeat=i))
            cat_pool.extend([''.join(w) for w in pool_candidate])
            if len(cat_pool) > nlevels:
                break
    cat = random.sample(cat_pool, nlevels)
    data = np.random.choice(cat, shape, replace=True)
    # Make sure the generated data contains exactly nlevels different categories.
    while len(np.unique(data)) != nlevels:
        data = np.random.choice(cat, shape, replace=True)
    corruptor = Corruptor(data, thr=thr, dtype=str)
    raw_data = getattr(corruptor, missingness)()
    return raw_data
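# A minimal usage sketch (illustrative only): asking for 30 categories forces
# the multi-character pool ('aa', 'ab', ...) described in the comment above.
def _demo_randc():
    data = randc(nlevels=30, shape=(10, 10), thr=0.2)
    # 30 distinct labels, plus whatever marker Corruptor uses for missing cells.
    print(np.unique(data))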
def mnist(missingness="mcar", thr=0.2):
    """ Loads a corrupted copy of MNIST

    Parameters
    ----------
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0, 1]
        Percentage of missing data in generated data

    Returns
    -------
    dict with the corrupted pixel matrix under "X" and the labels under "Y"
    """
    # fetch_mldata was removed from scikit-learn (mldata.org is offline);
    # fetch_openml serves the same 70000x784 MNIST matrix.
    from sklearn.datasets import fetch_openml
    dataset = fetch_openml('mnist_784', version=1, as_frame=False)
    corruptor = Corruptor(dataset.data, thr=thr)
    data = getattr(corruptor, missingness)()
    return {"X": data, "Y": dataset.target}
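# A minimal usage sketch (illustrative only; downloads MNIST from OpenML on
# the first call): check the shape and verify roughly thr of the pixels are
# missing.
def _demo_mnist():
    dataset = mnist(missingness="mcar", thr=0.2)
    print(dataset["X"].shape)             # (70000, 784)
    print(np.isnan(dataset["X"]).mean())  # ~0.2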
CER_result_1 = pd.DataFrame(data=None,
                            columns=['MEAN', 'MICE', 'EM', 'KNN_3', "SOM"],
                            index=[5, 10, 20, 30, 40])
for impute_ratio in [0.05, 0.1, 0.2, 0.3, 0.4]:
    mae_df = pd.DataFrame(data=None,
                          columns=['MEAN', 'MICE', 'EM', 'KNN_3', "SOM"],
                          index=np.arange(0, 20))
    cer_df = pd.DataFrame(data=None,
                          columns=['MEAN', 'MICE', 'EM', 'KNN_3', "SOM"],
                          index=np.arange(0, 20))
    cer_df_1 = pd.DataFrame(data=None,
                            columns=['MEAN', 'MICE', 'EM', 'KNN_3', "SOM"],
                            index=np.arange(0, 20))
    for j in range(20):
        corruptor = Corruptor(x_init, impute_ratio)
        x_miss = getattr(corruptor, "mcar")()
        print(len(list(x_miss[np.isnan(x_miss)])))  # number of corrupted cells
        x_miss = np.vstack((x_miss, reference_x))
        col_num = 0
        # mean and MICE imputation
        for i in ['mean', 'mice']:
            print(i, 'imputation---------------------------------')
            x_impute = impute(x_miss, i)[range(x_init.shape[0])]
            # Total absolute imputation error (the 'MAE' column is unnormalised).
            mae_df.iloc[j, col_num] = np.abs(x_impute - x_init).sum()
            cer_df.iloc[j, col_num] = CER(x_impute, x_init, y_init, 1)
            cer_df_1.iloc[j, col_num] = CER(x_impute, x_init, y_init, 2)
            col_num += 1
        # KNN imputation (n = 3, 5, 10)
        for n in [11]:
            print('KNN-', n, 'imputation---------------------------------')
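# The impute() and CER() helpers used above are defined elsewhere in this
# project. A minimal sketch of CER under that assumption, mirroring the
# acc_1 / acc_2 bookkeeping in impute_parameter_adjustment: mode 1 compares
# predictions on the imputed data against predictions on the original data,
# mode 2 compares them against the true labels. The explicit model argument
# is an assumption; the project loads 'vote_model_hard.joblib' for this.
def _cer_sketch(x_impute, x_init, y_init, mode, model):
    y_pred = model.predict(x_impute)
    reference = model.predict(x_init) if mode == 1 else y_init
    return 1 - accuracy_score(reference, y_pred)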
def MLP_impute(x, y):
    # For every missing cell (i, j), assemble a training set from the rows
    # that are fully observed once column j is removed. (The regressor fit
    # and the actual imputation are not implemented here.)
    for i, j in np.argwhere(np.isnan(x)):
        temp = np.delete(x, [j], axis=1)
        temp = np.array(
            pd.DataFrame(np.hstack([temp, y.reshape(-1, 1)])).dropna(axis=0))
        y_train = temp[:, -1]
        x_train = np.delete(temp, [-1], axis=1)


if __name__ == '__main__':
    os.chdir("..\\datas")
    x_init = joblib.load('imputation_x.joblib')
    y_init = joblib.load('imputation_y.joblib')
    reference_x = joblib.load('reference_x.joblib')
    reference_y = joblib.load('reference_y.joblib')
    corruptor = Corruptor(x_init, 0.1)
    x_miss = getattr(corruptor, "mcar")()
    x_miss = np.vstack((x_miss, reference_x))
    y_miss = pd.concat((y_init, reference_y), axis=0)
    data_df = pd.DataFrame(
        pd.concat((pd.DataFrame(x_miss), y_miss.reset_index(drop=True)),
                  axis=1))
    print(data_df.head())
    # '总分' ("total score") is the label column of the assembled frame.
    x_train, x_test, y_train, y_test = train_test_split(
        np.array(data_df.dropna(axis=0, how='any').drop(columns='总分')),
        np.array(data_df.dropna(axis=0, how='any')['总分']),
        test_size=0.2, random_state=7)
    w, b = SLP_train(x_train, y_train, learning_rate=0.001, epochs=100,
                     batch_size=10)
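# MLP_impute above stops after assembling its per-cell training set. A hedged
# completion sketch, assuming scikit-learn's MLPRegressor as the regressor
# (the original regressor is not shown in this file) and treating column j
# as the regression target given the remaining features plus the label y.
def _mlp_impute_sketch(x, y):
    from sklearn.neural_network import MLPRegressor
    x = x.copy()
    for i, j in np.argwhere(np.isnan(x)):
        # Features: all columns except j, plus the label; target: column j.
        features = np.hstack([np.delete(x, [j], axis=1), y.reshape(-1, 1)])
        full = pd.DataFrame(
            np.hstack([features, x[:, [j]]])).dropna(axis=0).values
        # Refits once per missing cell; cache per column if speed matters.
        model = MLPRegressor(hidden_layer_sizes=(32,), max_iter=500)
        model.fit(full[:, :-1], full[:, -1])
        # Predict the missing cell from row i's remaining observed features.
        row = np.hstack([np.delete(x[i], [j]), y[i]])
        if not np.isnan(row).any():
            x[i, j] = model.predict(row.reshape(1, -1))[0]
    return x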