def main(args, yy):
    '''Main function.

    Args:
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - RMSE: Root Mean Squared Error
      - MAE: Mean Absolute Error
    '''
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m, ward_nor_list = data_loader(miss_rate, yy)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    RMSE, MAE = test_loss(ori_data_x, imputed_data_x, ward_nor_list)
    return RMSE, MAE
def main(args):
    '''Impute the given dataset and save the result.

    Args:
      - data_name: name of the dataset (e.g. LossSight)
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
    '''
    data_name = args.data_name

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    file_name = 'data/' + data_name + '.csv'
    miss_data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Save result
    np.savetxt("result.csv", imputed_data_x, delimiter=',')

    return imputed_data_x
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
      - data_name: letter or spam
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations
      - mechanism: missingness mechanism
      - kfold: evaluate with k-fold cross-validation

    Prints the mean and standard deviation of the RMSE across folds when
    kfold is set; otherwise trains and evaluates on the full index range.
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate, args.mechanism)

    # Impute missing data
    if args.kfold:
        rmse_list = []
        for i, (train_index, test_index) in enumerate(
                KFold(shuffle=True, random_state=1).split(ori_data_x)):
            rmse = gain(miss_data_x, gain_parameters, ori_data_x,
                        train_index, test_index, args.mechanism)
            rmse_list.append(rmse)
        print(np.mean(rmse_list), np.std(rmse_list))
    else:
        train_index = test_index = range(len(ori_data_x))
        gain(miss_data_x, gain_parameters, ori_data_x, train_index,
             test_index, args.mechanism)
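# For context, a hedged sketch of the argparse wiring that typically drives a
# main(args) like the one above. The flag names mirror the attributes read
# from args; the defaults and this entry point are assumptions, not taken
# from the original file:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_name', choices=['letter', 'spam'], default='spam')
    parser.add_argument('--miss_rate', type=float, default=0.2)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--hint_rate', type=float, default=0.9)
    parser.add_argument('--alpha', type=float, default=100)
    parser.add_argument('--iterations', type=int, default=10000)
    parser.add_argument('--mechanism', default='MCAR')  # assumed default
    parser.add_argument('--kfold', action='store_true')
    main(parser.parse_args())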
def test_001_t(self):
    # set up flowgraph
    src_data = (0, 1, -2, 5.5, -0.5)
    expected_result = (0, 2, -4, 11, -1)
    src = blocks.vector_source_f(src_data)
    # Bind to a name other than `gain`: assigning to `gain` locally would
    # make the call `gain(2)` raise UnboundLocalError.
    gain_block = gain(2)
    snk = blocks.vector_sink_f()
    self.tb.connect(src, gain_block)
    self.tb.connect(gain_block, snk)
    self.tb.run()
    result_data = snk.data()
    self.assertFloatTuplesAlmostEqual(expected_result, result_data, 6)
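# Note: if the custom gain block only scales each sample by a constant,
# stock GNU Radio ships an equivalent block that could be swapped into the
# same flowgraph (shown as a hedged alternative, not the original's choice):
#
#   gain_block = blocks.multiply_const_ff(2.0)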
def main(args):
    '''Main function.

    Args:
      - data_name: the file name of the dataset
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations
      - onehot: the number of features for the one-hot encoder (starting from the first feature)
      - predict: prediction mode; if on, no random mask is applied and the model is saved

    Returns:
      - imputed_data_x: imputed data
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'data_name': args.data_name,
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
        'onehot': args.onehot,
        'predict': args.predict
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m, feature_name, onehotencoder, ori_data_dim = data_loader(
        data_name, miss_rate, args.onehot, args.predict)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, feature_name, onehotencoder, ori_data_dim,
                          gain_parameters)

    # Save imputed data
    pd.DataFrame(imputed_data_x, columns=feature_name).to_csv(
        'data_imputed/' + data_name + "_imputed.csv", index=False, header=True)

    return imputed_data_x
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
      - data_name: letter or spam
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
      - rmse: Root Mean Squared Error
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Report the RMSE and MAE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    mae = mae_loss(ori_data_x, imputed_data_x, data_m)

    print()
    print('RMSE Performance: ' + str(np.round(rmse, 4)))
    print('MAE Performance: ' + str(np.round(mae, 4)))

    return imputed_data_x, rmse
def main(args):
    '''Impute an arbitrary dataset with GAIN and save the result.

    Args:
      - data_path: path to the input data file
      - output_path: path for the imputed output CSV
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
    '''
    data_path = args.data_path
    output_path = args.output_path

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and replace missingness with nan
    miss_data_x = data_replace(data_path)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)
    np.savetxt(output_path, imputed_data_x, delimiter=',')

    # # Report the RMSE performance
    # rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    # print()
    # print('RMSE Performance: ' + str(np.round(rmse, 4)))

    return imputed_data_x
def main():
    data_names = ['letter', 'spam']
    # data_names = ['breasttissue','glass', 'thyroid']
    # Datasets with continuous features and no originally missing values:
    # data_names = ['balance','banknote','blood','breasttissue', 'climate','connectionistvowel',
    #               'ecoli','glass','hillvalley','ionosphere', 'parkinsons','planning','seedst',
    #               'thyroid','vehicle','vertebral','wine','yeast']
    print(len(data_names))

    miss_rate = 0.2
    batch_size = 64
    alpha = 100
    iterations = 1000
    n_times = 3

    wb = xlwt.Workbook()
    sh_rmse = wb.add_sheet("GAIN_rmse")
    # sh_acc = wb.add_sheet("EGAIN_acc")
    sh_acc_dct = wb.add_sheet("GAIN_acc_dct")
    sh_acc_knn = wb.add_sheet("GAIN_acc_knn")
    sh_acc_nb = wb.add_sheet("GAIN_acc_nb")
    sh_acc_lr = wb.add_sheet("GAIN_acc_lr")

    for k in range(len(data_names)):
        data_name = data_names[k]
        gain_parameters = {
            'batch_size': batch_size,
            'alpha': alpha,
            'iterations': iterations
        }
        print("Dataset: ", data_name)
        rmse = []
        # acc_dct = []
        # acc_knn = []
        # acc_nb = []
        ori_data_x, y, miss_data_x, m = data_loader(data_name, miss_rate)

        sh_rmse.write(0, k, data_name)
        sh_acc_dct.write(0, k, data_name)
        sh_acc_knn.write(0, k, data_name)
        sh_acc_nb.write(0, k, data_name)
        sh_acc_lr.write(0, k, data_name)
        # sh_acc.write(0, 0, 'dct')
        # sh_acc.write(0, 1, 'knn')
        # sh_acc.write(0, 2, 'nb')

        for i in range(n_times):
            # Impute missing data
            imputed_data_x = gain(miss_data_x, gain_parameters)
            imputed_data_x, _ = normalization(imputed_data_x)

            # Calculate RMSE
            rmse.append(rmse_loss(ori_data_x, imputed_data_x, m))
            print('{:2d}/{:2d}'.format(i + 1, n_times), end=':')
            print('RMSE = ' + str(np.round(rmse[-1], 4)))
            sh_rmse.write(i + 1, k, str(np.round(rmse[-1], 4)))

            if data_name in ['letter', 'spam']:
                continue

            scf = StratifiedShuffleSplit(n_splits=10)
            score_dct = cross_val_score(DecisionTreeClassifier(), imputed_data_x, y,
                                        cv=scf, scoring='accuracy')
            print(score_dct)
            # acc_dct.extend(score_dct)
            sh_acc_dct.write(i + 1, k, str(np.round(np.mean(score_dct), 4)))
            # for j in range(len(score_dct)):
            #     sh_acc.write(i * 5 + j + 1, 0, str(np.round(score_dct[j], 4)))

            score_knn = cross_val_score(KNeighborsClassifier(), imputed_data_x, y,
                                        cv=scf, scoring='accuracy')
            print(score_knn)
            # acc_knn.extend(score_knn)
            sh_acc_knn.write(i + 1, k, str(np.round(np.mean(score_knn), 4)))
            # for j in range(len(score_knn)):
            #     sh_acc.write(i * 5 + j + 1, 1, str(np.round(score_knn[j], 4)))

            score_nb = cross_val_score(GaussianNB(), imputed_data_x, y,
                                       cv=scf, scoring='accuracy')
            print(score_nb)
            # acc_nb.extend(score_nb)
            sh_acc_nb.write(i + 1, k, str(np.round(np.mean(score_nb), 4)))
            # for j in range(len(score_nb)):
            #     sh_acc.write(i * 5 + j + 1, 2, str(np.round(score_nb[j], 4)))

            score_lr = cross_val_score(LogisticRegression(max_iter=1000), imputed_data_x, y,
                                       cv=scf, scoring='accuracy')
            print(score_lr)
            sh_acc_lr.write(i + 1, k, str(np.round(np.mean(score_lr), 4)))

        # rmse = np.array(rmse)
        # acc_dct = np.array(acc_dct)
        # acc_knn = np.array(acc_knn)
        # acc_nb = np.array(acc_nb)
        # print("RMSE mean = {:.4f}; variance = {:.4f} ".format(np.mean(rmse), np.std(rmse)))
        # print("Acc mean = {:.4f}; variance = {:.4f} ".format(np.mean(acc_dct), np.std(acc_dct)))
        # print("Acc mean = {:.4f}; variance = {:.4f} ".format(np.mean(acc_knn), np.std(acc_knn)))
        # print("Acc mean = {:.4f}; variance = {:.4f} ".format(np.mean(acc_nb), np.std(acc_nb)))
        print("---------------------------")

    wb.save('GAIN_results_15.xls')
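# For reference, a minimal sketch of what rmse_loss above computes, assuming
# it follows the reference GAIN convention: the error is taken only over the
# erased entries (mask m == 0). The reference implementation additionally
# min-max normalizes both arrays first, which is omitted here for brevity.
import numpy as np

def masked_rmse(ori_data, imputed_data, data_m):
    # data_m is 1 where the value was observed, 0 where it was erased
    miss = 1 - data_m
    nominator = np.sum((miss * ori_data - miss * imputed_data) ** 2)
    denominator = np.sum(miss)
    return np.sqrt(nominator / float(denominator))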
# rect is (left, bottom, right, top); the original right=0 collapsed the layout
plt.tight_layout(rect=[0, 0.03, 1, 1.25])
plt.subplots_adjust(hspace=1, wspace=0.35)
plt.show()

X = np.asarray(all_values_for_variables).astype(float).transpose()

gain_parameters = {
    'batch_size': 128,
    'hint_rate': 0.9,
    'alpha': 1000,
    'iterations': 1000
}

# imputer = KNNImputer(n_neighbors=5)
# imputed_variable_values = imputer.fit_transform(X).transpose()
imputed_variable_values = gain(X, gain_parameters).transpose()

df = pd.DataFrame(imputed_variable_values)
patients_labels = []
patients_features = []
PLT_ALL = True

for i in range(0, len(selected_patient_ids)):
    patient_id = selected_patient_ids[i]
    patient_features = []
    for k in range(0, len(selected_variables)):
        variable_name = selected_variables[k]
        imputed_values_for_variable = df.values[k, :]
def main(iterations=NUM_ITERATIONS, batch_size=128, hint_rate=0.5, miss_rate=0.3):
    gain_parameters = {
        'batch_size': batch_size,
        'hint_rate': hint_rate,
        'iterations': iterations
    }
    enable_transform = False
    remove_outliers = False
    n_time_points = 3

    data_x = pickle.load(open('./missing_data.sav', 'rb'))
    data_x = data_x.transpose().astype(float)[:, :]

    # Remove the variables listed in remove_variables
    no, dim = data_x.shape
    removed = 0
    for d in range(0, dim):
        if variables[d - removed] in remove_variables:
            variables.remove(variables[d - removed])
            data_x = np.delete(data_x, d - removed, axis=1)
            removed += 1
    no, dim = data_x.shape

    if len(variables) != dim:
        print(len(variables), dim)
        print('Incompatible dimensions.')
        exit()

    no_total = no * dim
    no_nan = np.count_nonzero(np.isnan(data_x.flatten()))
    no_not_nan = no_total - no_nan
    n_patients = int(no / n_time_points)
    miss_data_x = np.copy(data_x)

    print('Input shape', no, 'x', dim)
    print('NAN values:', no_nan, '/', no_total,
          '%2.f%%' % (no_nan / no_total * 100))

    # Introduce missing data
    data_m = binary_sampler(1 - miss_rate, no, dim)
    miss_data_x[data_m == 0] = np.nan

    transformer = RobustScaler()
    miss_data_x = transformer.fit_transform(miss_data_x)

    no_nan = np.count_nonzero(np.isnan(miss_data_x.flatten()))
    no_not_nan = no_total - no_nan
    print('After removal, NAN values:', no_nan, '/', no_total,
          '%2.f%%' % (no_nan / no_total * 100))
    real_miss_rate = (no_nan / no_total * 100)

    # Swap (one row per time point) to (one column per time point)
    miss_data_x_gan_tmp = np.zeros((n_patients, dim * n_time_points))
    for i in range(0, n_patients):
        for j in range(0, dim):
            for n in range(0, n_time_points):
                miss_data_x_gan_tmp[i, n * dim + j] = \
                    miss_data_x[i * n_time_points + n, j]

    imputed_data_x_gan_tmp = gain(miss_data_x_gan_tmp, gain_parameters)
    imputed_data_x_gan = np.copy(miss_data_x)

    # Swap (one column per time point) back to (one row per time point)
    for i in range(0, n_patients):
        for j in range(0, dim):
            for n in range(0, n_time_points):
                imputed_data_x_gan[i * n_time_points + n, j] = \
                    imputed_data_x_gan_tmp[i, n * dim + j]

    imputer = KNNImputer(n_neighbors=5)
    imputed_data_x_knn = imputer.fit_transform(miss_data_x)
    imputer = IterativeImputer(verbose=True)
    imputed_data_x_mice = imputer.fit_transform(miss_data_x)

    imputed_data_x_gan = transformer.inverse_transform(imputed_data_x_gan)
    imputed_data_x_knn = transformer.inverse_transform(imputed_data_x_knn)
    imputed_data_x_mice = transformer.inverse_transform(imputed_data_x_mice)

    # Save imputed data to disk
    pickle.dump(imputed_data_x_gan, open('./filled_data.sav', 'wb'))

    # Get residuals for computation of stats
    distances_gan = np.zeros((dim, n_time_points * n_patients))
    distances_knn = np.zeros((dim, n_time_points * n_patients))
    distances_mice = np.zeros((dim, n_time_points * n_patients))
    distributions = {'deleted': [], 'gan': [], 'knn': [], 'mice': []}

    from scipy.stats import iqr

    for j in range(0, dim):
        nn_values = data_x[:, j].flatten()
        nn_values = nn_values[~np.isnan(nn_values)]
        dim_iqr = np.mean(nn_values)  # iqr(nn_values)
        for i in range(0, n_patients):
            variable_name = variables[j]
            i_start = int(i * n_time_points)
            i_stop = int(i * n_time_points + n_time_points)
            original_tuple = data_x[i_start:i_stop, j]
            corrupted_tuple = miss_data_x[i_start:i_stop, j]
            imputed_tuple_gan = imputed_data_x_gan[i_start:i_stop, j]
            imputed_tuple_knn = imputed_data_x_knn[i_start:i_stop, j]
            imputed_tuple_mice = imputed_data_x_mice[i_start:i_stop, j]
            # if i == 1 or i == 2:
            #     print(original_tuple, corrupted_tuple, imputed_tuple_gan, imputed_tuple_knn)
            for k in range(0, n_time_points):
                a, b, c, d = (original_tuple[k], imputed_tuple_gan[k],
                              imputed_tuple_knn[k], imputed_tuple_mice[k])
                if np.isnan(a) or data_m[i_start + k, j] != 0:
                    continue
                # if i % 10 == 0: print(variable_name, a, b, c, d, b - a)
                # Index by i * n_time_points + k; the original i * k collided
                # (k == 0 always hit slot 0) and overwrote residuals.
                distances_gan[j, i * n_time_points + k] = b - a
                distances_knn[j, i * n_time_points + k] = c - a
                distances_mice[j, i * n_time_points + k] = d - a

    # Compute distance statistics
    all_stats = {}
    for j in range(0, dim):
        print('%d. Imputed variable: %s' % (j, variables[j]))
        current_stats = {'gan': {}, 'knn': {}, 'mice': {}}

        # Stats for original data
        dim_mean = np.mean([x for x in data_x[:, j] if not np.isnan(x)])
        dim_max = np.max([x for x in data_x[:, j] if not np.isnan(x)])
        dim_iqr = iqr([x for x in data_x[:, j] if not np.isnan(x)])

        # Indices of removed data
        ind = (data_m[:, j] == 0).flatten() & (~np.isnan(data_x[:, j])).flatten()

        # Stats for GAN
        current_stats['gan']['bias'] = np.mean(distances_gan[j])
        current_stats['gan']['rmse'] = np.sqrt(np.mean(distances_gan[j] ** 2))
        current_stats['gan']['nrmse'] = current_stats['gan']['rmse'] / dim_iqr
        current_stats['gan']['mape'] = np.mean(np.abs(distances_gan[j]))
        current_stats['gan']['wd'] = wasserstein_distance(
            data_x[ind, j].flatten(), imputed_data_x_gan[ind, j].flatten())

        # Stats for KNN
        current_stats['knn']['bias'] = np.mean(distances_knn[j])
        current_stats['knn']['rmse'] = np.sqrt(np.mean(distances_knn[j] ** 2))
        current_stats['knn']['nrmse'] = current_stats['knn']['rmse'] / dim_iqr
        current_stats['knn']['mape'] = np.mean(np.abs(distances_knn[j]))
        current_stats['knn']['wd'] = wasserstein_distance(
            data_x[ind, j].flatten(), imputed_data_x_knn[ind, j].flatten())

        # Stats for MICE
        current_stats['mice']['bias'] = np.mean(distances_mice[j])
        current_stats['mice']['rmse'] = np.sqrt(np.mean(distances_mice[j] ** 2))
        current_stats['mice']['nrmse'] = current_stats['mice']['rmse'] / dim_iqr
        current_stats['mice']['mape'] = np.mean(np.abs(distances_mice[j]))
        current_stats['mice']['wd'] = wasserstein_distance(
            data_x[ind, j].flatten(), imputed_data_x_mice[ind, j].flatten())

        for model_name in current_stats:
            model = current_stats[model_name]
            print('... %s - bias: %.3f, RMSE: %.3f, ME: %.3f, WD: %.3f' %
                  (model_name, model['bias'], model['rmse'], model['mape'], model['wd']))
        all_stats[variables[j]] = dict(current_stats)
        print()

    n_fig_rows, n_fig_cols = 6, 6
    n_fig_total = n_fig_rows * n_fig_cols
    if dim > n_fig_total:
        print('Warning: not all variables plotted')

    all_fig_axes = [
        plt.subplots(n_fig_rows, n_fig_cols, figsize=(15, 15))
        for _ in range(0, 3)
    ]

    for j in range(0, dim):
        dim_not_nan = np.count_nonzero(~np.isnan(data_x[:, j]))
        deleted_no = np.count_nonzero(np.isnan(miss_data_x[:, j]) & ~np.isnan(data_x[:, j]))
        ax_title = variables[j] + (' (%d of %d observed)' % (deleted_no, dim_not_nan))
        dim_axes = [
            fig_axes[1][int(j / n_fig_cols), j % n_fig_cols]
            for fig_axes in all_fig_axes
        ]
        for ax in dim_axes:
            ax.set_title(ax_title, fontdict={'fontsize': 7, 'fontweight': 'bold'})

        input_arrays = [data_x, imputed_data_x_gan, imputed_data_x_knn, imputed_data_x_mice]
        output_arrays = [
            np.asarray([input_arr[ii, j] for ii in range(0, no)
                        if (not np.isnan(data_x[ii, j]) and data_m[ii, j] == 0)])
            for input_arr in input_arrays
        ]
        deleted_values, imputed_values_gan, imputed_values_knn, imputed_values_mice = output_arrays

        plot_distribution_densities(output_arrays, all_stats, variables[j], dim_axes[0])
        plot_distribution_residuals(output_arrays, dim_axes[1])
        plot_distribution_summaries(output_arrays, dim_axes[2])

        # Make QQ plot of original and deleted values vs. normal distribution
        # dist_max = np.max(np.concatenate([imputed_values_gan, deleted_values]))
        # qqplot_1sample((data_x[~np.isnan(data_x[:, j]), j] - dist_min) / dist_max, ax=ax3, color='b')
        # qqplot_1sample((data_x[data_m[:, j] == 0, j] - dist_min) / dist_max, ax=ax3, color='r', draw_line=False)

    # Figure 1
    fig1 = all_fig_axes[0][0]
    top_title = 'Kernel density estimation for erased and predicted values, for each imputation method'
    fig1.suptitle(top_title, fontsize=8)
    fig1.tight_layout(rect=[0, 0.03, 1, 1.25])
    fig1.subplots_adjust(hspace=1, wspace=0.35)

    # Figure 2
    fig2 = all_fig_axes[1][0]
    top_title = 'Q-Q plot of erased vs. imputed values, for each imputation method'
    fig2.suptitle(top_title, fontsize=8)
    fig2.tight_layout(rect=[0, 0.03, 1, 1.25])
    fig2.subplots_adjust(hspace=1, wspace=0.35)

    # Figure 3
    fig3 = all_fig_axes[2][0]
    top_title = 'Bayesian confidence intervals for the mean and standard deviation, for erased values and imputed values'
    fig3.suptitle(top_title, fontsize=8)
    fig3.tight_layout(rect=[0, 0.03, 1, 1.25])
    fig3.subplots_adjust(hspace=1, wspace=0.35)

    # Figure 4
    fig5, ax5 = plt.subplots(1, 1)
    top_title = 'Distribution of normalized RMSEs for each imputation method'
    fig5.suptitle(top_title, fontsize=8)
    plot_error_distributions(all_stats, fig5, ax5)
    ax5.set_ylabel('Probability density', fontsize=6)
    ax5.set_xlabel('NRMSE (normalized to IQR)', fontsize=6)
    ax5.legend(fontsize=6)
    fig5.tight_layout(rect=[0, 0.03, 1, 1.25])
    fig5.subplots_adjust(hspace=1, wspace=0.35)

    plt.show()

    for model_name in ['gan', 'knn', 'mice']:
        wds = [all_stats[variable_name][model_name]['wd']
               for variable_name in all_stats]
        nrmses = [all_stats[variable_name][model_name]['nrmse']
                  for variable_name in all_stats]
        mwd = np.round(np.asarray(wds).mean(), 2)
        mnrmse = np.round(np.asarray(nrmses).mean(), 2)
        print('Model: %s - average WD = %.2f, average NRMSE = %.2f' %
              (model_name, mwd, mnrmse))

    return all_stats
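# Aside: the two triple loops in main() above implement a plain C-order
# reshape. Row (i * n_time_points + n), column j of the
# (patients * time points, dim) matrix maps exactly to row i, column
# (n * dim + j) of the (patients, time points * dim) matrix, so a vectorized
# equivalent (a sketch, assuming the same array shapes) is:
#
#   miss_data_x_gan_tmp = miss_data_x.reshape(n_patients, n_time_points * dim)
#   imputed_data_x_gan = imputed_data_x_gan_tmp.reshape(n_patients * n_time_points, dim)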
def main(args):
    '''Main function.

    Args:
      - data_name: name of the dataset
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
      - rmse: Root Mean Squared Error
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m, data_y = data_loader(data_name, miss_rate)
    imputed_data_x = gain(miss_data_x, gain_parameters)
    # pd.DataFrame(data_y, imputed_data_x, axis=1)

    # Step - create data_m using test data
    # Step - combine train and missing test data
    # Step - return total missing data and original data_m
    # Step - while calculating RMSE:
    #          use original as test_original,
    #          fetch the imputed test dataset (row 934 to last),
    #          data_m as missing_test_data
    if data_name == 'vals_test_df':
        imputed_data_x = imputed_data_x[range(918, 1311), :]
    elif data_name == 'vals_test_df_test_type1':
        imputed_data_x = imputed_data_x[range(495, 1311), :]
    elif data_name == 'vals_test_df_test_type2':
        imputed_data_x = imputed_data_x[range(816, 1311), :]

    imputed_data_x_df = pd.DataFrame(imputed_data_x)
    data_y_df = pd.DataFrame(data_y)
    imputed_data_df = pd.concat([data_y_df, imputed_data_x_df],
                                ignore_index=True, axis=1)
    imputed_data_df.to_csv("GAN_imputated_catalogueData1.csv", index=False)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    print()
    print('RMSE Performance: ' + str(np.round(rmse, 4)))

    return imputed_data_x, rmse
def main():
    data_names = ['letter', 'spam']
    # data_names = ['breasttissue','glass', 'thyroid']
    # Datasets with continuous features and no originally missing values:
    # data_names = ['balance','banknote','blood','breasttissue', 'climate','connectionistvowel',
    #               'ecoli','glass','hillvalley','ionosphere', 'parkinsons','planning','seedst',
    #               'thyroid','vehicle','vertebral','wine','yeast']
    print(len(data_names))

    miss_rate = 0.2
    batch_size = 64
    alpha = 100
    iterations = 1000
    n_times = 30

    wb_gain = xlwt.Workbook()
    sh_rmse_gain = wb_gain.add_sheet("GAIN_rmse")
    sh_acc_dct_gain = wb_gain.add_sheet("GAIN_acc_dct")
    sh_acc_knn_gain = wb_gain.add_sheet("GAIN_acc_knn")
    sh_acc_nb_gain = wb_gain.add_sheet("GAIN_acc_nb")
    sh_acc_lr_gain = wb_gain.add_sheet("GAIN_acc_lr")

    wb_egain = xlwt.Workbook()
    sh_rmse_egain = wb_egain.add_sheet("EGAIN_rmse")
    sh_acc_dct_egain = wb_egain.add_sheet("EGAIN_acc_dct")
    sh_acc_knn_egain = wb_egain.add_sheet("EGAIN_acc_knn")
    sh_acc_nb_egain = wb_egain.add_sheet("EGAIN_acc_nb")
    sh_acc_lr_egain = wb_egain.add_sheet("EGAIN_acc_lr")

    wb_mean = xlwt.Workbook()
    sh_rmse_mean = wb_mean.add_sheet("MEAN_rmse")
    sh_acc_dct_mean = wb_mean.add_sheet("MEAN_acc_dct")
    sh_acc_knn_mean = wb_mean.add_sheet("MEAN_acc_knn")
    sh_acc_nb_mean = wb_mean.add_sheet("MEAN_acc_nb")
    sh_acc_lr_mean = wb_mean.add_sheet("MEAN_acc_lr")

    wb_knn = xlwt.Workbook()
    sh_rmse_knn = wb_knn.add_sheet("KNN_rmse")
    sh_acc_dct_knn = wb_knn.add_sheet("KNN_acc_dct")
    sh_acc_knn_knn = wb_knn.add_sheet("KNN_acc_knn")
    sh_acc_nb_knn = wb_knn.add_sheet("KNN_acc_nb")
    sh_acc_lr_knn = wb_knn.add_sheet("KNN_acc_lr")

    for k in range(len(data_names)):
        data_name = data_names[k]
        gain_parameters = {
            'batch_size': batch_size,
            'alpha': alpha,
            'iterations': iterations
        }
        ori_data_x, y, miss_data_x, m = data_loader(data_name, miss_rate)
        print("Dataset: ", data_name)

        ########################### Mean imputation #################################
        print('Mean imputation')
        sh_rmse_mean.write(0, k, data_name)
        sh_acc_dct_mean.write(0, k, data_name)
        sh_acc_knn_mean.write(0, k, data_name)
        sh_acc_nb_mean.write(0, k, data_name)
        sh_acc_lr_mean.write(0, k, data_name)
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        imputed_data_x = imp.fit_transform(miss_data_x)
        sh_rmse_mean.write(1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))
        # Normalize data and classify
        # imputed_data_x, _ = normalization(imputed_data_x)
        # scf = StratifiedShuffleSplit(n_splits=10)
        # # DCT classifier
        # score_dct_mean = cross_val_score(DecisionTreeClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_dct_mean.write(1, k, np.round(np.mean(score_dct_mean), 4))
        # # KNN classifier
        # score_knn_mean = cross_val_score(KNeighborsClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_knn_mean.write(1, k, np.round(np.mean(score_knn_mean), 4))
        # # NB classifier
        # score_nb_mean = cross_val_score(GaussianNB(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_nb_mean.write(1, k, np.round(np.mean(score_nb_mean), 4))
        # # LR classifier
        # score_lr_mean = cross_val_score(LogisticRegression(max_iter=1000), imputed_data_x, y, cv=scf,
        #                                 scoring='accuracy')
        # sh_acc_lr_mean.write(1, k, np.round(np.mean(score_lr_mean), 4))

        ########################### KNN imputation ##################################
        print('KNN imputation')
        sh_rmse_knn.write(0, k, data_name)
        sh_acc_dct_knn.write(0, k, data_name)
        sh_acc_knn_knn.write(0, k, data_name)
        sh_acc_nb_knn.write(0, k, data_name)
        sh_acc_lr_knn.write(0, k, data_name)
        imp = KNNImputer(missing_values=np.nan)
        imputed_data_x = imp.fit_transform(miss_data_x)
        sh_rmse_knn.write(1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))
        # # Normalize data and classify
        # imputed_data_x, _ = normalization(imputed_data_x)
        # scf = StratifiedShuffleSplit(n_splits=10)
        # # DCT classifier
        # score_dct_knn = cross_val_score(DecisionTreeClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_dct_knn.write(1, k, np.round(np.mean(score_dct_knn), 4))
        # # KNN classifier
        # score_knn_knn = cross_val_score(KNeighborsClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_knn_knn.write(1, k, np.round(np.mean(score_knn_knn), 4))
        # # NB classifier
        # score_nb_knn = cross_val_score(GaussianNB(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_nb_knn.write(1, k, np.round(np.mean(score_nb_knn), 4))
        # # LR classifier
        # score_lr_knn = cross_val_score(LogisticRegression(max_iter=1000), imputed_data_x, y, cv=scf,
        #                                scoring='accuracy')
        # sh_acc_lr_knn.write(1, k, np.round(np.mean(score_lr_knn), 4))

        ########################### GAIN imputation #################################
        print('GAIN imputation')
        sh_rmse_gain.write(0, k, data_name)
        sh_acc_dct_gain.write(0, k, data_name)
        sh_acc_knn_gain.write(0, k, data_name)
        sh_acc_nb_gain.write(0, k, data_name)
        sh_acc_lr_gain.write(0, k, data_name)
        for i in tqdm(range(n_times)):
            # Impute missing data
            imputed_data_x = gain(miss_data_x, gain_parameters)
            sh_rmse_gain.write(i + 1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))
            # # Normalize data and classify
            # imputed_data_x, _ = normalization(imputed_data_x)
            # scf = StratifiedShuffleSplit(n_splits=10)
            # # DCT classifier
            # score_dct_gain = cross_val_score(DecisionTreeClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_dct_gain.write(i + 1, k, np.round(np.mean(score_dct_gain), 4))
            # # KNN classifier
            # score_knn_gain = cross_val_score(KNeighborsClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_knn_gain.write(i + 1, k, np.round(np.mean(score_knn_gain), 4))
            # # NB classifier
            # score_nb_gain = cross_val_score(GaussianNB(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_nb_gain.write(i + 1, k, np.round(np.mean(score_nb_gain), 4))
            # # LR classifier
            # score_lr_gain = cross_val_score(LogisticRegression(max_iter=1000), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_lr_gain.write(i + 1, k, np.round(np.mean(score_lr_gain), 4))

        ########################### EGAIN imputation ################################
        print('EGAIN imputation')
        sh_rmse_egain.write(0, k, data_name)
        sh_acc_dct_egain.write(0, k, data_name)
        sh_acc_knn_egain.write(0, k, data_name)
        sh_acc_nb_egain.write(0, k, data_name)
        sh_acc_lr_egain.write(0, k, data_name)
        for i in tqdm(range(n_times)):
            imputed_data_x = Egain(miss_data_x, gain_parameters)
            sh_rmse_egain.write(i + 1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))
            # # Normalize data and classify
            # imputed_data_x, _ = normalization(imputed_data_x)
            # scf = StratifiedShuffleSplit(n_splits=10)
            # # DCT classifier
            # score_dct_egain = cross_val_score(DecisionTreeClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_dct_egain.write(i + 1, k, np.round(np.mean(score_dct_egain), 4))
            # # KNN classifier
            # score_knn_egain = cross_val_score(KNeighborsClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_knn_egain.write(i + 1, k, np.round(np.mean(score_knn_egain), 4))
            # # NB classifier
            # score_nb_egain = cross_val_score(GaussianNB(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_nb_egain.write(i + 1, k, np.round(np.mean(score_nb_egain), 4))
            # # LR classifier
            # score_lr_egain = cross_val_score(LogisticRegression(max_iter=1000), imputed_data_x, y, cv=scf,
            #                                  scoring='accuracy')
            # sh_acc_lr_egain.write(i + 1, k, np.round(np.mean(score_lr_egain), 4))

    wb_gain.save('GAIN_test.xls')
    wb_egain.save('EGAIN_test.xls')
    wb_mean.save('MEAN_test.xls')
    wb_knn.save('KNN_test.xls')
import struct
import wave

# (tail of the note list; earlier entries are truncated in this snippet)
    [VOLUME, G4],
    [0, C5],
    [VOLUME, G4],
    [VOLUME, C5],
    [VOLUME, F5],
    [VOLUME, E5],
    [VOLUME, C5],
]

from fade import fade
from gain import gain
from repeat import repeat
from square import square_wave

all_samples = []
quarter_second = 44100 // 4

for volume, frequency in notes:
    samples = square_wave(int(44100 / frequency // 2))
    samples = gain(samples, volume)
    samples = repeat(samples, quarter_second)
    samples = fade(samples, quarter_second)
    all_samples.extend(samples)

all_samples = [int(sample) for sample in all_samples]

w = wave.open('music.wav', 'wb')
w.setnchannels(1)
w.setsampwidth(2)
w.setframerate(44100)
w.writeframes(struct.pack('<' + 'h' * len(all_samples), *all_samples))
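# The gain helper imported above is not shown in this snippet; a minimal
# sketch consistent with how it is called (a hypothetical implementation that
# scales every sample by a constant volume factor):
def gain(samples, factor):
    """Return a copy of `samples` with each sample scaled by `factor`."""
    return [sample * factor for sample in samples]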
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
      - data_name: letter or spam
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
      - rmse: Root Mean Squared Error
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    print()

    mi_data = miss_data_x.astype(float)
    no, dim = imputed_data_x.shape
    miss_data = np.reshape(mi_data, (no, dim))
    np.savetxt("data/missing_data.csv", mi_data, delimiter=',', fmt='%1.2f')
    print('Shape of miss data: ', miss_data.shape)
    print('Save results in missing_data.csv')

    print()
    print('=== GAIN RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse, 6)))
    # print('Output file shape: ', imputed_data_x.shape)
    np.savetxt("data/imputed_data.csv", imputed_data_x, delimiter=',', fmt='%d')
    print('Save results in imputed_data.csv')

    # MissForest
    print()
    print('=== MissForest RMSE ===')
    data = miss_data_x
    imp_mean = MissForest(max_iter=5)
    miss_f = imp_mean.fit_transform(data)
    # miss_f = pd.DataFrame(imputed_train_df)
    rmse_MF = rmse_loss(ori_data_x, miss_f, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MF, 6)))
    np.savetxt("data/imputed_data_MF.csv", miss_f, delimiter=',', fmt='%d')
    print('Save results in imputed_data_MF.csv')

    # MICE from autoimpute
    print()
    print('=== MICE of Auto Impute RMSE ===')
    data_mice = pd.DataFrame(miss_data_x)
    mi = MiceImputer(k=1, imp_kwgs=None, n=1, predictors='all', return_list=True,
                     seed=None, strategy='default predictive', visit='default')
    mice_out = mi.fit_transform(data_mice)
    c = [list(x) for x in mice_out]
    c1 = c[0]
    c2 = c1[1]
    c3 = np.asarray(c2)
    mice_x = c3
    # print('here :', mice_x, miss_f, miss_f.shape)
    rmse_MICE = rmse_loss(ori_data_x, mice_x, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MICE, 6)))
    np.savetxt("data/imputed_data_MICE.csv", mice_x, delimiter=',', fmt='%d')
    print('Save results in imputed_data_MICE.csv')

    return imputed_data_x, rmse
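# The c/c1/c2/c3 unpacking above reaches into autoimpute's return value.
# With return_list=True, MiceImputer.fit_transform appears to return a list
# of (imputation_index, imputed_DataFrame) tuples, which is the structure the
# original indexing already relies on; assuming that, the same result is:
#
#   mice_x = np.asarray(mice_out[0][1])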
for k in range(0, input_array_tmp.shape[0]):
    n_filled = np.sum([1 for x in input_array_tmp[k, :] if x is not None])
    n_total = input_array_tmp.shape[1]
    print(n_filled / n_total)
    # Skip rows that are less than half filled
    if n_filled / n_total < 0.5:
        continue
    input_array.append(input_array_tmp[k, :])

input_array = np.asarray(input_array)
input_shape = input_array.shape
print('Feature array shape: %s' % str(feature_shape))
print('Input shape for imputer: %s' % str(input_shape))

imputed_array = gain(input_array.astype(float), gain_parameters)

# fancyimpute comparison (the transformed result is not stored)
from fancyimpute import KNN
imp_mean = KNN(5)  # IterativeImputer()
imp_mean.fit_transform(input_array)
# imp_mean.transform(input_array)

df = pd.DataFrame(imputed_array)
plt.matshow(df.corr())
# plt.title('Correlations between laboratory parameters over time', y=-0.01)
plt.xticks(range(0, len(feature_rows)), feature_rows, rotation='vertical', fontsize='6')
plt.yticks(range(0, len(feature_rows)), feature_rows, fontsize='6')
plt.gca().xaxis.set_ticks_position('top')
plt.colorbar()
def main():
    # data_names = ['letter', 'spam','credit','breast']
    # data_names = ['spam', 'letter','breast','banknote']
    data_names = ['parkinsons']
    # data_names = ['breast','banknote','connectionistvowel']
    # data_names = ['connectionistvowel']
    # data_names = ['parkinsons','seedst']
    # data_names = ['breasttissue','glass', 'thyroid']
    # data_names = ['credit', 'breast', 'balance','banknote','blood','climate','connectionistvowel',
    #               'ecoli','glass','hillvalley','ionosphere', 'parkinsons','planning','seedst',
    #               'thyroid','vehicle','vertebral','wine','yeast']
    # data_names = ['balance','banknote','blood','connectionistvowel','vehicle','yeast']

    miss_rate = 0.1
    batch_size = 128
    alpha = 100
    iterations = 10000
    n_times = 30
    gain_parameters = {
        'batch_size': batch_size,
        'alpha': alpha,
        'iterations': iterations
    }

    wb = xlwt.Workbook()
    sh_dct_mgain = wb.add_sheet("DCT_mgain")
    sh_mlp_mgain = wb.add_sheet("MLP_mgain")
    sh_dct_gain = wb.add_sheet("DCT_gain")
    sh_mlp_gain = wb.add_sheet("MLP_gain")

    for k in range(len(data_names)):
        data_name = data_names[k]
        sh_dct_mgain.write(0, k, data_name)
        sh_mlp_mgain.write(0, k, data_name)
        sh_dct_gain.write(0, k, data_name)
        sh_mlp_gain.write(0, k, data_name)
        print("Dataset: ", data_name)

        ori_data_x, y = data_loader(data_name)
        train_idx, test_idx = train_test_split(range(len(y)), test_size=0.3,
                                               stratify=y, random_state=42)

        for i in tqdm(range(n_times)):
            miss_data_x, m = make_missing_data(ori_data_x, miss_rate, seed=i)

            # Impute missing data:
            # 20 imputed datasets for MGAIN, 1 imputed dataset for GAIN
            imputed_data_xs = gain(miss_data_x, gain_parameters, n_times=21)
            imputed_data_xs = np.array(imputed_data_xs)

            # Normalize data
            imputed_data_xs = np.array(
                [normalization(data)[0] for data in imputed_data_xs])

            # Classify
            num_cores = multiprocessing.cpu_count()

            # Training with MLP:
            # 20 classifiers for ensemble learning (MGAIN), 1 for GAIN
            clfs = Parallel(n_jobs=num_cores)(
                delayed(train_MLP)(imputed_data_xs[i][train_idx], y[train_idx])
                for i in range(imputed_data_xs.shape[0]))

            # Testing
            # MGAIN
            x_test = np.mean(imputed_data_xs[1:, test_idx, :], axis=0)
            ys = Parallel(n_jobs=num_cores)(delayed(predict)(clf, x_test)
                                            for clf in clfs[1:])
            ys = np.array(ys)
            # Voting
            y_test = [
                max(set(list(ys[:, i])), key=list(ys[:, i]).count)
                for i in range(ys.shape[1])
            ]
            score = accuracy_score(y[test_idx], y_test)
            sh_mlp_mgain.write(i + 1, k, np.round(score, 4))

            score = accuracy_score(y[test_idx],
                                   predict(clfs[0], imputed_data_xs[0][test_idx]))
            sh_mlp_gain.write(i + 1, k, np.round(score, 4))

            # # Training with LR:
            # # 20 classifiers for ensemble learning (MGAIN), 1 for GAIN
            # clfs = Parallel(n_jobs=num_cores)(delayed(train_LR)(imputed_data_xs[i][train_idx], y[train_idx])
            #                                   for i in range(imputed_data_xs.shape[0]))
            #
            # # Testing
            # # MGAIN
            # x_test = np.mean(imputed_data_xs[1:, test_idx, :], axis=0)
            # ys = Parallel(n_jobs=num_cores)(delayed(predict)(clf, x_test)
            #                                 for clf in clfs[1:])
            # ys = np.array(ys)
            # # Voting
            # y_test = [max(set(list(ys[:, i])), key=list(ys[:, i]).count) for i in range(ys.shape[1])]
            # score = accuracy_score(y[test_idx], y_test)
            # sh_lr_mgain.write(i + 1, k, np.round(score, 4))
            #
            # score = accuracy_score(y[test_idx], predict(clfs[0], imputed_data_xs[0][test_idx]))
            # sh_lr_gain.write(i + 1, k, np.round(score, 4))

            # Training with DCT:
            # 20 classifiers for ensemble learning (MGAIN), 1 for GAIN
            clfs = Parallel(n_jobs=num_cores)(
                delayed(train_DCT)(imputed_data_xs[i][train_idx], y[train_idx])
                for i in range(imputed_data_xs.shape[0]))

            # Testing
            # MGAIN
            x_test = np.mean(imputed_data_xs[1:, test_idx, :], axis=0)
            ys = Parallel(n_jobs=num_cores)(delayed(predict)(clf, x_test)
                                            for clf in clfs[1:])
            ys = np.array(ys)
            # Voting
            y_test = [
                max(set(list(ys[:, i])), key=list(ys[:, i]).count)
                for i in range(ys.shape[1])
            ]
            score = accuracy_score(y[test_idx], y_test)
            sh_dct_mgain.write(i + 1, k, np.round(score, 4))

            score = accuracy_score(y[test_idx],
                                   predict(clfs[0], imputed_data_xs[0][test_idx]))
            sh_dct_gain.write(i + 1, k, np.round(score, 4))

    wb.save("./final_results/Mgain_10_parkinsons.xls")
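# The "voting" comprehension above returns the most frequent predicted label
# per test sample. A hedged, equivalent formulation with the standard library
# (with the same caveat that ties are broken arbitrarily):
from collections import Counter

y_test = [Counter(ys[:, i]).most_common(1)[0][0] for i in range(ys.shape[1])]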
def main(alpha=1000, batch_size=128, hint_rate=0.5, iterations=900, miss_rate=0.3):
    gain_parameters = {
        'batch_size': batch_size,
        'hint_rate': hint_rate,
        'alpha': alpha,
        'iterations': iterations
    }

    # Load data and introduce missingness
    # file_name = 'data/spam.csv'
    # data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    enable_transform = False
    remove_outliers = False
    n_time_points = 3

    data_x = pickle.load(open('./missing_data.sav', 'rb'))
    data_x = data_x.transpose().astype(float)[:, :]
    print(data_x.shape)
    # if remove_outliers:
    #     data_x = pickle.load(open('./missing_data.sav', 'rb'))
    #     data_x = data_x.transpose().astype(float)
    # else:
    #     data_x = pickle.load(open('./denoised_missing_data.sav', 'rb'))

    signed_variables = ['base_excess']
    no, dim = data_x.shape

    data_x_encoded = np.copy(data_x)
    miss_data_x = np.copy(data_x)
    miss_data_x_enc = np.copy(data_x)
    scalers = []

    for i in range(0, dim):
        variable, var_x = variables[i], np.copy(data_x[:, i])
        encoder_model = encoders[i]

        # Exclude outliers based on reconstruction error
        nn_indices = ~np.isnan(data_x_encoded[:, i])
        nn_values = data_x[:, i][nn_indices]

        scaler = MinMaxScaler()
        var_x_scaled = scaler.fit_transform(var_x.reshape((-1, 1)))
        enc_x_scaled = encoder_model.predict(var_x_scaled)
        enc_x_unscaled = scaler.inverse_transform(enc_x_scaled)
        data_x_encoded[:, i] = enc_x_unscaled.flatten()
        scalers.append(scaler)

        if remove_outliers:
            print('Excluding outliers...')
            mse = np.mean(np.power(var_x.reshape((-1, 1)) - enc_x_unscaled, 2), axis=1)
            x = np.ma.array(mse, mask=np.isnan(mse))
            y = np.ma.array(var_x, mask=np.isnan(var_x))
            outlier_indices = (x / np.max(y)) > 2
            outlier_values = var_x[outlier_indices]
            print('... %d outlier(s) excluded' % len(outlier_values), outlier_values)
            miss_data_x[outlier_indices == True, i] = np.nan
            miss_data_x_enc[outlier_indices == True, i] = np.nan
        # print(var_x, '----', enc_x_scaled, '----', enc_x_unscaled.flatten())
        print('Loaded model for %s...' % variable)

    no_total = no * dim
    no_nan = np.count_nonzero(np.isnan(data_x.flatten()))
    no_not_nan = no_total - no_nan
    print('Input shape', no, 'x', dim)
    print('NAN values:', no_nan, '/', no_total,
          '%2.f%%' % (no_nan / no_total * 100))
    n_patients = int(no / n_time_points)

    if len(variables) != dim:
        print(len(variables), dim)
        print('Incompatible dimensions.')
        exit()

    if enable_transform:
        print('Applying transformation...')
        transformer = MinMaxScaler()
        transformer.fit(data_x)
        # data_x = transformer.transform(data_x)
        # miss_data_x = transformer.transform(miss_data_x)
        miss_data_x_enc = transformer.transform(data_x_encoded)

    # Introduce missing data
    data_m = binary_sampler(1 - miss_rate, no, dim)
    miss_data_x[data_m == 0] = np.nan
    miss_data_x_enc[data_m == 0] = np.nan

    no_nan = np.count_nonzero(np.isnan(miss_data_x.flatten()))
    no_not_nan = no_total - no_nan
    print('After removal, NAN values:', no_nan, '/', no_total,
          '%2.f%%' % (no_nan / no_total * 100))
    real_miss_rate = (no_nan / no_total * 100)

    imputed_data_x_gan = gain(miss_data_x_enc, gain_parameters)

    # n_gans = 3
    # idxg_combined = []
    #
    # for n_gan in range(0, n_gans):
    #     np.random.seed(n_gan + 1)
    #     idxg_combined.append(gain(miss_data_x_enc, gain_parameters))
    #
    # idxg_combined = np.concatenate(idxg_combined)
    #
    # idxg_combined_final = gain(miss_data_x_enc, gain_parameters)
    #
    # for j in range(0, dim):
    #     idxg_combined_tmp = np.copy(idxg_combined)
    #
    #     for i in range(0, n_patients * n_time_points):
    #         if np.isnan(miss_data_x[i, j]) and data_m[i, j] != 0:
    #             idxg_combined_tmp[i, j] = np.nan
    #
    #     imputer = IterativeImputer()  # KNNImputer(n_neighbors=5)
    #     idxg_knn = imputer.fit_transform(idxg_combined_tmp)
    #     idxg_combined_final[:, j] = idxg_knn[0:n_patients * n_time_points, j]
    #     print('Done KNN imputation #%d' % j)
    #
    # imputed_data_x_gan = idxg_combined_final

    imputer = KNNImputer(n_neighbors=5)
    imputed_data_x_knn = imputer.fit_transform(miss_data_x)
    imputer = IterativeImputer()
    imputed_data_x_mice = imputer.fit_transform(miss_data_x)

    if enable_transform:
        # data_x = transformer.inverse_transform(data_x)
        # miss_data_x = transformer.inverse_transform(miss_data_x)
        imputed_data_x_gan = transformer.inverse_transform(imputed_data_x_gan)
        # imputed_data_x_knn = transformer.inverse_transform(imputed_data_x_knn)
        # imputed_data_x_mice = transformer.inverse_transform(imputed_data_x_mice)

    # Save imputed data to disk
    pickle.dump(imputed_data_x_gan, open('./filled_data.sav', 'wb'))

    # Get residuals for computation of stats
    distances_gan = np.zeros((dim, n_time_points * n_patients))
    distances_knn = np.zeros((dim, n_time_points * n_patients))
    distances_mice = np.zeros((dim, n_time_points * n_patients))

    for i in range(0, n_patients):
        for j in range(0, dim):
            variable_name = variables[j]
            i_start = int(i * n_time_points)
            i_stop = int(i * n_time_points + n_time_points)
            original_tuple = data_x[i_start:i_stop, j]
            corrupted_tuple = miss_data_x[i_start:i_stop, j]
            imputed_tuple_gan = imputed_data_x_gan[i_start:i_stop, j]
            imputed_tuple_knn = imputed_data_x_knn[i_start:i_stop, j]
            imputed_tuple_mice = imputed_data_x_mice[i_start:i_stop, j]
            if i == 1 or i == 2:
                print(original_tuple, corrupted_tuple, imputed_tuple_gan, imputed_tuple_knn)
            for k in range(0, n_time_points):
                a, b, c, d = (original_tuple[k], imputed_tuple_gan[k],
                              imputed_tuple_knn[k], imputed_tuple_mice[k])
                if np.isnan(a) or data_m[i_start + k, j] != 0:
                    continue
                if i % 10 == 0:
                    print(variable_name, a, b, c, d, b - a)
                # Index by i * n_time_points + k; the original i * k collided
                # (k == 0 always hit slot 0) and overwrote residuals.
                distances_gan[j, i * n_time_points + k] = b - a
                distances_knn[j, i * n_time_points + k] = c - a
                distances_mice[j, i * n_time_points + k] = d - a

    # Compute distance statistics
    rrmses_gan, mean_biases, median_biases, bias_cis = [], [], [], []
    rrmses_knn, mean_biases_knn, median_biases_knn, bias_cis_knn = [], [], [], []
    rrmses_mice = []

    for j in range(0, dim):
        # Stats for original data
        dim_mean = np.mean([x for x in data_x[:, j] if not np.isnan(x)])
        dim_max = np.max([x for x in data_x[:, j] if not np.isnan(x)])

        dists_gan = distances_gan[j]
        dists_knn = distances_knn[j]
        dists_mice = distances_mice[j]
        # dists_gan /= dim_max
        # dists_knn /= dim_max
        # dists_mice /= dim_max

        # Stats for GAN
        mean_bias = np.round(np.mean(dists_gan), 4)
        median_bias = np.round(np.median(dists_gan), 4)
        mean_ci_95 = mean_confidence_interval(dists_gan)
        rmse = np.sqrt(np.mean(dists_gan ** 2))
        rrmse = np.round(rmse / dim_mean * 100, 2)
        bias_cis.append([mean_ci_95[1], mean_ci_95[2]])
        mean_biases.append(mean_bias)
        median_biases.append(median_bias)
        rrmses_gan.append(rrmse)

        # Stats for KNN (appended, not overwritten, so every variable counts)
        rmse_knn = np.sqrt(np.mean(dists_knn ** 2))
        rrmses_knn.append(np.round(rmse_knn / dim_mean * 100, 2))

        # Stats for MICE
        rmse_mice = np.sqrt(np.mean(dists_mice ** 2))
        rrmses_mice.append(np.round(rmse_mice / dim_mean * 100, 2))

        print(variables[j], ' - rrmse: ', rrmse, 'median bias: %.2f' % median_bias,
              '%%, bias: %.2f (95%% CI, %.2f to %.2f)' % mean_ci_95)

    n_fig_rows = 6
    n_fig_cols = 6
    n_fig_total = n_fig_rows * n_fig_cols
    if dim > n_fig_total:
        print('Warning: not all variables plotted')

    fig, axes = plt.subplots(n_fig_rows, n_fig_cols, figsize=(15, 15))
    fig2, axes2 = plt.subplots(n_fig_rows, n_fig_cols, figsize=(15, 15))

    for j in range(0, dim):
        ax_title = variables[j]
        ax = axes[int(j / n_fig_cols), j % n_fig_cols]
        ax2 = axes2[int(j / n_fig_cols), j % n_fig_cols]
        ax.set_title(ax_title, fontdict={'fontsize': 6})

        input_arrays = [data_x, imputed_data_x_gan, imputed_data_x_knn, imputed_data_x_mice]
        output_arrays = [
            np.asarray([input_arr[ii, j] for ii in range(0, no)
                        if (not np.isnan(data_x[ii, j]) and data_m[ii, j] == 0)])
            for input_arr in input_arrays
        ]
        deleted_values, imputed_values_gan, imputed_values_knn, imputed_values_mice = output_arrays

        # Make KDE
        low_ci, high_ci = bias_cis[j]
        xlabel = 'mean bias = %.2f (95%% CI, %.2f to %.2f)' % (mean_biases[j], low_ci, high_ci)
        ax.set_xlabel(xlabel, fontsize=6)
        ax.set_ylabel('$p(x)$', fontsize=6)

        range_arrays = np.concatenate([deleted_values, imputed_values_gan])
        x_range = (np.min(range_arrays),
                   np.min([np.mean(range_arrays) + 3 * np.std(range_arrays),
                           np.max(range_arrays)]))
        kde_kws = {'shade': False, 'bw': 'scott', 'clip': x_range}
        sns.distplot(imputed_values_gan, hist=False,
                     kde_kws={**{'color': 'r'}, **kde_kws}, ax=ax)
        sns.distplot(imputed_values_knn, hist=False,
                     kde_kws={**{'color': 'b', 'alpha': 0.5}, **kde_kws}, ax=ax)
        sns.distplot(imputed_values_mice, hist=False,
                     kde_kws={**{'color': 'g', 'alpha': 0.5}, **kde_kws}, ax=ax)
        sns.distplot(deleted_values, hist=False,
                     kde_kws={**{'color': '#000000'}, **kde_kws}, ax=ax)

        # Make QQ plot
        qqplot(deleted_values, imputed_values_gan, ax=ax2, color='r')
        qqplot(deleted_values, imputed_values_knn, ax=ax2, color='b')
        qqplot(deleted_values, imputed_values_mice, ax=ax2, color='g')

    top_title = 'KDE plot of original data (black) and data imputed using GAN (red) and KNN (blue)'
    fig.suptitle(top_title, fontsize=8)
    fig.legend(labels=['GAN', 'KNN', 'MICE', 'Observed'])
    fig.tight_layout(rect=[0, 0.03, 1, 1.25])
    fig.subplots_adjust(hspace=1, wspace=0.35)

    top_title = 'Q-Q plot of observed vs. predicted values'
    fig2.suptitle(top_title, fontsize=8)
    fig2.tight_layout(rect=[0, 0.03, 1, 1.25])
    fig2.subplots_adjust(hspace=1, wspace=0.35)
    plt.show()

    print()
    mrrmse_gan = np.round(np.asarray(rrmses_gan).mean(), 2)
    print('Average RMSE (GAN): ', mrrmse_gan, '%')
    print()
    mrrmse_knn = np.round(np.asarray(rrmses_knn).mean(), 2)
    print('Average RMSE (KNN): ', mrrmse_knn, '%')
    print()
    mrrmse_mice = np.round(np.asarray(rrmses_mice).mean(), 2)
    print('Average RMSE (MICE): ', mrrmse_mice, '%')

    return real_miss_rate, mrrmse_gan, mrrmse_knn, mrrmse_mice