def gain(data_x, gain_parameters):
  # Define mask matrix
  data_m = 1 - np.isnan(data_x)

  # System parameters
  batch_size = gain_parameters['batch_size']
  hint_rate = gain_parameters['hint_rate']
  iterations = gain_parameters['iterations']

  # Hyperparameters
  alpha, beta, delta, gamma = 10, 0.01, 0.1, 0.5

  # Other parameters
  no, dim = data_x.shape

  # Hidden state dimensions
  h_dim = int(dim)

  # Normalization
  norm_data, norm_parameters = normalization(data_x)
  norm_data_x = np.nan_to_num(norm_data, 0)

  ## GAIN architecture
  # Input placeholders
  X_dim = 99  # Decoder output width; must equal dim for the decoder output to feed the generator
  z_dim = 60
  noise_factor = 0.25
  dropout = tf.placeholder(tf.int32, shape=[1])
  # Data vector
  X = tf.placeholder(tf.float32, shape=[None, dim])
  z = tf.placeholder(tf.float32, shape=[None, z_dim])
  # Encoded vector
  X_e = tf.placeholder(tf.float32, shape=(None, dim))
  # Mask vector
  M = tf.placeholder(tf.float32, shape=[None, dim])
  # Hint vector
  H = tf.placeholder(tf.float32, shape=[None, dim])

  """ Q(z|X) """
  Q_W1 = tf.Variable(xavier_init([dim, h_dim]))
  Q_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
  Q_W2_mu = tf.Variable(xavier_init([h_dim, z_dim]))
  Q_b2_mu = tf.Variable(tf.zeros(shape=[z_dim]))
  Q_W2_sigma = tf.Variable(xavier_init([h_dim, z_dim]))
  Q_b2_sigma = tf.Variable(tf.zeros(shape=[z_dim]))

  """ P(X|z) """
  P_W1 = tf.Variable(xavier_init([z_dim, h_dim]))
  P_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
  P_W2 = tf.Variable(xavier_init([h_dim, X_dim]))
  P_b2 = tf.Variable(tf.zeros(shape=[X_dim]))

  theta_E = [Q_W1, Q_b1, Q_W2_mu, Q_b2_mu, Q_W2_sigma, Q_b2_sigma,
             P_W1, P_b1, P_W2, P_b2]

  # Discriminator variables
  D_W1 = tf.Variable(xavier_init([dim*2, h_dim]))  # Data + Hint as inputs
  D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
  D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
  D_W3 = tf.Variable(xavier_init([h_dim, dim]))
  D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

  theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

  # Generator variables
  # Data + Mask as inputs (random noise is in missing components)
  G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))
  G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
  G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
  G_W3 = tf.Variable(xavier_init([h_dim, dim]))
  G_b3 = tf.Variable(tf.zeros(shape=[dim]))

  theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

  ## VAE functions
  def encoder(X):
    h = tf.nn.relu(tf.matmul(X, Q_W1) + Q_b1)
    z_mu = tf.matmul(h, Q_W2_mu) + Q_b2_mu
    z_logvar = tf.matmul(h, Q_W2_sigma) + Q_b2_sigma
    return z_mu, z_logvar

  def decoder(z):
    h = tf.nn.relu(tf.matmul(z, P_W1) + P_b1)
    logits = tf.matmul(h, P_W2) + P_b2
    prob = tf.nn.sigmoid(logits)
    return prob, logits

  def sample_z(mu, log_var):
    eps = tf.random_normal(shape=tf.shape(mu))
    return mu + tf.exp(log_var / 2) * eps

  ## GAIN functions
  # Generator
  def generator(x, m, use_dropout):
    # Concatenate Mask and Data
    inputs = tf.concat(values=[x, m], axis=1)
    G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
    if use_dropout:
      G_h1 = tf.nn.dropout(G_h1, rate=0.5)
    G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
    if use_dropout:
      G_h2 = tf.nn.dropout(G_h2, rate=0.5)
    G_h3 = tf.matmul(G_h2, G_W3) + G_b3
    G_prob = tf.nn.sigmoid(G_h3)
    return G_prob

  # Discriminator
  def discriminator(x, h, use_dropout):
    # Concatenate Data and Hint
    inputs = tf.concat(values=[x, h], axis=1)
    D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
    if use_dropout:
      D_h1 = tf.nn.dropout(D_h1, rate=0.5)
    D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
    if use_dropout:
      D_h2 = tf.nn.dropout(D_h2, rate=0.5)
    D_h3 = tf.matmul(D_h2, D_W3) + D_b3
    #D_prob = tf.nn.sigmoid(D_h3)
    return D_h3  # Raw outputs (no sigmoid)

  # Note: this comparison is resolved at graph-construction time, so the later
  # reassignments of `dropout` have no effect on the already-built graph.
  if tf.reduce_sum(dropout) == 1:
    use_dropout = True
  else:
    use_dropout = False
    noise_factor = 0

  # Encoder (denoising: corrupt the input before encoding)
  X_noise = X + noise_factor * tf.random_normal(tf.shape(X))
  X_noise = tf.clip_by_value(X_noise, 0., 1.)
  z_mu, z_logvar = encoder(X_noise)
  z_sample = sample_z(z_mu, z_logvar)
  X_e, logits = decoder(z_sample)  # Overwrites the X_e placeholder defined above

  # E[log P(X|z)]
  recon_loss = tf.reduce_sum(
      tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=X), 1)
  # D_KL(Q(z|X_noise) || P(z)); calculated in closed form as both distributions are Gaussian
  kl_loss = gamma * tf.reduce_sum(
      tf.exp(z_logvar) + z_mu**2 - 1. - z_logvar, 1)
  # VAE loss (kl_loss is computed but not added to the objectives below)

  # Generator
  G_sample = generator(X, M, use_dropout)
  G_sample_reg = generator(X_e, M, use_dropout)

  # Combine with observed data
  Hat_X = X * M + G_sample * (1-M)
  Hat_X_reg = X * M + G_sample_reg * (1-M)

  # Discriminator
  D_prob = discriminator(Hat_X, H, use_dropout)
  D_prob_reg = tf.nn.sigmoid(discriminator(Hat_X_reg, H, use_dropout))

  ## GAIN loss
  E_loss_temp = tf.reduce_mean(recon_loss) * beta + \
                tf.reduce_mean(tf.math.log(D_prob_reg + 1e-8))
  D_loss_temp = -tf.reduce_mean(M * D_prob + (1-M) * (1-D_prob))
  G_loss_temp = -tf.reduce_mean((1-M) * D_prob)

  X_true = M * X
  X_pred = M * G_sample

  # Mean absolute error over observed entries (kept under the MSE_loss name)
  MSE_loss = tf.reduce_mean(tf.math.abs(M * X - M * G_sample)) / tf.reduce_mean(M)
  Hu_loss = tf.reduce_mean(tf.keras.losses.Huber()(X_true, X_pred))
  KL_loss = tf.reduce_mean(
      tf.keras.losses.kullback_leibler_divergence(X_true, X_pred))

  D_loss = D_loss_temp  # + 0.001 * G_loss_temp
  G_loss = G_loss_temp + E_loss_temp + alpha * MSE_loss  #+ delta * tf.math.abs(KL_loss)
  E_loss = E_loss_temp  #+ 0.001 * G_loss_temp

  ## GAIN solver
  E_solver = tf.train.AdamOptimizer(learning_rate=0.00005,
                                    beta1=0.5).minimize(E_loss, var_list=theta_E)
  D_solver = tf.train.RMSPropOptimizer(learning_rate=0.000001).minimize(
      D_loss, var_list=theta_D)
  G_solver = tf.train.RMSPropOptimizer(learning_rate=0.00002).minimize(
      G_loss, var_list=theta_G)

  ## Iterations
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())

  losses = {'E': [], 'D': [], 'G': [], 'K-L': [], 'MSE': [], 'Hu': []}
  dropout = tf.constant(1)  # No effect on the built graph (see note above)

  # Start Iterations
  for it in tqdm(range(iterations)):

    # Get batch coordinates
    batch_idx = sample_batch_index(no, batch_size)
    # Get (normalized) data at coordinates
    X_mb = norm_data_x[batch_idx, :]
    # Get auxiliary (missingness) matrix
    M_mb = data_m[batch_idx, :]
    # Generate random uniform noise (batch_size x dim)
    Z_mb = uniform_sampler(0, 0.01, batch_size, dim, True)
    # Sample hint vectors
    H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
    H_mb = M_mb * H_mb_temp

    # Mask * Data + (1 - Mask) * Random
    X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

    _, E_loss_curr = sess.run([E_solver, E_loss_temp],
                              feed_dict={M: M_mb, X: X_mb, H: H_mb})
    _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                              feed_dict={M: M_mb, X: X_mb, H: H_mb})
    _, G_loss_curr, MSE_loss_curr, KL_loss_curr, Hu_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss, KL_loss, Hu_loss],
                 feed_dict={X: X_mb, M: M_mb, H: H_mb})

    if it % 20 == 0:
      losses['E'].append(E_loss_curr)
      losses['D'].append(D_loss_curr)
      losses['G'].append(G_loss_curr * 5)
      losses['MSE'].append(MSE_loss_curr * alpha)
      print('Iteration: %d, encoder: %.3f, discriminator: %.3f, generator: %.3f, MSE: %.3f' %
            (it, E_loss_curr, D_loss_curr, G_loss_curr, MSE_loss_curr))
      if MSE_loss_curr < 0.019:
        break

  ## Return imputed data
  Z_mb = uniform_sampler(0, 0.01, no, dim, False)
  M_mb = data_m
  X_mb = norm_data_x
  X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

  dropout = tf.constant(0)  # No effect on the built graph (see note above)

  imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
  imputed_data = data_m * norm_data_x + (1-data_m) * imputed_data

  # Renormalization
  imputed_data = renormalization(imputed_data, norm_parameters)

  # Rounding
  imputed_data = rounding(imputed_data, data_x)

  import matplotlib.pyplot as plt
  plt.title('Encoder, generator, and discriminator losses over time')
  plt.plot(losses['E'], label='Encoder', lw=2, alpha=0.5)
  plt.plot(losses['G'], label='Generator', lw=2, alpha=0.5)
  plt.plot(losses['D'], label='Discriminator', lw=2, alpha=0.5)
  #plt.plot(losses['K-L'], label='K-L', lw=1)
  plt.plot(losses['MSE'], label='MSE', lw=2, alpha=0.5)
  #plt.plot(losses['Hu'], label='Huber', lw=1)
  plt.xlabel('Number of training epochs', fontsize=6)
  plt.legend()
  ax = plt.gca()
  plt.show()

  return imputed_data
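# --- Hedged sketch (not part of the original listing) ---
# The training loops above rely on sampling helpers that are not shown here.
# The definitions below are a minimal sketch inferred from the call sites, in the
# style of the GAIN reference utilities; the extra boolean accepted by
# uniform_sampler is an assumption (its purpose is not visible in this listing)
# and is simply ignored.
import numpy as np

def sample_batch_index(total, batch_size):
  # Random subset of row indices for one mini-batch
  return np.random.permutation(total)[:batch_size]

def uniform_sampler(low, high, rows, cols, *_unused_flags):
  # Uniform noise used to fill missing entries before feeding the generator
  return np.random.uniform(low, high, size=(rows, cols))

def binary_sampler(p, rows, cols):
  # Binary matrix with P(entry == 1) = p, used for hints and synthetic missingness
  return (np.random.uniform(0., 1., size=(rows, cols)) < p).astype(float)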
def gain(data_x, gain_parameters):
  '''Impute missing values in data_x

  Args:
    - data_x: original data with missing values
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations

  Returns:
    - imputed_data: imputed data
  '''
  # Define mask matrix
  data_m = 1 - np.isnan(data_x)

  # System parameters
  batch_size = gain_parameters['batch_size']
  hint_rate = gain_parameters['hint_rate']
  alpha = gain_parameters['alpha']
  iterations = gain_parameters['iterations']

  # Other parameters
  no, dim = data_x.shape

  # Hidden state dimensions
  h_dim = int(dim)

  # Normalization
  norm_data, norm_parameters = normalization(data_x)
  norm_data_x = np.nan_to_num(norm_data, 0)

  ## GAIN architecture
  # Input placeholders
  # Data vector
  X = tf.placeholder(tf.float32, shape=[None, dim])
  # Mask vector
  M = tf.placeholder(tf.float32, shape=[None, dim])
  # Hint vector
  H = tf.placeholder(tf.float32, shape=[None, dim])

  # Discriminator variables
  D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
  D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
  D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
  D_W3 = tf.Variable(xavier_init([h_dim, dim]))
  D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

  theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

  # Generator variables
  # Data + Mask as inputs (random noise is in missing components)
  G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
  G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
  G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
  G_W3 = tf.Variable(xavier_init([h_dim, dim]))
  G_b3 = tf.Variable(tf.zeros(shape=[dim]))

  theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

  ## GAIN functions
  # Generator
  def generator(x, m):
    # Concatenate Mask and Data
    inputs = tf.concat(values=[x, m], axis=1)
    G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
    G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
    G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
    return G_prob

  # Discriminator
  def discriminator(x, h):
    # Concatenate Data and Hint
    inputs = tf.concat(values=[x, h], axis=1)
    D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
    D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
    #D_h2 = tf.nn.dropout(D_h2, rate=0.3)
    D_logit = tf.matmul(D_h2, D_W3) + D_b3  # Output layer was missing in the original listing
    D_prob = tf.nn.sigmoid(D_logit)
    return D_prob

  ## GAIN structure
  # Generator
  G_sample = generator(X, M)

  # Combine with observed data
  Hat_X = X * M + G_sample * (1 - M)

  # Discriminator
  D_prob = discriminator(Hat_X, H)

  ## GAIN loss
  D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                + (1-M) * tf.log(1. - D_prob + 1e-8))
  G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))

  X_true = M * X
  X_pred = M * G_sample

  MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
  Hu_loss = tf.reduce_mean(tf.keras.losses.Huber()(X_true, X_pred))
  KL_loss = tf.reduce_mean(
      tf.keras.losses.kullback_leibler_divergence(X_true, X_pred))

  D_loss = D_loss_temp

  alpha, beta, delta = 5, 0.05, 10  # Overrides the alpha read from gain_parameters

  G_loss = G_loss_temp + alpha * MSE_loss + beta * KL_loss  #.sqrt(MSE_loss)

  ## GAIN solver
  D_solver = tf.train.AdamOptimizer(learning_rate=0.0002,
                                    beta1=0.1).minimize(D_loss, var_list=theta_D)
  G_solver = tf.train.AdamOptimizer(learning_rate=0.0001,
                                    beta1=0.1).minimize(G_loss, var_list=theta_G)

  ## Iterations
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())

  losses = {'D': [], 'G': [], 'K-L': [], 'MSE': [], 'Hu': []}

  # Start Iterations
  for it in tqdm(range(iterations)):

    # Get batch coordinates
    batch_idx = sample_batch_index(no, batch_size)
    # Get (normalized) data at coordinates
    X_mb = norm_data_x[batch_idx, :]
    # Get auxiliary (missingness) matrix
    M_mb = data_m[batch_idx, :]
    # Generate random uniform noise (batch_size x dim)
    Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
    # Sample hint vectors
    H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
    H_mb = M_mb * H_mb_temp

    # Mask * Data + (1 - Mask) * Random
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                              feed_dict={M: M_mb, X: X_mb, H: H_mb})
    _, G_loss_curr, MSE_loss_curr, KL_loss_curr, Hu_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss, KL_loss, Hu_loss],
                 feed_dict={X: X_mb, M: M_mb, H: H_mb})

    #if int(MSE_loss_curr * 1000) % 10 == 0:
    losses['D'].append(D_loss_curr)
    losses['G'].append(G_loss_curr)
    losses['K-L'].append(KL_loss_curr * beta)
    losses['MSE'].append(MSE_loss_curr * alpha)
    losses['Hu'].append(Hu_loss_curr * delta)

    print(it, G_loss_curr - MSE_loss_curr * alpha - KL_loss_curr * beta,
          MSE_loss_curr * alpha, KL_loss_curr * beta, G_loss_curr, MSE_loss_curr)

    if MSE_loss_curr < 0.01:
      break

  import matplotlib.pyplot as plt
  plt.plot(losses['D'], label='discriminator', lw=1)
  plt.plot(losses['G'], label='generator', lw=1)
  plt.plot(losses['K-L'], label='K-L', lw=1)
  plt.plot(losses['MSE'], label='MSE', lw=1)
  plt.plot(losses['Hu'], label='Huber', lw=1)
  plt.legend()
  plt.show()

  ## Return imputed data
  Z_mb = uniform_sampler(0, 0.01, no, dim)
  M_mb = data_m
  X_mb = norm_data_x
  X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

  imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
  imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

  # Renormalization
  imputed_data = renormalization(imputed_data, norm_parameters)

  # Rounding
  imputed_data = rounding(imputed_data, data_x)

  return imputed_data
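# --- Hedged sketch (not part of the original listing) ---
# gain() depends on normalization / renormalization / rounding helpers that are
# not shown. A minimal per-column min-max version is sketched below, following
# the conventions of the GAIN reference utilities; the helpers actually used by
# this project may differ.
import numpy as np

def normalization(data):
  # Scale each column to [0, 1], ignoring NaNs; keep parameters for the inverse
  min_val = np.nanmin(data, axis=0)
  max_val = np.nanmax(data, axis=0)
  norm_data = (data - min_val) / (max_val - min_val + 1e-6)
  return norm_data, {'min_val': min_val, 'max_val': max_val}

def renormalization(norm_data, norm_parameters):
  # Map [0, 1] values back to the original scale
  min_val, max_val = norm_parameters['min_val'], norm_parameters['max_val']
  return norm_data * (max_val - min_val + 1e-6) + min_val

def rounding(imputed_data, data_x):
  # Round columns that look categorical (few distinct observed values)
  rounded = imputed_data.copy()
  for i in range(data_x.shape[1]):
    observed = data_x[~np.isnan(data_x[:, i]), i]
    if len(np.unique(observed)) < 20:
      rounded[:, i] = np.round(rounded[:, i])
  return rounded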
def main(alpha=1000, batch_size=128, hint_rate=0.5, iterations=900, miss_rate=0.3):

  gain_parameters = {'batch_size': batch_size,
                     'hint_rate': hint_rate,
                     'alpha': alpha,
                     'iterations': iterations}

  # Load data and introduce missingness
  #file_name = 'data/spam.csv'
  #data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)

  enable_transform = False
  remove_outliers = False
  n_time_points = 3

  data_x = pickle.load(open('./missing_data.sav', 'rb'))
  data_x = data_x.transpose().astype(float)[:, :]
  print(data_x.shape)

  # if remove_outliers:
  #   data_x = pickle.load(open('./missing_data.sav', 'rb'))
  #   data_x = data_x.transpose().astype(float)
  # else:
  #   data_x = pickle.load(open('./denoised_missing_data.sav', 'rb'))

  signed_variables = ['base_excess']

  no, dim = data_x.shape

  data_x_encoded = np.copy(data_x)
  miss_data_x = np.copy(data_x)
  miss_data_x_enc = np.copy(data_x)

  scalers = []

  for i in range(0, dim):
    variable, var_x = variables[i], np.copy(data_x[:, i])
    encoder_model = encoders[i]

    # Exclude outliers based on error
    nn_indices = ~np.isnan(data_x_encoded[:, i])
    nn_values = data_x[:, i][nn_indices]

    scaler = MinMaxScaler()
    var_x_scaled = scaler.fit_transform(var_x.reshape((-1, 1)))
    enc_x_scaled = encoder_model.predict(var_x_scaled)
    enc_x_unscaled = scaler.inverse_transform(enc_x_scaled)
    data_x_encoded[:, i] = enc_x_unscaled.flatten()
    scalers.append(scaler)

    if remove_outliers:
      print('Excluding outliers...')
      mse = np.mean(np.power(var_x.reshape((-1, 1)) - enc_x_unscaled, 2), axis=1)
      x = np.ma.array(mse, mask=np.isnan(mse))
      y = np.ma.array(var_x, mask=np.isnan(var_x))
      outlier_indices = (x / np.max(y)) > 2
      outlier_values = var_x[outlier_indices]
      print('... %d outlier(s) excluded' % \
            len(outlier_values), outlier_values)
      miss_data_x[outlier_indices == True, i] = np.nan
      miss_data_x_enc[outlier_indices == True, i] = np.nan

    #print(var_x, '----', enc_x_scaled, '----', enc_x_unscaled.flatten())
    print('Loaded model for %s...' % variable)

  no_total = no * dim
  no_nan = np.count_nonzero(np.isnan(data_x.flatten()) == True)
  no_not_nan = no_total - no_nan

  print('Input shape', no, 'x', dim)
  print('NAN values:', no_nan, '/', no_total, \
        '%2.f%%' % (no_nan / no_total * 100))

  n_patients = int(no / n_time_points)

  if len(variables) != dim:
    print(len(variables), dim)
    print('Incompatible dimensions.')
    exit()

  if enable_transform:
    print('Applying transformation...')
    transformer = MinMaxScaler()
    transformer.fit(data_x)
    #data_x = transformer.transform(data_x)
    #miss_data_x = transformer.transform(miss_data_x)
    miss_data_x_enc = transformer.transform(data_x_encoded)

  # Introduce missing data
  data_m = binary_sampler(1 - miss_rate, no, dim)
  miss_data_x[data_m == 0] = np.nan
  miss_data_x_enc[data_m == 0] = np.nan

  no_nan = np.count_nonzero(np.isnan(miss_data_x.flatten()) == True)
  no_not_nan = no_total - no_nan

  print('After removal, NAN values:', no_nan, '/', no_total, \
        '%2.f%%' % (no_nan / no_total * 100))

  real_miss_rate = (no_nan / no_total * 100)

  imputed_data_x_gan = gain(miss_data_x_enc, gain_parameters)

  # n_gans = 3
  # idxg_combined = []
  #
  # for n_gan in range(0, n_gans):
  #   np.random.seed(n_gan + 1)
  #   idxg_combined.append(gain(miss_data_x_enc, gain_parameters))
  #
  # idxg_combined = np.concatenate(idxg_combined)
  #
  # idxg_combined_final = gain(miss_data_x_enc, gain_parameters)
  #
  # for j in range(0, dim):
  #   idxg_combined_tmp = np.copy(idxg_combined)
  #
  #   for i in range(0, n_patients * n_time_points):
  #     if np.isnan(miss_data_x[i,j]) and data_m[i,j] != 0:
  #       idxg_combined_tmp[i,j] = np.nan
  #
  #   imputer = IterativeImputer()  # KNNImputer(n_neighbors=5)
  #   idxg_knn = imputer.fit_transform(idxg_combined_tmp)
  #   idxg_combined_final[:,j] = idxg_knn[0:n_patients*n_time_points,j]
  #   print('Done KNN imputation #%d' % j)
  #
  # imputed_data_x_gan = idxg_combined_final

  imputer = KNNImputer(n_neighbors=5)
  imputed_data_x_knn = imputer.fit_transform(miss_data_x)

  imputer = IterativeImputer()
  imputed_data_x_mice = imputer.fit_transform(miss_data_x)

  if enable_transform:
    #data_x = transformer.inverse_transform(data_x)
    #miss_data_x = transformer.inverse_transform(miss_data_x)
    imputed_data_x_gan = transformer.inverse_transform(imputed_data_x_gan)
    #imputed_data_x_knn = transformer.inverse_transform(imputed_data_x_knn)
    #imputed_data_x_mice = transformer.inverse_transform(imputed_data_x_mice)

  # Save imputed data to disk
  pickle.dump(imputed_data_x_gan, open('./filled_data.sav', 'wb'))

  # Get residuals for computation of stats
  distances_gan = np.zeros((dim, n_time_points * n_patients))
  distances_knn = np.zeros((dim, n_time_points * n_patients))
  distances_mice = np.zeros((dim, n_time_points * n_patients))

  for i in range(0, n_patients):
    for j in range(0, dim):
      variable_name = variables[j]
      i_start = int(i * n_time_points)
      i_stop = int(i * n_time_points + n_time_points)

      original_tuple = data_x[i_start:i_stop, j]
      corrupted_tuple = miss_data_x[i_start:i_stop, j]
      imputed_tuple_gan = imputed_data_x_gan[i_start:i_stop, j]
      imputed_tuple_knn = imputed_data_x_knn[i_start:i_stop, j]
      imputed_tuple_mice = imputed_data_x_mice[i_start:i_stop, j]

      if i == 1 or i == 2:
        print(original_tuple, corrupted_tuple, imputed_tuple_gan, imputed_tuple_knn)

      for k in range(0, n_time_points):
        a, b, c, d = original_tuple[k], imputed_tuple_gan[k], \
                     imputed_tuple_knn[k], imputed_tuple_mice[k]
        if np.isnan(a) or data_m[i_start + k, j] != 0:
          continue
        if i % 10 == 0:
          print(variable_name, a, b, c, d, b - a)
        # Index each (patient, time point) pair uniquely
        distances_gan[j, i_start + k] = (b - a)
        distances_knn[j, i_start + k] = c - a
        distances_mice[j, i_start + k] = d - a

  # Compute distance statistics
  rrmses_gan, mean_biases, median_biases, bias_cis = [], [], [], []
  rrmses_knn, mean_biases_knn, median_biases_knn, bias_cis_knn = [], [], [], []
  rrmses_mice = []

  for j in range(0, dim):

    # Stats for original data
    dim_mean = np.mean([x for x in data_x[:, j] if not np.isnan(x)])
    dim_max = np.max([x for x in data_x[:, j] if not np.isnan(x)])

    dists_gan = distances_gan[j]
    dists_knn = distances_knn[j]
    dists_mice = distances_mice[j]

    #dists_gan /= dim_max
    #dists_knn /= dim_max
    #dists_mice /= dim_max

    # Stats for GAN
    mean_bias = np.round(np.mean(dists_gan), 4)
    median_bias = np.round(np.median(dists_gan), 4)
    mean_ci_95 = mean_confidence_interval(dists_gan)
    rmse = np.sqrt(np.mean(dists_gan**2))
    rrmse = np.round(rmse / dim_mean * 100, 2)

    bias_cis.append([mean_ci_95[1], mean_ci_95[2]])
    mean_biases.append(mean_bias)
    median_biases.append(median_bias)
    rrmses_gan.append(rrmse)

    # Stats for KNN
    rmse_knn = np.sqrt(np.mean(dists_knn**2))
    rrmses_knn.append(np.round(rmse_knn / dim_mean * 100, 2))

    # Stats for MICE
    rmse_mice = np.sqrt(np.mean(dists_mice**2))
    rrmses_mice.append(np.round(rmse_mice / dim_mean * 100, 2))

    print(variables[j], ' - rrmse: ', rrmse, 'median bias: %.2f' % median_bias,
          '%%, bias: %.2f (95%% CI, %.2f to %.2f)' % mean_ci_95)

  n_fig_rows = 6
  n_fig_cols = 6
  n_fig_total = n_fig_rows * n_fig_cols

  if dim > n_fig_total:
    print('Warning: not all variables plotted')

  fig, axes = plt.subplots(\
      n_fig_rows, n_fig_cols, figsize=(15, 15))
  fig2, axes2 = plt.subplots(\
      n_fig_rows, n_fig_cols, figsize=(15, 15))

  for j in range(0, dim):
    ax_title = variables[j]
    ax = axes[int(j / n_fig_cols), j % n_fig_cols]
    ax2 = axes2[int(j / n_fig_cols), j % n_fig_cols]
    ax.set_title(ax_title, fontdict={'fontsize': 6})

    input_arrays = [data_x, imputed_data_x_gan, imputed_data_x_knn, imputed_data_x_mice]
    output_arrays = [
      np.asarray([input_arr[ii, j] for ii in range(0, no) if \
        (not np.isnan(data_x[ii, j]) and \
         data_m[ii, j] == 0)])
      for input_arr in input_arrays
    ]

    deleted_values, imputed_values_gan, imputed_values_knn, imputed_values_mice = output_arrays

    # Make KDE
    low_ci, high_ci = bias_cis[j]
    xlabel = 'mean bias = %.2f (95%% CI, %.2f to %.2f)' % \
             (mean_biases[j], low_ci, high_ci)
    ax.set_xlabel(xlabel, fontsize=6)
    ax.set_ylabel('$p(x)$', fontsize=6)

    range_arrays = np.concatenate([deleted_values, imputed_values_gan])
    x_range = (np.min(range_arrays),
               np.min([
                 np.mean(range_arrays) + 3 * np.std(range_arrays),
                 np.max(range_arrays)
               ]))

    kde_kws = {'shade': False, 'bw': 'scott', 'clip': x_range}

    sns.distplot(imputed_values_gan, hist=False,
                 kde_kws={**{'color': 'r'}, **kde_kws}, ax=ax)
    sns.distplot(imputed_values_knn, hist=False,
                 kde_kws={**{'color': 'b', 'alpha': 0.5}, **kde_kws}, ax=ax)
    sns.distplot(imputed_values_mice, hist=False,
                 kde_kws={**{'color': 'g', 'alpha': 0.5}, **kde_kws}, ax=ax)
    sns.distplot(deleted_values, hist=False,
                 kde_kws={**{'color': '#000000'}, **kde_kws}, ax=ax)

    # Make QQ plot
    qqplot(deleted_values, imputed_values_gan, ax=ax2, color='r')
    qqplot(deleted_values, imputed_values_knn, ax=ax2, color='b')
    qqplot(deleted_values, imputed_values_mice, ax=ax2, color='g')

  top_title = 'KDE plot of original data (black) and data imputed using GAN (red) and KNN (blue)'
  fig.suptitle(top_title, fontsize=8)
  fig.legend(labels=['GAN', 'KNN', 'MICE', 'Observed'])
  fig.tight_layout(rect=[0, 0.03, 0, 1.25])
  fig.subplots_adjust(hspace=1, wspace=0.35)

  top_title = 'Q-Q plot of observed vs. predicted values'
  fig2.suptitle(top_title, fontsize=8)
  fig2.tight_layout(rect=[0, 0.03, 0, 1.25])
  fig2.subplots_adjust(hspace=1, wspace=0.35)

  plt.show()

  print()
  mrrmse_gan = np.round(np.asarray(rrmses_gan).mean(), 2)
  print('Average RMSE (GAN): ', mrrmse_gan, '%')

  print()
  mrrmse_knn = np.round(np.asarray(rrmses_knn).mean(), 2)
  print('Average RMSE (KNN): ', mrrmse_knn, '%')

  print()
  mrrmse_mice = np.round(np.asarray(rrmses_mice).mean(), 2)
  print('Average RMSE (MICE): ', mrrmse_mice, '%')

  return real_miss_rate, mrrmse_gan, mrrmse_knn, mrrmse_mice
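# --- Hedged sketch (not part of the original listing) ---
# main() above calls two helpers that are not shown: mean_confidence_interval,
# which must return a (mean, lower, upper) triple given how the result is
# unpacked and printed, and a two-sample qqplot(a, b, ax=, color=) helper.
# The versions below are minimal sketches consistent with those call sites,
# not the project's actual implementations.
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt

def mean_confidence_interval(data, confidence=0.95):
  # t-based confidence interval for the mean; returns (mean, lower, upper)
  a = np.asarray(data, dtype=float)
  n = len(a)
  m, se = np.mean(a), scipy.stats.sem(a)
  h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
  return m, m - h, m + h

def qqplot(x, y, ax=None, color='r', n_quantiles=100):
  # Two-sample Q-Q plot: quantiles of the imputed values (y) against
  # quantiles of the erased values (x), plus an identity reference line
  if ax is None:
    ax = plt.gca()
  q = np.linspace(0, 100, n_quantiles)
  ax.scatter(np.percentile(x, q), np.percentile(y, q), s=2, color=color)
  ax.plot([np.min(x), np.max(x)], [np.min(x), np.max(x)], lw=0.5, color='k')
  return ax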
def main(iterations=NUM_ITERATIONS, batch_size=128, hint_rate=0.5, miss_rate=0.3):

  gain_parameters = {
    'batch_size': batch_size,
    'hint_rate': hint_rate,
    'iterations': iterations
  }

  enable_transform = False
  remove_outliers = False
  n_time_points = 3

  data_x = pickle.load(open('./missing_data.sav', 'rb'))
  data_x = data_x.transpose().astype(float)[:, :]

  # Remove variables listed in `remove_variables`
  no, dim = data_x.shape
  removed = 0
  for d in range(0, dim):
    if variables[d - removed] in remove_variables:
      variables.remove(variables[d - removed])
      data_x = np.delete(data_x, d - removed, axis=1)
      removed += 1

  no, dim = data_x.shape

  if len(variables) != dim:
    print(len(variables), dim)
    print('Incompatible dimensions.')
    exit()

  no_total = no * dim
  no_nan = np.count_nonzero(np.isnan(data_x.flatten()) == True)
  no_not_nan = no_total - no_nan
  n_patients = int(no / n_time_points)

  miss_data_x = np.copy(data_x)

  print('Input shape', no, 'x', dim)
  print('NAN values:', no_nan, '/', no_total, \
        '%2.f%%' % (no_nan / no_total * 100))

  # Introduce missing data
  data_m = binary_sampler(1 - miss_rate, no, dim)
  miss_data_x[data_m == 0] = np.nan

  transformer = RobustScaler()
  miss_data_x = transformer.fit_transform(miss_data_x)

  no_nan = np.count_nonzero(np.isnan(miss_data_x.flatten()) == True)
  no_not_nan = no_total - no_nan

  print('After removal, NAN values:', no_nan, '/', no_total, \
        '%2.f%%' % (no_nan / no_total * 100))

  real_miss_rate = (no_nan / no_total * 100)

  miss_data_x_gan_tmp = np.zeros((n_patients, dim * n_time_points))

  # Swap (one row per time point) to (one column per time point)
  for i in range(0, n_patients):
    for j in range(0, dim):
      for n in range(0, n_time_points):
        miss_data_x_gan_tmp[i, n * dim + j] = miss_data_x[i * n_time_points + n, j]

  imputed_data_x_gan_tmp = gain(miss_data_x_gan_tmp, gain_parameters)
  imputed_data_x_gan = np.copy(miss_data_x)

  ## Swap (one column per time point) to (one row per time point)
  for i in range(0, n_patients):
    for j in range(0, dim):
      for n in range(0, n_time_points):
        imputed_data_x_gan[i * n_time_points + n, j] = imputed_data_x_gan_tmp[i, n * dim + j]

  imputer = KNNImputer(n_neighbors=5)
  imputed_data_x_knn = imputer.fit_transform(miss_data_x)

  imputer = IterativeImputer(verbose=True)
  imputed_data_x_mice = imputer.fit_transform(miss_data_x)

  imputed_data_x_gan = transformer.inverse_transform(imputed_data_x_gan)
  imputed_data_x_knn = transformer.inverse_transform(imputed_data_x_knn)
  imputed_data_x_mice = transformer.inverse_transform(imputed_data_x_mice)

  # Save imputed data to disk
  pickle.dump(imputed_data_x_gan, open('./filled_data.sav', 'wb'))

  # Get residuals for computation of stats
  distances_gan = np.zeros((dim, n_time_points * n_patients))
  distances_knn = np.zeros((dim, n_time_points * n_patients))
  distances_mice = np.zeros((dim, n_time_points * n_patients))
  distributions = {'deleted': [], 'gan': [], 'knn': [], 'mice': []}

  from scipy.stats import iqr

  for j in range(0, dim):

    nn_values = data_x[:, j].flatten()
    nn_values = nn_values[~np.isnan(nn_values)]
    dim_iqr = np.mean(nn_values)  # iqr(nn_values)

    for i in range(0, n_patients):
      variable_name = variables[j]
      i_start = int(i * n_time_points)
      i_stop = int(i * n_time_points + n_time_points)

      original_tuple = data_x[i_start:i_stop, j]
      corrupted_tuple = miss_data_x[i_start:i_stop, j]
      imputed_tuple_gan = imputed_data_x_gan[i_start:i_stop, j]
      imputed_tuple_knn = imputed_data_x_knn[i_start:i_stop, j]
      imputed_tuple_mice = imputed_data_x_mice[i_start:i_stop, j]

      #if i == 1 or i == 2:
      #  print(original_tuple, corrupted_tuple, imputed_tuple_gan, imputed_tuple_knn)

      for k in range(0, n_time_points):
        a, b, c, d = original_tuple[k], imputed_tuple_gan[k], \
                     imputed_tuple_knn[k], imputed_tuple_mice[k]
        if np.isnan(a) or data_m[i_start + k, j] != 0:
          continue
        #if i % 10 == 0: print(variable_name, a, b, c, d, b - a)
        # Index each (patient, time point) pair uniquely
        distances_gan[j, i_start + k] = (b - a)
        distances_knn[j, i_start + k] = (c - a)
        distances_mice[j, i_start + k] = (d - a)

  # Compute distance statistics
  all_stats = {}

  for j in range(0, dim):
    print('%d. Imputed variable: %s' % (j, variables[j]))
    current_stats = {'gan': {}, 'knn': {}, 'mice': {}}

    # Stats for original data
    dim_mean = np.mean([x for x in data_x[:, j] if not np.isnan(x)])
    dim_max = np.max([x for x in data_x[:, j] if not np.isnan(x)])
    dim_iqr = iqr([x for x in data_x[:, j] if not np.isnan(x)])

    # Indices for removed data
    ind = (data_m[:, j] == 0).flatten() & (~np.isnan(data_x[:, j])).flatten()

    # Stats for GAN
    current_stats['gan']['bias'] = np.mean(distances_gan[j])
    current_stats['gan']['rmse'] = np.sqrt(np.mean(distances_gan[j]**2))
    current_stats['gan']['nrmse'] = current_stats['gan']['rmse'] / dim_iqr
    current_stats['gan']['mape'] = np.mean(np.abs(distances_gan[j]))
    current_stats['gan']['wd'] = wasserstein_distance(
      data_x[ind, j].flatten(), imputed_data_x_gan[ind, j].flatten())

    # Stats for KNN
    current_stats['knn']['bias'] = np.mean(distances_knn[j])
    current_stats['knn']['rmse'] = np.sqrt(np.mean(distances_knn[j]**2))
    current_stats['knn']['nrmse'] = current_stats['knn']['rmse'] / dim_iqr
    current_stats['knn']['mape'] = np.mean(np.abs(distances_knn[j]))
    current_stats['knn']['wd'] = wasserstein_distance(
      data_x[ind, j].flatten(), imputed_data_x_knn[ind, j].flatten())

    # Stats for MICE
    current_stats['mice']['bias'] = np.mean(distances_mice[j])
    current_stats['mice']['rmse'] = np.sqrt(np.mean(distances_mice[j]**2))
    current_stats['mice']['nrmse'] = current_stats['mice']['rmse'] / dim_iqr
    current_stats['mice']['mape'] = np.mean(np.abs(distances_mice[j]))
    current_stats['mice']['wd'] = wasserstein_distance(
      data_x[ind, j].flatten(), imputed_data_x_mice[ind, j].flatten())

    for model_name in current_stats:
      model = current_stats[model_name]
      print('... %s - bias: %.3f, RMSE: %.3f, ME: %.3f, WD: %.3f' % \
            (model_name, model['bias'], model['rmse'], model['mape'], model['wd']))

    all_stats[variables[j]] = dict(current_stats)  # make a copy
    print()

  n_fig_rows, n_fig_cols = 6, 6
  n_fig_total = n_fig_rows * n_fig_cols

  if dim > n_fig_total:
    print('Warning: not all variables plotted')

  all_fig_axes = [
    plt.subplots(n_fig_rows, n_fig_cols, figsize=(15, 15)) for _ in range(0, 3)
  ]

  for j in range(0, dim):
    dim_not_nan = np.count_nonzero(~np.isnan(data_x[:, j]))
    deleted_no = np.count_nonzero(
      np.isnan(miss_data_x[:, j]) & ~np.isnan(data_x[:, j]))

    ax_title = variables[j] + (' (%d of %d observed)' % (deleted_no, dim_not_nan))

    dim_axes = [
      fig_axes[1][int(j / n_fig_cols), j % n_fig_cols] for fig_axes in all_fig_axes
    ]

    [
      ax.set_title(ax_title, fontdict={'fontsize': 7, 'fontweight': 'bold'})
      for ax in dim_axes
    ]

    input_arrays = [
      data_x, imputed_data_x_gan, imputed_data_x_knn, imputed_data_x_mice
    ]
    output_arrays = [
      np.asarray([input_arr[ii, j] for ii in range(0, no) if \
        (not np.isnan(data_x[ii, j]) and \
         data_m[ii, j] == 0)])
      for input_arr in input_arrays
    ]

    deleted_values, imputed_values_gan, imputed_values_knn, imputed_values_mice = output_arrays

    plot_distribution_densities(output_arrays, all_stats, variables[j], dim_axes[0])
    plot_distribution_residuals(output_arrays, dim_axes[1])
    plot_distribution_summaries(output_arrays, dim_axes[2])

    # Make QQ plot of original and deleted values vs. normal distribution
    #dist_max = np.max(np.concatenate([imputed_values_gan, deleted_values]))
    #qqplot_1sample((data_x[~np.isnan(data_x[:,j]),j] - dist_min) / dist_max, ax=ax3, color='b')
    #qqplot_1sample((data_x[data_m[:,j] == 0,j] - dist_min) / dist_max, ax=ax3, color='r', draw_line=False)

  # Figure 1
  fig1 = all_fig_axes[0][0]
  top_title = 'Kernel density estimation for erased and predicted values, for each imputation method'
  fig1.suptitle(top_title, fontsize=8)
  fig1.tight_layout(rect=[0, 0.03, 0, 1.25])
  fig1.subplots_adjust(hspace=1, wspace=0.35)

  # Figure 2
  fig2 = all_fig_axes[1][0]
  top_title = 'Q-Q plot of erased vs. imputed values, for each imputation method'
  fig2.suptitle(top_title, fontsize=8)
  fig2.tight_layout(rect=[0, 0.03, 0, 1.25])
  fig2.subplots_adjust(hspace=1, wspace=0.35)

  # Figure 3
  fig3 = all_fig_axes[2][0]
  top_title = 'Bayesian confidence intervals for the mean and standard deviation, for erased values and imputed values'
  fig3.suptitle(top_title, fontsize=8)
  fig3.tight_layout(rect=[0, 0.03, 0, 1.25])
  fig3.subplots_adjust(hspace=1, wspace=0.35)

  # Figure 4
  fig5, ax5 = plt.subplots(1, 1)
  top_title = 'Distribution of normalized RMSEs for each imputation method'
  fig5.suptitle(top_title, fontsize=8)
  plot_error_distributions(all_stats, fig5, ax5)
  ax5.set_ylabel('Probability density', fontsize=6)
  ax5.set_xlabel('NRMSE (normalized to IQR)', fontsize=6)
  ax5.legend(fontsize=6)
  fig5.tight_layout(rect=[0, 0.03, 0, 1.25])
  fig5.subplots_adjust(hspace=1, wspace=0.35)

  plt.show()

  for model_name in ['gan', 'knn', 'mice']:
    wds = [
      all_stats[variable_name][model_name]['wd'] for variable_name in all_stats
    ]
    nrmses = [
      all_stats[variable_name][model_name]['nrmse'] for variable_name in all_stats
    ]
    mwd = np.round(np.asarray(wds).mean(), 2)
    mnrmse = np.round(np.asarray(nrmses).mean(), 2)
    print('Model: %s - average WD = %.2f, average NRMSE = %.2f ' %
          (model_name, mwd, mnrmse))

  return all_stats
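# --- Hedged sketch (not part of the original listing) ---
# Imports gathered from the names referenced by the drivers above, plus a
# minimal entry point. NUM_ITERATIONS, `variables`, `remove_variables`,
# `encoders` and the plot_* helpers are defined elsewhere in the project and
# are only referenced here; the exact import list used by the authors may differ.
import pickle
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import wasserstein_distance
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (required before IterativeImputer)
from sklearn.impute import KNNImputer, IterativeImputer

if __name__ == '__main__':
  # Example driver for the temporal (one-column-per-time-point) variant of main()
  all_stats = main(iterations=NUM_ITERATIONS, batch_size=128,
                   hint_rate=0.5, miss_rate=0.3)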
def gain(data_x, gain_parameters):
  '''Impute missing values in data_x

  Args:
    - data_x: original data with missing values
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations

  Returns:
    - imputed_data: imputed data
  '''
  # Define mask matrix
  data_m = 1 - np.isnan(data_x)

  # System parameters
  batch_size = gain_parameters['batch_size']
  hint_rate = gain_parameters['hint_rate']
  alpha = 10  # gain_parameters['alpha']
  iterations = gain_parameters['iterations']

  # Other parameters
  no, dim = data_x.shape

  # Hidden state dimensions
  h_dim = int(dim)

  # Normalization
  norm_data, norm_parameters = normalization(data_x)
  norm_data_x = np.nan_to_num(norm_data, 0)

  ## GAIN architecture
  # Input placeholders
  # Data vector
  X = tf.placeholder(tf.float32, shape=[None, dim])
  # Mask vector
  M = tf.placeholder(tf.float32, shape=[None, dim])
  # Hint vector
  H = tf.placeholder(tf.float32, shape=[None, dim])

  # Discriminator variables
  D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
  D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
  D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
  D_W3 = tf.Variable(xavier_init([h_dim, dim]))
  D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

  theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

  # Generator variables
  # Data + Mask as inputs (random noise is in missing components)
  G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
  G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))
  G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))
  G_W3 = tf.Variable(xavier_init([h_dim, dim]))
  G_b3 = tf.Variable(tf.zeros(shape=[dim]))

  theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

  ## GAIN functions
  # Generator
  def generator(x, m):
    # Concatenate Mask and Data
    inputs = tf.concat(values=[x, m], axis=1)
    G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
    G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
    # MinMax normalized output
    G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
    return G_prob

  # Discriminator
  def discriminator(x, h):
    # Concatenate Data and Hint
    inputs = tf.concat(values=[x, h], axis=1)
    D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
    D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
    D_logit = tf.matmul(D_h2, D_W3) + D_b3
    D_prob = tf.nn.sigmoid(D_logit)
    return D_prob

  ## GAIN structure
  # Generator
  G_sample = generator(X, M)

  # Combine with observed data
  Hat_X = X * M + G_sample * (1 - M)

  # Discriminator
  D_prob = discriminator(Hat_X, H)

  ## GAIN loss
  D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                + (1-M) * tf.log(1. - D_prob + 1e-8))
  G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))

  MSE_loss = \
      tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

  D_loss = D_loss_temp
  G_loss = G_loss_temp + alpha * MSE_loss

  ## GAIN solver
  D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
  G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

  ## Iterations
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())

  # Start Iterations
  for it in tqdm(range(iterations)):

    # Sample batch
    batch_idx = sample_batch_index(no, batch_size)
    X_mb = norm_data_x[batch_idx, :]
    M_mb = data_m[batch_idx, :]

    # Sample random vectors
    Z_mb = uniform_sampler(0, 0.01, batch_size, dim, True)
    # Sample hint vectors
    H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
    H_mb = M_mb * H_mb_temp

    # Combine random vectors with observed vectors
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                              feed_dict={M: M_mb, X: X_mb, H: H_mb})
    _, G_loss_curr, MSE_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss],
                 feed_dict={X: X_mb, M: M_mb, H: H_mb})

  ## Return imputed data
  Z_mb = uniform_sampler(0, 0.01, no, dim, True)
  M_mb = data_m
  X_mb = norm_data_x
  X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

  imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
  imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

  # Renormalization
  imputed_data = renormalization(imputed_data, norm_parameters)

  # Rounding
  imputed_data = rounding(imputed_data, data_x)

  return imputed_data
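# --- Hedged sketch (not part of the original listing) ---
# All gain() variants above use the TF1 graph API (tf.placeholder, tf.Session)
# and an xavier_init helper for weight initialization. The compat import and
# the initializer below are a minimal sketch in the style of the GAIN reference
# implementation; the project's own definitions are not shown here.
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def xavier_init(size):
  # Xavier/Glorot-style initialization: zero-mean normal with
  # standard deviation scaled by the layer's input dimension
  in_dim = size[0]
  xavier_stddev = 1. / np.sqrt(in_dim / 2.)
  return tf.random_normal(shape=size, stddev=xavier_stddev)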