def completion(self, x, mask, vae):
    '''
    Generate new samples conditioned on observations.
    :param x: underlying partially observed data
    :param mask: mask of missingness
    :param vae: a pre-trained VAE
    The number of MC samples (self._M), the categorical dimensions (self._cat_dims,
    the number of possible outcomes per non-continuous variable), the variable-type
    indicator (self._dic_var_type) and the list of discrete variables
    (self._list_discrete) are read from instance attributes.
    :return: sampled missing data, an M by N by D matrix, where M is the number of samples
    '''
    # decompress mask: expand each categorical mask entry to its one-hot width
    mask_flt = mask[:, np.ndarray.flatten(np.argwhere(self._dic_var_type == 0))]
    mask_cat_oh = np.array([]).reshape(x.shape[0], 0)
    for d in range(len(self._cat_dims)):
        temp = np.ones((x.shape[0], self._cat_dims[d]))
        temp[mask[:, d] == 0, :] = 0
        mask_cat_oh = np.concatenate([mask_cat_oh, temp], 1)
    mask = np.concatenate([mask_cat_oh, mask_flt], 1)
    im = np.zeros((self._M, x.shape[0], x.shape[1]))
    for m in range(self._M):
        np.random.seed(42 + m)  # fixed seed per MC sample (added for bar plots only)
        noisy_samples = vae.im(x, mask)
        # keep observed entries, fill missing entries with the VAE samples
        noisy_samples_mix = x * mask + noisy_samples * (1 - mask)
        inverted_samples = process.invert_noise(noisy_samples_mix, self._list_discrete, self._records_d)
        im[m, :, :] = inverted_samples
    return im
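# A minimal usage sketch for completion (hypothetical variable names: x_test is the
# decompressed data, mask_test the compressed missingness mask, `model` a trained
# instance of this class and `vae` its pre-trained network). Each of the self._M
# slices of the returned array is one plausible completion of the missing entries,
# with the observed entries kept fixed.
#
#   imputations = model.completion(x_test, mask_test, vae)   # shape (M, N, D)
#   x_filled = imputations.mean(axis=0)                       # MC-averaged point estimate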
def get_imputation(self, x, mask_obs, cat_dims, dic_var_type):
    '''
    Impute missing entries given the observed ones.
    :param x: partially observed (decompressed) data
    :param mask_obs: mask that indicates which variables are observed
    :param cat_dims: number of possible outcomes for each categorical variable
    :param dic_var_type: indicator of whether each variable is categorical
    :return: decoded data (categorical columns as integer labels), posterior latent code,
             and predictive probabilities over the categorical outcomes
    '''
    # decompress mask: expand each categorical mask entry to its one-hot width
    mask_flt = mask_obs[:, np.ndarray.flatten(np.argwhere(dic_var_type == 0))]
    mask_cat_oh = np.array([]).reshape(x.shape[0], 0)
    for d in range(len(cat_dims)):
        temp = np.ones((x.shape[0], cat_dims[d]))
        temp[mask_obs[:, d] == 0, :] = 0
        mask_cat_oh = np.concatenate([mask_cat_oh, temp], 1)
    mask_obs = np.concatenate([mask_cat_oh, mask_flt], 1)
    decoded_noisy = self._sesh.run(self.decoded,
                                   feed_dict={self.x: x, self.mask: mask_obs, self.x_induce: self._x_train})
    z_posterior = self._sesh.run(self.z,
                                 feed_dict={self.x: x, self.mask: mask_obs, self.x_induce: self._x_train})
    decoded = process.invert_noise(decoded_noisy, self._list_discrete, self._records_d)
    # revert decoded one-hot categorical blocks to integer labels
    dim_cat = len(np.argwhere(cat_dims != -1))
    decoded_cat = decoded[:, 0:self._DIM_CAT]
    decoded_flt = decoded[:, self._DIM_CAT:]
    decoded_cat_int = np.zeros((decoded.shape[0], dim_cat))
    cumsum_cat_dims = np.concatenate(([0], np.cumsum(cat_dims)))
    decoded_cat_p = []
    for d in range(len(cat_dims)):
        decoded_cat_int_p = decoded_cat[:, cumsum_cat_dims[d]:cumsum_cat_dims[d + 1]]
        decoded_cat_int_p = decoded_cat_int_p / np.sum(decoded_cat_int_p, 1, keepdims=True)
        if d == 0:
            decoded_cat_p = decoded_cat_int_p
        else:
            decoded_cat_p = np.concatenate([decoded_cat_p, decoded_cat_int_p], 1)
        # sample an integer label per row from the normalized probabilities
        for n in range(decoded.shape[0]):
            decoded_cat_int[n, d] = np.random.choice(len(decoded_cat_int_p[n, :]), 1, p=decoded_cat_int_p[n, :])
    decoded = np.concatenate((decoded_cat_int, decoded_flt), axis=1)
    return decoded, z_posterior, decoded_cat_p
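# A minimal usage sketch for get_imputation (hypothetical names; `vae` is a trained
# instance of this class, x_noisy the decompressed noisy data, mask_obs the compressed
# observation mask). Passing an all-zero mask, as encode2 does below, treats every
# variable as missing and reconstructs the whole row.
#
#   decoded, z_post, cat_p = vae.get_imputation(x_noisy, mask_obs, cat_dims, dic_var_type)
#   # decoded : categorical columns as integer labels, continuous columns as floats
#   # z_post  : posterior latent code for each row
#   # cat_p   : per-row predictive probabilities over the categorical outcomes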
def predictive_loss(self, x, mask, cat_dims, dic_var_type, M):
    '''
    Compute predictive losses (negative log-likelihood) for the active learning phase.
    We assume that the last column of x is the target variable of interest.
    :param x: data matrix; the last column of x is the target variable of interest
    :param mask: mask that indicates observed data and missing data locations
    :param cat_dims: number of possible outcomes for each categorical variable
    :param dic_var_type: indicator of whether each variable is categorical
    :param M: number of MC samples
    :return: per-row negative log-likelihood and absolute error of the target variable
    '''
    lh = 0
    rmse = 0
    ae = 0
    uncertainty_data = np.zeros((x.shape[0], M))
    # decompress mask: expand each categorical mask entry to its one-hot width
    mask_flt = mask[:, np.ndarray.flatten(np.argwhere(dic_var_type == 0))]
    mask_cat_oh = np.array([]).reshape(x.shape[0], 0)
    for d in range(len(cat_dims)):
        temp = np.ones((x.shape[0], cat_dims[d]))
        temp[mask[:, d] == 0, :] = 0
        mask_cat_oh = np.concatenate([mask_cat_oh, temp], 1)
    mask = np.concatenate([mask_cat_oh, mask_flt], 1)
    auto_std = self._sesh.run(self.auto_std,
                              feed_dict={self.x: x, self.mask: mask, self.x_induce: self._x_train})
    for m in range(M):
        decoded_noisy = self._sesh.run(self.decoded,
                                       feed_dict={self.x: x, self.mask: mask, self.x_induce: self._x_train})
        decoded = process.invert_noise(decoded_noisy, self._list_discrete, self._records_d)
        target = x[:, -1]
        output = decoded[:, -1]
        uncertainty_data[:, m] = decoded[:, -1]
        # Gaussian likelihood of the target under the decoder's predictive std
        lh += np.exp(-0.5 * np.square(target - output) / np.square(auto_std[:, -1])
                     - np.log(auto_std[:, -1]) - 0.5 * np.log(2 * np.pi))
        rmse += np.sqrt(np.sum(np.square(target - output)) / mask.shape[0])  # computed but not returned
        ae += np.abs(target - output)
    nllh = -np.log(lh / M)
    rmse /= M
    ae /= M
    return nllh, ae
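# A minimal usage sketch (hypothetical names; x_test holds the target variable in its
# last column and mask_test marks which of its entries are observed). The per-row
# negative log-likelihood and absolute error can be averaged to report test NLL and MAE.
#
#   nllh, ae = vae.predictive_loss(x_test, mask_test, cat_dims, dic_var_type, M=50)
#   print('test NLL: %.3f  test MAE: %.3f' % (nllh.mean(), ae.mean()))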
def encode2(data_decode, list_discrete, records_d, fast_plot):
    # unpack the decompressed (masked) data produced by the load_data function
    Data_train_decomp, Data_train_noisy_decomp, mask_train_decomp, Data_test_decomp, \
        mask_test_comp, mask_test_decomp, cat_dims, DIM_FLT, dic_var_type = data_decode
    vae = p_vae_active_learning(Data_train_decomp, Data_train_noisy_decomp, mask_train_decomp,
                                Data_test_decomp, mask_test_comp, mask_test_decomp,
                                cat_dims, DIM_FLT, dic_var_type, args, list_discrete, records_d)
    x_real = process.compress_data(Data_train_decomp, cat_dims, dic_var_type)  # x_real still needs conversion
    x_real_cat_p = Data_train_decomp[:, 0:(cat_dims.sum()).astype(int)]
    tf.reset_default_graph()
    # impute with an all-zero mask; one-hot blocks are already converted to integers
    x_recon, z_posterior, x_recon_cat_p = vae.get_imputation(
        Data_train_noisy_decomp, mask_train_decomp * 0, cat_dims, dic_var_type)

    # rescale the real data to [min_Data, max_Data] for plotting
    max_Data = 0.7
    min_Data = 0.3
    Data_std = (x_real - x_real.min(axis=0)) / (x_real.max(axis=0) - x_real.min(axis=0))
    scaling_factor = (x_real.max(axis=0) - x_real.min(axis=0)) / (max_Data - min_Data)
    Data_real = Data_std * (max_Data - min_Data) + min_Data

    fast_plot = 1  # forces the fast plotting path, overriding the argument
    sub_id = [1, 2, 10]
    if fast_plot:
        Data_real = pd.DataFrame(Data_real[:, sub_id])
        g = sns.pairplot(Data_real.sample(min(1000, x_real.shape[0])), diag_kind='kde')
        g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
        g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))
    else:
        Data_real = pd.DataFrame(Data_real[:, sub_id])
        g = sns.pairplot(Data_real.sample(min(10000, x_real.shape[0])), diag_kind='kde')
        g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
        g = g.map_upper(plt.scatter, marker='+')
        g = g.map_lower(sns.kdeplot, cmap="hot", shade=True, bw=.1)
        g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))

    # rescale the imputed (fake) data with the statistics of the real data and plot it
    Data_fake_noisy = x_recon
    Data_fake = process.invert_noise(Data_fake_noisy, list_discrete, records_d)
    Data_std = (Data_fake - x_real.min(axis=0)) / (x_real.max(axis=0) - x_real.min(axis=0))
    Data_fake = Data_std * (max_Data - min_Data) + min_Data
    sub_id = [1, 2, 10]
    if fast_plot:
        g = sns.pairplot(pd.DataFrame(Data_fake[:, sub_id]).sample(min(1000, x_real.shape[0])), diag_kind='kde')
        g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
        g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))
    else:
        g = sns.pairplot(pd.DataFrame(Data_fake[:, sub_id]).sample(min(1000, x_real.shape[0])), diag_kind='kde')
        g = g.map_diag(sns.distplot, bins=50, norm_hist=True)
        g = g.map_upper(plt.scatter, marker='+')
        g = g.map_lower(sns.kdeplot, cmap="hot", shade=True, bw=.1)
        g.set(xlim=(min_Data, max_Data), ylim=(min_Data, max_Data))
    return vae, scaling_factor
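# A minimal usage sketch for encode2 (hypothetical call; `load_data`, its signature,
# and the names list_discrete / records_d are assumptions about the preprocessing step,
# which must supply the nine-element tuple unpacked at the top of encode2):
#
#   data_decode = load_data(args)                               # assumed loader
#   vae, scaling_factor = encode2(data_decode, list_discrete, records_d, fast_plot=1)
#   # `vae` can then be reused for completion / get_imputation / predictive_loss, and
#   # `scaling_factor` undoes the [min_Data, max_Data] rescaling used for the pair plots.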