def dec_network_loglik(settings, zcodes, scodes, VP=False):
    'decode using fixed s and z codes (if generated, provide a generated miss_list) and return the log-likelihood of the decoded data'
    argvals = settings.split()
    args = parser_arguments.getArgs(argvals)
    print(args)

    # Create a directory for the save file
    if not os.path.exists('./Saved_Networks/' + args.save_file):
        os.makedirs('./Saved_Networks/' + args.save_file)

    network_file_name = './Saved_Networks/' + args.save_file + '/' + args.save_file + '.ckpt'
    log_file_name = './Saved_Networks/' + args.save_file + '/log_file_' + args.save_file + '.txt'

    # Creating graph
    sess_HVAE = tf.Graph()
    with sess_HVAE.as_default():
        tf_nodes = graph_new.HVAE_graph(
            args.model_name, args.types_file, args.batch_size,
            learning_rate=args.learning_rate, z_dim=args.dim_latent_z,
            y_dim=args.dim_latent_y, s_dim=args.dim_latent_s,
            y_dim_partition=args.dim_latent_y_partition)

    train_data, types_dict, miss_mask, true_miss_mask, n_samples = read_functions.read_data(
        args.data_file, args.types_file, args.miss_file, args.true_miss_file)

    # Get an integer number of batches
    n_batches = int(np.floor(np.shape(train_data)[0] / args.batch_size))

    # Compute the real miss_mask
    miss_mask = np.multiply(miss_mask, true_miss_mask)

    with tf.Session(graph=sess_HVAE) as session:
        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
        saver.restore(session, network_file_name)
        print("Model restored: " + network_file_name)
        print('::::::DECODING:::::::::')

        start_time = time.time()
        # Training cycle
        epoch = 0
        samples_list = []

        # Constant Gumbel-Softmax parameter (value at which the annealing finished)
        tau = 1e-3

        for i in range(n_batches):
            data_list, miss_list = read_functions.next_batch(
                train_data, types_dict, miss_mask, args.batch_size, index_batch=i)

            # Create inputs for the feed_dict: zero out unobserved entries
            data_list_observed = [
                data_list[i] * np.reshape(miss_list[:, i], [args.batch_size, 1])
                for i in range(len(data_list))
            ]

            # Create feed dictionary
            feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], data_list)}
            feedDict.update({
                i: d for i, d in zip(tf_nodes['ground_batch_observed'], data_list_observed)
            })
            feedDict[tf_nodes['miss_list']] = miss_list
            if VP == True:
                vpfile = 'VP_misslist/' + re.sub('data_python/|.csv', '', args.data_file) + '_vpmiss.csv'
                print('::::::::::::' + vpfile)
                feedDict[tf_nodes['miss_list_VP']] = pd.read_csv(vpfile, header=None)
            elif VP == 'nomiss':
                print(':::::::::::: ones for miss list VP')
                feedDict[tf_nodes['miss_list_VP']] = np.ones(miss_list.shape)
            else:
                feedDict[tf_nodes['miss_list_VP']] = miss_list
            feedDict[tf_nodes['tau_GS']] = tau
            feedDict[tf_nodes['zcodes']] = np.array(zcodes).reshape((len(zcodes), 1))
            feedDict[tf_nodes['scodes']] = np.array(scodes).reshape((len(scodes), 1))

            # Get samples from the fixed decoder
            samples_zgen, log_p_x_test, log_p_x_missing_test, test_params = session.run(
                [
                    tf_nodes['samples_zgen'], tf_nodes['log_p_x_zgen'],
                    tf_nodes['log_p_x_missing_zgen'], tf_nodes['test_params_zgen']
                ],
                feed_dict=feedDict)
            samples_list.append(samples_zgen)

        return log_p_x_test
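
# Illustrative note (added, not part of the original script): dec_network_loglik() pushes
# zcodes and scodes into the graph via reshape((len(codes), 1)), so each must hold exactly
# one value per sample. The VP flag picks the decoder miss list: True loads
# 'VP_misslist/<data name>_vpmiss.csv', 'nomiss' marks every entry as observed, and anything
# else reuses the training miss_list. The tiny helper below just mirrors that reshape for
# ad-hoc shape checks before calling the decoder.
def _as_code_column(codes):
    """Mirror the reshape used in dec_network_loglik: one latent code per sample, as an (n, 1) column."""
    arr = np.array(codes)
    return arr.reshape((len(arr), 1))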
def train_network(settings):
    'run training (no output)'
    argvals = settings.split()
    args = parser_arguments.getArgs(argvals)
    print(args)

    # Create a directory for the save file
    if not os.path.exists('./Saved_Networks/' + args.save_file):
        os.makedirs('./Saved_Networks/' + args.save_file)

    network_file_name = './Saved_Networks/' + args.save_file + '/' + args.save_file + '.ckpt'
    load_file_name = './Saved_Networks/' + re.sub('_BNet', '', args.save_file) + '/' + \
        re.sub('_BNet', '', args.save_file) + '.ckpt'
    log_file_name = './Saved_Networks/' + args.save_file + '/log_file_' + args.save_file + '.txt'

    # Creating graph
    sess_HVAE = tf.Graph()
    with sess_HVAE.as_default():
        tf_nodes = graph_new.HVAE_graph(
            args.model_name, args.types_file, args.batch_size,
            learning_rate=args.learning_rate, z_dim=args.dim_latent_z,
            y_dim=args.dim_latent_y, s_dim=args.dim_latent_s,
            y_dim_partition=args.dim_latent_y_partition)

    ################### Running the VAE Training #################################
    train_data, types_dict, miss_mask, true_miss_mask, n_samples = read_functions.read_data(
        args.data_file, args.types_file, args.miss_file, args.true_miss_file)

    # Get an integer number of batches
    n_batches = int(np.floor(np.shape(train_data)[0] / args.batch_size))

    # Compute the real miss_mask
    miss_mask = np.multiply(miss_mask, true_miss_mask)

    with tf.Session(graph=sess_HVAE) as session:
        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
        if args.restore == 1:
            saver.restore(session, load_file_name)
            print("Model restored: " + load_file_name)
        else:
            print('Initializing Variables ...')
            tf.global_variables_initializer().run()

        start_time = time.time()

        # Training cycle
        loglik_epoch = []
        testloglik_epoch = []
        KL_s_epoch = []
        KL_z_epoch = []
        loss_epoch = []

        for epoch in range(args.epochs):
            avg_loss = 0.
            avg_loss_reg = 0.
            avg_KL_s = 0.
            avg_KL_z = 0.
            samples_list = []
            p_params_list = []
            q_params_list = []
            log_p_x_total = []
            log_p_x_missing_total = []

            # Annealing of the Gumbel-Softmax parameter
            tau = np.max([1.0 - (0.999 / (args.epochs - 50)) * epoch, 1e-3])
            print(tau)

            # Randomize the data in the mini-batches
            random_perm = np.random.permutation(range(np.shape(train_data)[0]))
            train_data_aux = train_data[random_perm, :]
            miss_mask_aux = miss_mask[random_perm, :]
            true_miss_mask_aux = true_miss_mask[random_perm, :]

            for i in range(n_batches):
                data_list, miss_list = read_functions.next_batch(
                    train_data_aux, types_dict, miss_mask_aux, args.batch_size, index_batch=i)

                # Create inputs for the feed_dict: zero out unobserved entries
                data_list_observed = [
                    data_list[i] * np.reshape(miss_list[:, i], [args.batch_size, 1])
                    for i in range(len(data_list))
                ]

                # Create feed dictionary
                feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], data_list)}
                feedDict.update({
                    i: d for i, d in zip(tf_nodes['ground_batch_observed'], data_list_observed)
                })
                feedDict[tf_nodes['miss_list']] = miss_list
                feedDict[tf_nodes['miss_list_VP']] = np.ones(miss_list.shape)  # only valid when everything runs as 1 batch / 1 epoch
                feedDict[tf_nodes['tau_GS']] = tau
                feedDict[tf_nodes['zcodes']] = np.ones(args.batch_size).reshape((args.batch_size, 1))  # placeholder only
                feedDict[tf_nodes['scodes']] = np.ones(args.batch_size).reshape((args.batch_size, 1))  # placeholder only

                # Running the VAE
                _, loss, KL_z, KL_s, samples, log_p_x, log_p_x_missing, p_params, q_params, loss_reg = session.run(
                    [
                        tf_nodes['optim'], tf_nodes['loss_re'], tf_nodes['KL_z'],
                        tf_nodes['KL_s'], tf_nodes['samples'], tf_nodes['log_p_x'],
                        tf_nodes['log_p_x_missing'], tf_nodes['p_params'],
                        tf_nodes['q_params'], tf_nodes['loss_reg']
                    ],
                    feed_dict=feedDict)

                # Collect all samples, distribution parameters and log-likelihoods in lists
                samples_list.append(samples)
                p_params_list.append(p_params)
                q_params_list.append(q_params)
                log_p_x_total.append(log_p_x)
                log_p_x_missing_total.append(log_p_x_missing)

                # Compute average loss
                avg_loss += np.mean(loss)
                avg_KL_s += np.mean(KL_s)
                avg_KL_z += np.mean(KL_z)
                avg_loss_reg += np.mean(loss_reg)

            print('Epoch: ' + str(epoch) + ' Rec. Loss: ' + str(avg_loss / n_batches) +
                  ' KL s: ' + str(avg_KL_s / n_batches) + ' KL z: ' + str(avg_KL_z / n_batches))
            loss_epoch.append(-avg_loss / n_batches)

            if epoch % args.save == 0:
                print('Saving Variables ... ' + network_file_name)
                save_path = saver.save(session, network_file_name)

        print('Training Finished ...')

        plt.clf()
        plt.figure()
        plt.plot(loss_epoch)
        plt.xlabel('Epoch')
        plt.ylabel('Reconstruction loss')
        plt.title(args.save_file)
        plt.savefig('Saved_Networks/train_stats/' + args.save_file + '.png', bbox_inches='tight')
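
# Added sketch (not in the original): the annealing above lowers tau linearly from 1.0 at
# epoch 0 down to the 1e-3 floor, which it reaches about 50 epochs before the end (the
# epochs - 50 denominator); the encode/decode helpers then keep tau fixed at that final
# value. This standalone copy of the schedule is only for inspecting it before a run.
def _gumbel_tau_schedule(epochs):
    """Return the Gumbel-Softmax temperature train_network() would use at each epoch."""
    return [np.max([1.0 - (0.999 / (epochs - 50)) * epoch, 1e-3]) for epoch in range(epochs)]

# e.g. _gumbel_tau_schedule(250)[0] == 1.0 and _gumbel_tau_schedule(250)[-1] == 1e-3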
avg_loss - avg_KL_z - avg_KL_s, avg_test_loglik))

# Get arguments for parser
args = parser_arguments.getArgs(sys.argv[1:])

# Create a directory for the save file
if not os.path.exists('./Saved_Networks/' + args.save_file):
    os.makedirs('./Saved_Networks/' + args.save_file)

network_file_name = './Saved_Networks/' + args.save_file + '/' + args.save_file + '.ckpt'
log_file_name = './Saved_Networks/' + args.save_file + '/log_file_' + args.save_file + '.txt'

print(args)

train_data, types_dict, miss_mask, true_miss_mask, n_samples = read_functions.read_data(
    args.data_file, args.types_file, args.miss_file, args.true_miss_file)

# Check batch size
if args.batch_size > n_samples:
    args.batch_size = n_samples

# Get an integer number of batches
n_batches = int(np.floor(np.shape(train_data)[0] / args.batch_size))

# Compute the real miss_mask
miss_mask = np.multiply(miss_mask, true_miss_mask)

# Creating graph
sess_HVAE = tf.Graph()
with sess_HVAE.as_default():
    tf_nodes = graph_new.HVAE_graph(
        args.model_name, args.types_file,
def enc_network(settings):
    'get s and z samples as embeddings, as well as the original dataframe (with relevelled factors & NA\'s=0!)'
    argvals = settings.split()
    args = parser_arguments.getArgs(argvals)
    print(args)

    # Create a directory for the save file
    if not os.path.exists('./Saved_Networks/' + args.save_file):
        os.makedirs('./Saved_Networks/' + args.save_file)

    network_file_name = './Saved_Networks/' + args.save_file + '/' + args.save_file + '.ckpt'
    log_file_name = './Saved_Networks/' + args.save_file + '/log_file_' + args.save_file + '.txt'

    # Creating graph
    sess_HVAE = tf.Graph()
    with sess_HVAE.as_default():
        tf_nodes = graph_new.HVAE_graph(
            args.model_name, args.types_file, args.batch_size,
            learning_rate=args.learning_rate, z_dim=args.dim_latent_z,
            y_dim=args.dim_latent_y, s_dim=args.dim_latent_s,
            y_dim_partition=args.dim_latent_y_partition)

    train_data, types_dict, miss_mask, true_miss_mask, n_samples = read_functions.read_data(
        args.data_file, args.types_file, args.miss_file, args.true_miss_file)

    # Get an integer number of batches
    n_batches = int(np.floor(np.shape(train_data)[0] / args.batch_size))

    # Compute the real miss_mask
    miss_mask = np.multiply(miss_mask, true_miss_mask)

    with tf.Session(graph=sess_HVAE) as session:
        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
        saver.restore(session, network_file_name)
        print("Model restored: " + network_file_name)

        start_time = time.time()

        # Training cycle
        loglik_epoch = []
        testloglik_epoch = []
        epoch = 0
        avg_loss = 0.
        avg_loss_reg = 0.
        avg_KL_s = 0.
        avg_KL_y = 0.
        avg_KL_z = 0.
        samples_list = []
        samples_list_test = []
        p_params_list = []
        q_params_list = []
        log_p_x_total = []
        log_p_x_missing_total = []

        # Constant Gumbel-Softmax parameter (value at which the annealing finished)
        tau = 1e-3

        for i in range(n_batches):
            # Create train minibatch
            data_list, miss_list = read_functions.next_batch(
                train_data, types_dict, miss_mask, args.batch_size, index_batch=i)

            # Zero out unobserved entries
            data_list_observed = [
                data_list[i] * np.reshape(miss_list[:, i], [args.batch_size, 1])
                for i in range(len(data_list))
            ]

            # Create feed dictionary
            feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], data_list)}
            feedDict.update({
                i: d for i, d in zip(tf_nodes['ground_batch_observed'], data_list_observed)
            })
            feedDict[tf_nodes['miss_list']] = miss_list
            feedDict[tf_nodes['miss_list_VP']] = np.ones(miss_list.shape)  # unused here
            feedDict[tf_nodes['tau_GS']] = tau
            feedDict[tf_nodes['zcodes']] = np.ones(args.batch_size).reshape((args.batch_size, 1))  # placeholder only
            feedDict[tf_nodes['scodes']] = np.ones(args.batch_size).reshape((args.batch_size, 1))  # placeholder only

            # Get samples from the model
            KL_s, loss, samples, log_p_x, log_p_x_missing, loss_total, KL_z, p_params, q_params, loss_reg = session.run(
                [
                    tf_nodes['KL_s'], tf_nodes['loss_re'], tf_nodes['samples'],
                    tf_nodes['log_p_x'], tf_nodes['log_p_x_missing'], tf_nodes['loss'],
                    tf_nodes['KL_z'], tf_nodes['p_params'], tf_nodes['q_params'],
                    tf_nodes['loss_reg']
                ],
                feed_dict=feedDict)

            samples_list.append(samples)
            q_params_list.append(q_params)

            # Compute average loss
            avg_loss += np.mean(loss)
            avg_loss_reg += np.mean(loss_reg)
            avg_KL_s += np.mean(KL_s)
            avg_KL_z += np.mean(KL_z)

        # Transform discrete variables to original values (this recovers the original data frame)
        train_data_transformed = read_functions.discrete_variables_transformation(
            train_data, types_dict)

        # Create global dictionary of the distribution parameters
        q_params_complete = read_functions.q_distribution_params_concatenation(
            q_params_list, args.dim_latent_z, args.dim_latent_s)

        # Return the deterministic and sampled s and z codes and the reconstructed dataframe (now imputed)
        encs = np.argmax(q_params_complete['s'], 1)
        encz = q_params_complete['z'][0, :, :]
        return [encs, encz, train_data_transformed]
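
# Hedged usage sketch (added; not part of the original pipeline): how the three helpers are
# meant to be chained. 'settings' is a hypothetical flag string in whatever format
# parser_arguments.getArgs() expects; it is not defined here. Passing encz straight into the
# decoder assumes dim_latent_z = 1, so that each sample has a single z code, which is what
# the reshape inside dec_network_loglik requires.
def _example_roundtrip(settings):
    """Train the HI-VAE, embed the data, then score the reconstruction under the fixed codes."""
    train_network(settings)                              # fit and checkpoint the network
    encs, encz, dat = enc_network(settings)              # deterministic s (argmax) and z (mean) codes
    loglik = dec_network_loglik(settings, encz, encs)    # log p(x | z, s), last batch only
    return encs, encz, dat, loglik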
# In[5]:

data_file = 'Wine/data.csv'
types_file = 'Wine/data_types.csv'
miss_file = 'Wine/Missing10-50_1.csv'
true_miss_file = None
batch_size = 128

# In[6]:

train_data, types_dict, miss_mask, true_miss_mask, n_samples = rf.read_data(
    data_file, types_file, miss_file, true_miss_file)

# Randomize the data in the mini-batches
random_perm = np.random.permutation(range(np.shape(train_data)[0]))
train_data_aux = train_data[random_perm, :]
miss_mask_aux = miss_mask[random_perm, :]
true_miss_mask_aux = true_miss_mask[random_perm, :]

# Check batch size
if batch_size > n_samples:
    batch_size = n_samples

for t in types_dict:
    t['dim'] = int(t['dim'])

# Compute the real miss_mask
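
# Added sketch (illustrative, mirroring the per-batch code in the functions above; it assumes
# rf is the same read_functions module used there): combine the two masks into the "real"
# miss mask and pull one observed-only mini-batch of the Wine data with rf.next_batch.
def _example_minibatch(index_batch=0):
    """Return (data_list, miss_list, data_list_observed) for one mini-batch of the Wine data."""
    real_miss_mask = np.multiply(miss_mask_aux, true_miss_mask_aux)  # observed AND truly recorded
    data_list, miss_list = rf.next_batch(
        train_data_aux, types_dict, real_miss_mask, batch_size, index_batch=index_batch)
    # zero out unobserved entries, as done before building the feed_dict in the functions above
    data_list_observed = [
        data_list[i] * np.reshape(miss_list[:, i], [batch_size, 1])
        for i in range(len(data_list))
    ]
    return data_list, miss_list, data_list_observed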