Example #1
def train(params, x_data, y_data_h, save_dir):

    y_data_train_l = y_data_h

    # USEFUL SIZES
    xsh = np.shape(x_data)
    ysh1 = np.shape(y_data_h)[1]

    z_dimension = params['z_dimension']
    bs = params['batch_size']
    n_weights = params['n_weights']
    lam = 10

    graph = tf.Graph()
    session = tf.Session(graph=graph)
    with graph.as_default():
        tf.set_random_seed(np.random.randint(0, 10))
        SMALL_CONSTANT = 1e-6

        # PLACE HOLDERS
        x_ph = tf.placeholder(dtype=tf.float32,
                              shape=[None, xsh[1]],
                              name="x_ph")
        bs_ph = tf.placeholder(dtype=tf.int64,
                               name="bs_ph")  # batch size placeholder

        # LOAD VICI NEURAL NETWORKS
        autoencoder = VICI_decoder.VariationalAutoencoder(
            "VICI_decoder", xsh[1], z_dimension + ysh1, n_weights)
        autoencoder_ENC = VICI_encoder.VariationalAutoencoder(
            "VICI_encoder", ysh1, z_dimension, n_weights)
        autoencoder_VAE = VICI_VAE_encoder.VariationalAutoencoder(
            "VICI_VAE_encoder", xsh[1] + ysh1, z_dimension, n_weights)

        # GET r(z|y)
        x_ph_n = tf_normalise_dataset(x_ph)
        y_ph = tf.placeholder(dtype=tf.float32,
                              shape=[None, ysh1],
                              name="y_ph")
        y_ph_n = tf_normalise_dataset(y_ph)
        zy_mean, zy_log_sig_sq = autoencoder_ENC._calc_z_mean_and_sigma(y_ph_n)

        # DRAW FROM r(z|y)
        rzy_samp = autoencoder_VAE._sample_from_gaussian_dist(
            bs_ph, z_dimension, zy_mean, zy_log_sig_sq)

        # GET r(x|z,y) from r(z|y) samples
        rzy_samp_y = tf.concat([rzy_samp, y_ph_n], 1)
        reconstruction_xzy = autoencoder.calc_reconstruction(rzy_samp_y)
        x_mean = reconstruction_xzy[0]
        x_log_sig_sq = reconstruction_xzy[1]

        # KL(r(z|y)||p(z))
        latent_loss = -0.5 * tf.reduce_sum(
            1 + zy_log_sig_sq - tf.square(zy_mean) - tf.exp(zy_log_sig_sq), 1)
        KL = tf.reduce_mean(latent_loss)

        # GET q(z|x,y)
        xy_ph = tf.concat([x_ph_n, y_ph_n], 1)
        zx_mean, zx_log_sig_sq = autoencoder_VAE._calc_z_mean_and_sigma(xy_ph)

        # DRAW FROM q(z|x,y)
        qzx_samp = autoencoder_VAE._sample_from_gaussian_dist(
            bs_ph, z_dimension, zx_mean, zx_log_sig_sq)

        # GET r(x|z,y)
        qzx_samp_y = tf.concat([qzx_samp, y_ph_n], 1)
        reconstruction_xzx = autoencoder.calc_reconstruction(qzx_samp_y)
        x_mean_vae = reconstruction_xzx[0]
        x_log_sig_sq_vae = reconstruction_xzx[1]

        # COST FROM RECONSTRUCTION
        normalising_factor_x_vae = -0.5 * tf.log(SMALL_CONSTANT + tf.exp(
            x_log_sig_sq_vae)) - 0.5 * np.log(2 * np.pi)
        square_diff_between_mu_and_x_vae = tf.square(x_mean_vae - x_ph_n)
        inside_exp_x_vae = -0.5 * tf.div(
            square_diff_between_mu_and_x_vae,
            SMALL_CONSTANT + tf.exp(x_log_sig_sq_vae))
        reconstr_loss_x_vae = -tf.reduce_sum(
            normalising_factor_x_vae + inside_exp_x_vae, 1)
        cost_R_vae = tf.reduce_mean(reconstr_loss_x_vae)

        # KL(q(z|x,y)||r(z|y))
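        # closed-form per-dimension KL between two diagonal Gaussians:
        #   KL(q||r) = log(s_r) - log(s_q) + (s_q^2 + (mu_q - mu_r)^2)/(2*s_r^2) - 1/2
        # with q = q(z|x,y) (tagged #1/"aux" below) and r = r(z|y) (tagged #2/"v")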
        v_mean = zy_mean  #2
        aux_mean = zx_mean  #1
        v_log_sig_sq = tf.log(tf.exp(zy_log_sig_sq) + SMALL_CONSTANT)  #2
        aux_log_sig_sq = tf.log(tf.exp(zx_log_sig_sq) + SMALL_CONSTANT)  #1
        v_log_sig = tf.log(tf.sqrt(tf.exp(v_log_sig_sq)))  #2
        aux_log_sig = tf.log(tf.sqrt(tf.exp(aux_log_sig_sq)))  #1
        cost_VAE_a = v_log_sig - aux_log_sig + tf.divide(
            tf.exp(aux_log_sig_sq) + tf.square(aux_mean - v_mean),
            2 * tf.exp(v_log_sig_sq)) - 0.5
        cost_VAE_b = tf.reduce_sum(cost_VAE_a, 1)
        KL_vae = tf.reduce_mean(cost_VAE_b)

        # THE VICI COST FUNCTION
        lam_ph = tf.placeholder(dtype=tf.float32, name="lam_ph")
        COST_VAE = KL_vae + cost_R_vae
        COST = COST_VAE

        # VARIABLES LISTS
        var_list_VICI = [
            var for var in tf.trainable_variables()
            if var.name.startswith("VICI")
        ]

        # DEFINE OPTIMISER (using ADAM here)
        optimizer = tf.train.AdamOptimizer(params['initial_training_rate'])
        minimize = optimizer.minimize(COST, var_list=var_list_VICI)

        # DRAW FROM q(x|y)
        qx_samp = autoencoder_ENC._sample_from_gaussian_dist(
            bs_ph, xsh[1], x_mean,
            SMALL_CONSTANT + tf.log(tf.exp(x_log_sig_sq)))

        # INITIALISE AND RUN SESSION
        init = tf.global_variables_initializer()
        session.run(init)
        saver = tf.train.Saver()

    KL_PLOT = np.zeros(
        int(
            np.round(params['num_iterations'] / params['report_interval']) +
            1))  # vector to store test OELBO values
    COST_PLOT = np.zeros(
        int(
            np.round(params['num_iterations'] / params['report_interval']) +
            1))  # vector to store test VAE ELBO values

    print('Training CVAE Inference Model...')
    # START OPTIMISATION OF OELBO
    indices_generator = batch_manager.SequentialIndexer(
        params['batch_size'], xsh[0])
    ni = -1
    test_n = 100
    for i in range(params['num_iterations']):

        next_indices = indices_generator.next_indices()

        yn = y_data_train_l[next_indices, :]
        session.run(minimize,
                    feed_dict={
                        bs_ph: bs,
                        x_ph: x_data[next_indices, :],
                        y_ph: yn,
                        lam_ph: lam
                    })  # minimising cost function

        if i % params['report_interval'] == 0:
            ni = ni + 1

            ynt = y_data_train_l[0:test_n, :]
            cost_value_vae, KL_VAE = session.run([COST_VAE, KL_vae],
                                                 feed_dict={
                                                     bs_ph: test_n,
                                                     x_ph: x_data[0:test_n, :],
                                                     y_ph: ynt,
                                                     lam_ph: lam
                                                 })
            KL_PLOT[ni] = KL_VAE
            COST_PLOT[ni] = cost_value_vae

            if params['print_values']:
                print(
                    '--------------------------------------------------------------'
                )
                print('Iteration:', i)
                print('Test Set ELBO:', -cost_value_vae)
                print('KL Divergence:', KL_VAE)

        if i % params['save_interval'] == 0:

            save_path = saver.save(session, save_dir)

    return COST_PLOT, KL_PLOT
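
The KL term above (cost_VAE_a) is the closed-form KL divergence between the two
diagonal Gaussians q(z|x,y) and r(z|y). Below is a minimal NumPy cross-check of
that algebra, independent of TensorFlow; the helper name kl_diag_gaussians is
ours, not part of the example code. It also compares against the single-sample
Monte Carlo estimator of the same divergence that Examples #3 and #4 use in
graph form.

import numpy as np

def kl_diag_gaussians(mu_q, log_var_q, mu_r, log_var_r):
    # per-dimension KL(N(mu_q, e^log_var_q) || N(mu_r, e^log_var_r)), summed over dims
    per_dim = (0.5 * (log_var_r - log_var_q)
               + (np.exp(log_var_q) + (mu_q - mu_r) ** 2) / (2.0 * np.exp(log_var_r))
               - 0.5)
    return per_dim.sum(axis=1)

rng = np.random.default_rng(0)
mu_q, mu_r = rng.normal(size=(4, 3)), rng.normal(size=(4, 3))
lv_q, lv_r = rng.normal(size=(4, 3)), rng.normal(size=(4, 3))
kl = kl_diag_gaussians(mu_q, lv_q, mu_r, lv_r)
assert np.all(kl >= 0.0)  # KL is always non-negative

# Monte Carlo estimate of the same quantity: E_q[log q(z) - log r(z)]
z = mu_q + np.exp(0.5 * lv_q) * rng.standard_normal(size=(100000, 4, 3))
log_q = (-0.5 * (lv_q + np.log(2 * np.pi)) - 0.5 * (z - mu_q) ** 2 / np.exp(lv_q)).sum(axis=2)
log_r = (-0.5 * (lv_r + np.log(2 * np.pi)) - 0.5 * (z - mu_r) ** 2 / np.exp(lv_r)).sum(axis=2)
print(kl)                            # closed form
print((log_q - log_r).mean(axis=0))  # MC estimate; agrees to within MC error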
Example #2
def train(params, x_data, y_data_l, siz_high_res, load_dir, save_dir, plotter, y_data_test):
    
    y_data_train_l = y_data_l
    
    # USEFUL SIZES
    xsh = np.shape(x_data)
    yshl1 = np.shape(y_data_l)[1]
    ysh1 = siz_high_res
    
    z_dimension_fm = params['z_dimensions_fw']
    n_weights_fm = params['n_weights_fw']
    
    z_dimension = params['z_dimension']
    bs = params['batch_size']
    n_weights = params['n_weights']
    lam = 1
    
    graph = tf.Graph()
    session = tf.Session(graph=graph)
    with graph.as_default():
        tf.set_random_seed(np.random.randint(0,10))
        SMALL_CONSTANT = 1e-6
        
        # PLACE HOLDERS
        x_ph = tf.placeholder(dtype=tf.float32, shape=[None, xsh[1]], name="x_ph")
        bs_ph = tf.placeholder(dtype=tf.int64, name="bs_ph") # batch size placeholder
        yt_ph = tf.placeholder(dtype=tf.float32, shape=[None, ysh1], name="yt_ph")
        
        # LOAD FORWARD MODEL NEURAL NETWORKS
        DEC_XYlZtoYh = OELBO_decoder_difference.VariationalAutoencoder("OELBO_decoder", ysh1, z_dimension_fm+yshl1+xsh[1], n_weights_fm) # p(Yh|X,Yl,Z)
        ENC_XYltoZ = OELBO_encoder.VariationalAutoencoder("OELBO_encoder", yshl1+xsh[1], z_dimension_fm, n_weights_fm) # p(Z|X,Yl)
        ENC_XYhYltoZ = VAE_encoder.VariationalAutoencoder("vae_encoder", xsh[1]+ysh1+yshl1, z_dimension_fm, n_weights_fm) # q(Z|X,Yl,Yh)
        
        # LOAD VICI NEURAL NETWORKS
        autoencoder = VICI_decoder.VariationalAutoencoder("VICI_decoder", xsh[1], z_dimension+ysh1, n_weights)
        autoencoder_ENC = VICI_encoder.VariationalAutoencoder("VICI_encoder", ysh1, z_dimension, n_weights)
        autoencoder_VAE = VICI_VAE_encoder.VariationalAutoencoder("VICI_VAE_encoder", xsh[1]+ysh1, z_dimension, n_weights)
        
        # DEFINE MULTI-FIDELITY FORWARD MODEL
        #####################################################################################################################
        
        # NORMALISE INPUTS
        yl_ph_n = tf_normalise_dataset(yt_ph)
        x_ph_n = tf_normalise_dataset(x_ph)
        #yl_ph_n = yt_ph
        #x_ph_n = x_ph
        
        # GET p(Z|X,Yl)
        zxyl_mean,zxyl_log_sig_sq = ENC_XYltoZ._calc_z_mean_and_sigma(tf.concat([x_ph_n,yl_ph_n],1))
        rxyl_samp = ENC_XYhYltoZ._sample_from_gaussian_dist(tf.shape(x_ph_n)[0], z_dimension_fm, zxyl_mean, tf.log(tf.exp(zxyl_log_sig_sq)+SMALL_CONSTANT))
        
        # GET p(Yh|X,Yl,Z) FROM SAMPLES Z ~ p(Z|X,Yl)
        reconstruction_yh = DEC_XYlZtoYh.calc_reconstruction(tf.concat([x_ph_n,yl_ph_n,rxyl_samp],1))
        yh_diff = reconstruction_yh[0]
        yh_mean = yl_ph_n+yh_diff
        yh_log_sig_sq = reconstruction_yh[1]
        y = ENC_XYhYltoZ._sample_from_gaussian_dist(tf.shape(yt_ph)[0], tf.shape(yt_ph)[1], yh_mean, yh_log_sig_sq)
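        # NOTE: y is a synthetic high-resolution signal drawn from the pretrained
        # forward model p(Yh|X,Yl); the training loop below evaluates y first and
        # then feeds it back in as the inference target y_ph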
        
        # DRAW SYNTHETIC Ys TRAINING DATA
#        _, y = OM.forward_model(x_ph_amp,x_ph_ph)
        
        ##########################################################################################################################################
        
        # GET r(z|y)
        y_ph = tf.placeholder(dtype=tf.float32, shape=[None, ysh1], name="y_ph")
        y_ph_n = tf_normalise_dataset(y_ph)
        #y_ph_n = y_ph
        zy_mean,zy_log_sig_sq = autoencoder_ENC._calc_z_mean_and_sigma(y_ph_n)
        
        # DRAW FROM r(z|y)
        rzy_samp = autoencoder_VAE._sample_from_gaussian_dist(bs_ph, z_dimension, zy_mean, zy_log_sig_sq)
        
        # GET r(x|z,y) from r(z|y) samples
        rzy_samp_y = tf.concat([rzy_samp,y_ph_n],1)
        reconstruction_xzy = autoencoder.calc_reconstruction(rzy_samp_y)
        x_mean = reconstruction_xzy[0]
        x_log_sig_sq = reconstruction_xzy[1]
        
        # KL(r(z|y)||p(z))
        latent_loss = -0.5 * tf.reduce_sum(1 + zy_log_sig_sq - tf.square(zy_mean) - tf.exp(zy_log_sig_sq), 1)
        KL = tf.reduce_mean(latent_loss)
        
        # GET q(z|x,y)
        xy_ph = tf.concat([x_ph_n,y_ph_n],1)
        zx_mean,zx_log_sig_sq = autoencoder_VAE._calc_z_mean_and_sigma(xy_ph)
        
        # DRAW FROM q(z|x,y)
        qzx_samp = autoencoder_VAE._sample_from_gaussian_dist(bs_ph, z_dimension, zx_mean, zx_log_sig_sq)
        
        # GET r(x|z,y)
        qzx_samp_y = tf.concat([qzx_samp,y_ph_n],1)
        reconstruction_xzx = autoencoder.calc_reconstruction(qzx_samp_y)
        x_mean_vae = reconstruction_xzx[0]
        x_log_sig_sq_vae = reconstruction_xzx[1]
        
        # COST FROM RECONSTRUCTION
        normalising_factor_x_vae = - 0.5 * tf.log(SMALL_CONSTANT+tf.exp(x_log_sig_sq_vae)) - 0.5 * np.log(2 * np.pi)
        square_diff_between_mu_and_x_vae = tf.square(x_mean_vae - x_ph_n)
        inside_exp_x_vae = -0.5 * tf.div(square_diff_between_mu_and_x_vae,SMALL_CONSTANT+tf.exp(x_log_sig_sq_vae))
        reconstr_loss_x_vae = -tf.reduce_sum(normalising_factor_x_vae + inside_exp_x_vae, 1)
        cost_R_vae = tf.reduce_mean(reconstr_loss_x_vae)
        
        # KL(q(z|x,y)||r(z|y))
        v_mean = zy_mean #2
        aux_mean = zx_mean #1
        v_log_sig_sq = tf.log(tf.exp(zy_log_sig_sq)+SMALL_CONSTANT) #2
        aux_log_sig_sq = tf.log(tf.exp(zx_log_sig_sq)+SMALL_CONSTANT) #1
        v_log_sig = tf.log(tf.sqrt(tf.exp(v_log_sig_sq))) #2
        aux_log_sig = tf.log(tf.sqrt(tf.exp(aux_log_sig_sq))) #1
        cost_VAE_a = v_log_sig-aux_log_sig+tf.divide(tf.exp(aux_log_sig_sq)+tf.square(aux_mean-v_mean),2*tf.exp(v_log_sig_sq))-0.5
        cost_VAE_b = tf.reduce_sum(cost_VAE_a,1)
        KL_vae = tf.reduce_mean(cost_VAE_b)
        
        # THE VICI COST FUNCTION
        lam_ph = tf.placeholder(dtype=tf.float32, name="lam_ph")
        COST_VAE = KL_vae+cost_R_vae
        COST = COST_VAE
        
        # VARIABLES LISTS
        var_list_VICI = [var for var in tf.trainable_variables() if var.name.startswith("VICI")]
        var_list_ELBO = [var for var in tf.trainable_variables() if var.name.startswith(("OELBO", "vae"))] # forward-model variables (OELBO_* and vae_encoder) restored from load_dir below
        
        # DEFINE OPTIMISER (using ADAM here)
        optimizer = tf.train.AdamOptimizer(params['initial_training_rate']) 
        minimize = optimizer.minimize(COST,var_list = var_list_VICI)
        
        # DRAW FROM q(x|y)
        qx_samp = autoencoder_ENC._sample_from_gaussian_dist(bs_ph, xsh[1], x_mean, SMALL_CONSTANT + tf.log(tf.exp(x_log_sig_sq)))
        
        # INITIALISE AND RUN SESSION
        init = tf.global_variables_initializer()
        session.run(init)
        saver_ELBO = tf.train.Saver(var_list_ELBO)
        saver_ELBO.restore(session,load_dir)
        saver = tf.train.Saver()
    
    KL_PLOT = np.zeros(int(np.round(params['num_iterations']/params['report_interval'])+1)) # vector to store test OELBO values
    COST_PLOT = np.zeros(int(np.round(params['num_iterations']/params['report_interval'])+1)) # vector to store test VAE ELBO values
    
    print('Training Inference Model...')    
    # START OPTIMISATION OF OELBO
    indices_generator = batch_manager.SequentialIndexer(params['batch_size'], xsh[0])
    ni = -1
    test_n = 100
    for i in range(params['num_iterations']):
        
        next_indices = indices_generator.next_indices()
        
        yn = session.run(y, feed_dict={bs_ph:bs, x_ph:x_data[next_indices, :], yt_ph:y_data_train_l[next_indices, :]})
        session.run(minimize, feed_dict={bs_ph:bs, x_ph:x_data[next_indices, :],  y_ph:yn, lam_ph:lam, yt_ph:y_data_train_l[next_indices, :]}) # minimising cost function
        
        if i % params['report_interval'] == 0:
            ni = ni+1

            ynt = session.run(y, feed_dict={bs_ph:test_n, x_ph:x_data[0:test_n,:], yt_ph:y_data_train_l[0:test_n,:]})
            cost_value_vae, KL_VAE = session.run([COST_VAE, KL_vae], feed_dict={bs_ph:test_n, x_ph:x_data[0:test_n,:], y_ph:ynt, lam_ph:lam, yt_ph:y_data_train_l[0:test_n,:]})
            KL_PLOT[ni] = KL_VAE
            COST_PLOT[ni] = cost_value_vae

            if params['print_values']:
                print('--------------------------------------------------------------')
                print('Iteration:',i)
                print('Training Set ELBO:',-cost_value_vae)
                print('KL Divergence:',KL_VAE)
       
        if i % params['save_interval'] == 0:

            """
            # ESTIMATE TEST SET RECONSTRUCTION PER-PIXEL APPROXIMATE MARGINAL LIKELIHOOD and draw from q(x|y)
            n_ex_s = params['n_samples'] # number of samples to save per reconstruction
            ns = np.maximum(100,n_ex_s) # number of samples to use to estimate per-pixel marginal

            XM = np.zeros((np.shape(y_data_test)[0],params['ndim_x'],ns))
            XSX = np.zeros((np.shape(y_data_test)[0],params['ndim_x'],ns))
            XSA = np.zeros((np.shape(y_data_test)[0],params['ndim_x'],ns))

            for i in range(ns):
                rec_x_m = session.run(x_mean,feed_dict={y_ph:y_data_test})
                rec_x_mx = session.run(qx_samp,feed_dict={y_ph:y_data_test})
                rec_x_s = session.run(x_mean,feed_dict={y_ph:y_data_test})
                XM[:,:,i] = rec_x_m
                XSX[:,:,i] = rec_x_mx
                XSA[:,:,i] = rec_x_s

            pmax = session.run(x_pmax,feed_dict={y_ph:y_data_test})

            xm = np.mean(XM,axis=2)
            xsx = np.std(XSX,axis=2)
            xs = np.std(XM,axis=2)
            #XS = XSX[:,:,0:n_ex_s]
            XS = XSA[:,:,0:n_ex_s]
                
            # Generate overlap scatter plots
            plotter.rev_x = XS
            plotter.make_overlap_plot()
            """

            # Save model 
            save_path = saver.save(session,save_dir)
                
                
    return COST_PLOT, KL_PLOT
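
The _sample_from_gaussian_dist method used throughout these examples is not
shown here. A minimal NumPy sketch of the reparameterised sampler it is assumed
to implement follows (the function name and signature are ours): draws are
written as the mean plus exp(0.5*log_var)-scaled standard normal noise, which
is what keeps samples differentiable with respect to the encoder outputs.

import numpy as np

def sample_from_gaussian_dist(batch_size, dim, mean, log_var, rng=None):
    # z = mu + sigma * eps, with eps ~ N(0, I) and sigma = exp(0.5 * log_var)
    rng = rng or np.random.default_rng()
    eps = rng.standard_normal((batch_size, dim))
    return mean + np.exp(0.5 * log_var) * eps

mu = np.zeros((8, 2))
log_var = np.log(np.full((8, 2), 0.25))
z = sample_from_gaussian_dist(8, 2, mu, log_var)
print(z.shape)  # (8, 2); each row is a draw from N(0, 0.25 * I)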
Example #3
def train(params, x_data, y_data, x_data_test, y_data_test, y_data_test_noisefree, y_normscale, save_dir, truth_test, bounds, fixed_vals, posterior_truth_test,snrs_test=None):    

    # if True, do multi-modal
    multi_modal = True

    # USEFUL SIZES
    xsh = np.shape(x_data)
   
    ysh = np.shape(y_data)[1]
    n_convsteps = params['n_convsteps']
    z_dimension = params['z_dimension']
    bs = params['batch_size']
    n_weights_r1 = params['n_weights_r1']
    n_weights_r2 = params['n_weights_r2']
    n_weights_q = params['n_weights_q']
    n_modes = params['n_modes']
    n_hlayers_r1 = len(params['n_weights_r1'])
    n_hlayers_r2 = len(params['n_weights_r2'])
    n_hlayers_q = len(params['n_weights_q'])
    n_conv_r1 = len(params['n_filters_r1'])
    n_conv_r2 = len(params['n_filters_r2'])
    n_conv_q = len(params['n_filters_q'])
    n_filters_r1 = params['n_filters_r1']
    n_filters_r2 = params['n_filters_r2']
    n_filters_q = params['n_filters_q']
    filter_size_r1 = params['filter_size_r1']
    filter_size_r2 = params['filter_size_r2']
    filter_size_q = params['filter_size_q']
    maxpool_r1 = params['maxpool_r1']
    maxpool_r2 = params['maxpool_r2']
    maxpool_q = params['maxpool_q']
    conv_strides_r1 = params['conv_strides_r1']
    conv_strides_r2 = params['conv_strides_r2']
    conv_strides_q = params['conv_strides_q']
    pool_strides_r1 = params['pool_strides_r1']
    pool_strides_r2 = params['pool_strides_r2']
    pool_strides_q = params['pool_strides_q']
    batch_norm = params['batch_norm']
    red = params['reduce']
    if n_convsteps is not None:
        ysh_conv_r1 = int(ysh*n_filters_r1/2**n_convsteps) if red else int(ysh/2**n_convsteps)
        ysh_conv_r2 = int(ysh*n_filters_r2/2**n_convsteps) if red else int(ysh/2**n_convsteps)
        ysh_conv_q = int(ysh*n_filters_q/2**n_convsteps) if red else int(ysh/2**n_convsteps)
    else:
        ysh_conv_r1 = int(ysh)
        ysh_conv_r2 = int(ysh)
        ysh_conv_q = int(ysh)
    drate = params['drate']
    ramp_start = params['ramp_start']
    ramp_end = params['ramp_end']
    num_det = len(fixed_vals['det'])


    # identify the indices of different sets of physical parameters
    vonmise_mask, vonmise_idx_mask, vonmise_len = get_param_index(params['inf_pars'],params['vonmise_pars'])
    gauss_mask, gauss_idx_mask, gauss_len = get_param_index(params['inf_pars'],params['gauss_pars'])
    sky_mask, sky_idx_mask, sky_len = get_param_index(params['inf_pars'],params['sky_pars'])
    ra_mask, ra_idx_mask, ra_len = get_param_index(params['inf_pars'],['ra'])
    dec_mask, dec_idx_mask, dec_len = get_param_index(params['inf_pars'],['dec'])
    m1_mask, m1_idx_mask, m1_len = get_param_index(params['inf_pars'],['mass_1'])
    m2_mask, m2_idx_mask, m2_len = get_param_index(params['inf_pars'],['mass_2'])
    idx_mask = np.argsort(gauss_idx_mask + vonmise_idx_mask + m1_idx_mask + m2_idx_mask + sky_idx_mask) # + dist_idx_mask)

    graph = tf.Graph()
    session = tf.Session(graph=graph)
    with graph.as_default():

        # PLACE HOLDERS
        bs_ph = tf.placeholder(dtype=tf.int64, name="bs_ph")                       # batch size placeholder
        x_ph = tf.placeholder(dtype=tf.float32, shape=[None, xsh[1]], name="x_ph") # params placeholder
        y_ph = tf.placeholder(dtype=tf.float32, shape=[None, params['ndata'], num_det], name="y_ph")
        ramp = tf.placeholder(dtype=tf.float32)    # the ramp to slowly increase the KL contribution

        # LOAD VICI NEURAL NETWORKS
        r1_zy = VICI_encoder.VariationalAutoencoder('VICI_encoder', n_input=params['ndata'], n_output=z_dimension, n_channels=num_det, n_weights=n_weights_r1,   # generates params for r1(z|y)
                                                    n_modes=n_modes, drate=drate, n_filters=n_filters_r1, 
                                                    filter_size=filter_size_r1, maxpool=maxpool_r1)
        r2_xzy = VICI_decoder.VariationalAutoencoder('VICI_decoder', vonmise_mask, gauss_mask, m1_mask, m2_mask, sky_mask, n_input1=z_dimension, 
                                                     n_input2=params['ndata'], n_output=xsh[1], n_channels=num_det, n_weights=n_weights_r2, 
                                                     drate=drate, n_filters=n_filters_r2, 
                                                     filter_size=filter_size_r2, maxpool=maxpool_r2)
        q_zxy = VICI_VAE_encoder.VariationalAutoencoder('VICI_VAE_encoder', n_input1=xsh[1], n_input2=params['ndata'], n_output=z_dimension, 
                                                     n_channels=num_det, n_weights=n_weights_q, drate=drate, 
                                                     n_filters=n_filters_q, filter_size=filter_size_q, maxpool=maxpool_q) 
        tf.set_random_seed(np.random.randint(0,10))

        # pass the y data through unchanged (no size reduction applied here)
        y_conv = y_ph

        # GET r1(z|y)
        # run inverse autoencoder to generate mean and logvar of z given y data - these are the parameters for r1(z|y)
        r1_loc, r1_scale, r1_weight = r1_zy._calc_z_mean_and_sigma(y_conv)
        temp_var_r1 = SMALL_CONSTANT + tf.exp(r1_scale)
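        # NOTE: SMALL_CONSTANT (and GAUSS_RANGE further down) are assumed to be
        # module-level constants (e.g. SMALL_CONSTANT = 1e-6 as in Examples #1
        # and #2); unlike those examples they are not defined inside this function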

        
        # define the r1(z|y) mixture model
        bimix_gauss = tfd.MixtureSameFamily(
                          mixture_distribution=tfd.Categorical(logits=r1_weight),
                          components_distribution=tfd.MultivariateNormalDiag(
                          loc=r1_loc,
                          scale_diag=tf.sqrt(temp_var_r1)))


        # DRAW FROM r1(z|y) - given the Gaussian parameters generate z samples
        r1_zy_samp = bimix_gauss.sample()        
        
        # GET q(z|x,y)
        q_zxy_mean, q_zxy_log_sig_sq = q_zxy._calc_z_mean_and_sigma(x_ph,y_conv)

        # DRAW FROM q(z|x,y)
        temp_var_q = SMALL_CONSTANT + tf.exp(q_zxy_log_sig_sq)
        mvn_q = tfp.distributions.MultivariateNormalDiag(
                          loc=q_zxy_mean,
                          scale_diag=tf.sqrt(temp_var_q))
        q_zxy_samp = mvn_q.sample()  
       
        # GET r2(x|z,y)
        eps = tf.random.normal([bs_ph, params['ndata'], num_det], 0, 1., dtype=tf.float32)
        y_ph_ramp = tf.add(tf.multiply(ramp,y_conv), tf.multiply((1.0-ramp), eps))
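        # y_ph_ramp fades from pure N(0,1) noise (ramp=0) to the real data
        # (ramp=1), so the decoder's conditioning signal is annealed in step
        # with the KL weight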
        reconstruction_xzy = r2_xzy.calc_reconstruction(q_zxy_samp,y_ph_ramp)

        # ugly but required for now - unpack the r2 output params
        r2_xzy_mean_gauss = reconstruction_xzy[0]           # truncated gaussian mean
        r2_xzy_log_sig_sq_gauss = reconstruction_xzy[1]     # truncated gaussian log var
        r2_xzy_mean_vonmise = reconstruction_xzy[2]         # vonmises means
        r2_xzy_log_sig_sq_vonmise = reconstruction_xzy[3]   # vonmises log var
        r2_xzy_mean_m1 = reconstruction_xzy[4]              # m1 mean
        r2_xzy_log_sig_sq_m1 = reconstruction_xzy[5]        # m1 var
        r2_xzy_mean_m2 = reconstruction_xzy[6]              # m2 mean (m2 will be conditional on m1)
        r2_xzy_log_sig_sq_m2 = reconstruction_xzy[7]        # m2 log var (m2 will be conditional on m1)
        r2_xzy_mean_sky = reconstruction_xzy[8]             # sky mean unit vector (3D)
        r2_xzy_log_sig_sq_sky = reconstruction_xzy[9]       # sky log var (1D)

        # COST FROM RECONSTRUCTION - the masses
        # this sets up a joint distribution on m1 and m2 with m2 being conditional on m1
        # the ramp evolves the truncation boundaries from far away to 0->1 for m1 and 0->m1 for m2
        if m1_len>0 and m2_len>0:
            temp_var_r2_m1 = SMALL_CONSTANT + tf.exp(r2_xzy_log_sig_sq_m1)     # the safe r2 variance
            temp_var_r2_m2 = SMALL_CONSTANT + tf.exp(r2_xzy_log_sig_sq_m2)
            joint = tfd.JointDistributionSequential([    # shrink the truncation with the ramp
                       tfd.Independent(tfd.TruncatedNormal(r2_xzy_mean_m1,tf.sqrt(temp_var_r2_m1),-GAUSS_RANGE*(1.0-ramp),GAUSS_RANGE*(1.0-ramp) + 1.0),reinterpreted_batch_ndims=0),  # m1
                lambda b0: tfd.Independent(tfd.TruncatedNormal(r2_xzy_mean_m2,tf.sqrt(temp_var_r2_m2),-GAUSS_RANGE*(1.0-ramp),GAUSS_RANGE*(1.0-ramp) + ramp*b0),reinterpreted_batch_ndims=0)],    # m2
            )
            reconstr_loss_masses = joint.log_prob((tf.boolean_mask(x_ph,m1_mask,axis=1),tf.boolean_mask(x_ph,m2_mask,axis=1)))

        # COST FROM RECONSTRUCTION - Truncated Gaussian parts
        # this sets up a loop over uncorrelated truncated Gaussians
        # the ramp evolves the boundaries from far away to 0->1 
        if gauss_len>0:
            temp_var_r2_gauss = SMALL_CONSTANT + tf.exp(r2_xzy_log_sig_sq_gauss)
            gauss_x = tf.boolean_mask(x_ph,gauss_mask,axis=1)
            @tf.function
            def truncnorm(i,lp):    # we set up a function that adds the log-likelihoods and also increments the counter
                loc = tf.slice(r2_xzy_mean_gauss,[0,i],[-1,1])
                std = tf.sqrt(tf.slice(temp_var_r2_gauss,[0,i],[-1,1]))
                pos = tf.slice(gauss_x,[0,i],[-1,1])  
                tn = tfd.TruncatedNormal(loc,std,-GAUSS_RANGE*(1.0-ramp),GAUSS_RANGE*(1.0-ramp) + 1.0)   # shrink the truncation with the ramp
                return [i+1, lp + tn.log_prob(pos)]
            # we do the loop until we've hit all the truncated gaussian parameters - i starts at 0 and the logprob starts at 0 
            _,reconstr_loss_gauss = tf.while_loop(lambda i,reconstr_loss_gauss: i<gauss_len, truncnorm, [0,tf.zeros([bs_ph],dtype=tf.dtypes.float32)])

        # COST FROM RECONSTRUCTION - Von Mises parts for single parameters that wrap over 2pi
        if vonmise_len>0:
            temp_var_r2_vonmise = SMALL_CONSTANT + tf.exp(r2_xzy_log_sig_sq_vonmise)
            con = tf.reshape(tf.math.reciprocal(temp_var_r2_vonmise),[-1,vonmise_len])   # modelling wrapped scale output as log variance - convert to concentration
            von_mises = tfp.distributions.VonMises(
                          loc=2.0*np.pi*(tf.reshape(r2_xzy_mean_vonmise,[-1,vonmise_len])-0.5),   # remap 0>1 mean onto -pi->pi range
                          concentration=con)
            reconstr_loss_vonmise = von_mises.log_prob(2.0*np.pi*(tf.reshape(tf.boolean_mask(x_ph,vonmise_mask,axis=1),[-1,vonmise_len]) - 0.5))   # 2pi is the von mises input range
            
            reconstr_loss_vonmise = reconstr_loss_vonmise[:,0] + reconstr_loss_vonmise[:,1]

            # computing Gaussian likelihood for von mises parameters to be faded away with the ramp
            gauss_vonmises = tfp.distributions.MultivariateNormalDiag(
                         loc=r2_xzy_mean_vonmise,
                         scale_diag=tf.sqrt(temp_var_r2_vonmise))
            reconstr_loss_gauss_vonmise = gauss_vonmises.log_prob(tf.boolean_mask(x_ph,vonmise_mask,axis=1))        
            reconstr_loss_vonmise = ramp*reconstr_loss_vonmise + (1.0-ramp)*reconstr_loss_gauss_vonmise    # start with a Gaussian model and fade in the true vonmises
        else:
            reconstr_loss_vonmise = 0.0

        # COST FROM RECONSTRUCTION - Von Mises Fisher (sky) parts
        if sky_len>0:
            temp_var_r2_sky = SMALL_CONSTANT + tf.exp(r2_xzy_log_sig_sq_sky)
            con = tf.reshape(tf.math.reciprocal(temp_var_r2_sky),[bs_ph])   # modelling wrapped scale output as log variance - only 1 concentration parameter for all sky
            loc_xyz = tf.math.l2_normalize(tf.reshape(r2_xzy_mean_sky,[-1,3]),axis=1)    # take the 3 output mean params from r2 and normalse so they are a unit vector
            von_mises_fisher = tfp.distributions.VonMisesFisher(
                          mean_direction=loc_xyz,
                          concentration=con)
            ra_sky = 2.0*np.pi*tf.reshape(tf.boolean_mask(x_ph,ra_mask,axis=1),[-1,1])       # convert the scaled 0->1 true RA value back to radians
            dec_sky = np.pi*(tf.reshape(tf.boolean_mask(x_ph,dec_mask,axis=1),[-1,1]) - 0.5) # convert the scaled 0>1 true dec value back to radians
            xyz_unit = tf.reshape(tf.concat([tf.cos(ra_sky)*tf.cos(dec_sky),tf.sin(ra_sky)*tf.cos(dec_sky),tf.sin(dec_sky)],axis=1),[-1,3])   # construct the true parameter unit vector
            reconstr_loss_sky = von_mises_fisher.log_prob(tf.math.l2_normalize(xyz_unit,axis=1))   # normalise it for safety (should already be normalised) and compute the logprob

            # computing Gaussian likelihood for von mises Fisher (sky) parameters to be faded away with the ramp
            mean_ra = tf.math.floormod(tf.atan2(tf.slice(loc_xyz,[0,1],[-1,1]),tf.slice(loc_xyz,[0,0],[-1,1])),2.0*np.pi)/(2.0*np.pi)    # convert the unit vector to scaled 0->1 RA 
            mean_dec = (tf.asin(tf.slice(loc_xyz,[0,2],[-1,1])) + 0.5*np.pi)/np.pi        # convert the unit vector to scaled 0->1 dec
            mean_sky = tf.reshape(tf.concat([mean_ra,mean_dec],axis=1),[bs_ph,2])        # package up the scaled RA and dec 
            gauss_sky = tfp.distributions.MultivariateNormalDiag(
                         loc=mean_sky,
                         scale_diag=tf.concat([tf.sqrt(temp_var_r2_sky),tf.sqrt(temp_var_r2_sky)],axis=1))   # use the same 1D concentration parameter for both RA and dec dimensions
            reconstr_loss_gauss_sky = gauss_sky.log_prob(tf.boolean_mask(x_ph,sky_mask,axis=1))     # compute the logprob at the true sky location
            reconstr_loss_sky = ramp*reconstr_loss_sky + (1.0-ramp)*reconstr_loss_gauss_sky   # start with a Gaussian model and fade in the true vonmises Fisher

        cost_R = -1.0*tf.reduce_mean(reconstr_loss_gauss + reconstr_loss_vonmise + reconstr_loss_masses + reconstr_loss_sky)
        r2_xzy_mean = tf.gather(tf.concat([r2_xzy_mean_gauss,r2_xzy_mean_vonmise,r2_xzy_mean_m1,r2_xzy_mean_m2,r2_xzy_mean_sky],axis=1),tf.constant(idx_mask),axis=1)      # put the elements back in order
        r2_xzy_scale = tf.gather(tf.concat([r2_xzy_log_sig_sq_gauss,r2_xzy_log_sig_sq_vonmise,r2_xzy_log_sig_sq_m1,r2_xzy_log_sig_sq_m2,r2_xzy_log_sig_sq_sky],axis=1),tf.constant(idx_mask),axis=1)   # put the elements back in order
        
        log_q_q = mvn_q.log_prob(q_zxy_samp)
        log_r1_q = bimix_gauss.log_prob(q_zxy_samp)   # evaluate the log prob of r1 at the q samples
        KL = tf.reduce_mean(log_q_q - log_r1_q)      # average over batch
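        # single-sample Monte Carlo estimate of KL(q(z|x,y)||r1(z|y)); no closed
        # form exists here because r1 is a Gaussian mixture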

        # THE VICI COST FUNCTION
        COST = cost_R + ramp*KL  # + L1_weight_reg (disabled)

        # VARIABLES LISTS
        var_list_VICI = [var for var in tf.trainable_variables() if var.name.startswith("VICI")]
        
        # DEFINE OPTIMISER (using ADAM here)
        optimizer = tf.train.AdamOptimizer(params['initial_training_rate']) 
#        optimizer = tf.train.RMSPropOptimizer(params['initial_training_rate'])
        minimize = optimizer.minimize(COST,var_list = var_list_VICI)
        
        # INITIALISE AND RUN SESSION
        init = tf.global_variables_initializer()
        session.run(init)
        saver = tf.train.Saver()

    print('Training Inference Model...')    
    # START OPTIMISATION OF OELBO
    indices_generator = batch_manager.SequentialIndexer(params['batch_size'], xsh[0])
    plotdata = []

    load_chunk_it = 1
    for i in range(params['num_iterations']):

        next_indices = indices_generator.next_indices()

        # if load chunks true, load in data by chunks
        if params['load_by_chunks'] and i == int(params['load_iteration']*load_chunk_it):
            x_data, y_data = load_chunk(params['train_set_dir'],params['inf_pars'],params,bounds,fixed_vals)
            load_chunk_it += 1

        # Make noise realizations and add to training data
        next_x_data = x_data[next_indices,:]
        if params['reduce'] or n_conv_r1 is not None:
            next_y_data = y_data[next_indices,:] + np.random.normal(0,1,size=(params['batch_size'],int(params['ndata']),len(fixed_vals['det'])))
        else:
            next_y_data = y_data[next_indices,:] + np.random.normal(0,1,size=(params['batch_size'],int(params['ndata']*len(fixed_vals['det']))))
        next_y_data /= y_normscale  # required for fast convergence

        if not params['by_channel']:
            next_y_data_new = [] 
            for sig in next_y_data:
                next_y_data_new.append(sig.T)
            next_y_data = np.array(next_y_data_new)
            del next_y_data_new
      
        # restore session if wanted
        if params['resume_training'] and i == 0:
            print(save_dir)
            saver.restore(session, save_dir)

        # compute the ramp value
        rmp = 0.0
        if params['ramp']:
            if i > ramp_start:
                rmp = (np.log10(float(i)) - np.log10(ramp_start))/(np.log10(ramp_end) - np.log10(ramp_start))
            if i > ramp_end:
                rmp = 1.0
        else:
            rmp = 1.0

        # train the network 
        session.run(minimize, feed_dict={bs_ph:bs, x_ph:next_x_data, y_ph:next_y_data, ramp:rmp}) 
 
        # if we are in a report iteration extract cost function values
        if i % params['report_interval'] == 0 and i > 0:

            # get training loss
            cost, kl, AB_batch = session.run([cost_R, KL, r1_weight], feed_dict={bs_ph:bs, x_ph:next_x_data, y_ph:next_y_data, ramp:rmp})

            # get validation loss on test set
            cost_val, kl_val = session.run([cost_R, KL], feed_dict={bs_ph:y_data_test.shape[0], x_ph:x_data_test, y_ph:y_data_test/y_normscale, ramp:rmp})
            plotdata.append([cost,kl,cost+kl,cost_val,kl_val,cost_val+kl_val])

           
            try:
                # Make loss plot
                plt.figure()
                xvec = params['report_interval']*np.arange(np.array(plotdata).shape[0])
                plt.semilogx(xvec,np.array(plotdata)[:,0],label='recon',color='blue',alpha=0.5)
                plt.semilogx(xvec,np.array(plotdata)[:,1],label='KL',color='orange',alpha=0.5)
                plt.semilogx(xvec,np.array(plotdata)[:,2],label='total',color='green',alpha=0.5)
                plt.semilogx(xvec,np.array(plotdata)[:,3],label='recon_val',color='blue',linestyle='dotted')
                plt.semilogx(xvec,np.array(plotdata)[:,4],label='KL_val',color='orange',linestyle='dotted')
                plt.semilogx(xvec,np.array(plotdata)[:,5],label='total_val',color='green',linestyle='dotted')
                plt.ylim([-25,15])
                plt.xlabel('iteration')
                plt.ylabel('cost')
                plt.legend()
                plt.savefig('%s/latest_%s/cost_%s.png' % (params['plot_dir'],params['run_label'],params['run_label']))
                plt.ylim([np.min(np.array(plotdata)[-int(0.9*np.array(plotdata).shape[0]):,0]), np.max(np.array(plotdata)[-int(0.9*np.array(plotdata).shape[0]):,1])])
                plt.savefig('%s/latest_%s/cost_zoom_%s.png' % (params['plot_dir'],params['run_label'],params['run_label']))
                plt.close('all')
                
            except Exception:
                pass

            if params['print_values']:
                print('--------------------------------------------------------------')
                print('Iteration:',i)
                print('Training -ELBO:',cost)
                print('Validation -ELBO:',cost_val)
                print('Training KL Divergence:',kl)
                print('Validation KL Divergence:',kl_val)
                print('Training Total cost:',kl + cost) 
                print('Validation Total cost:',kl_val + cost_val)
                print()

                # terminate training if vanishing gradient
                if np.isnan(kl+cost) or np.isnan(kl_val+cost_val) or kl+cost > 1e5:
                    print('Network is returning NaN values')
                    print('Terminating network training')
                    if params['hyperparam_optim']:
                        save_path = saver.save(session,save_dir)
                        return 5000.0, session, saver, save_dir
                    else:
                        exit()
                try:
                    # Save loss plot data
                    np.savetxt(save_dir.split('/')[0] + '/loss_data.txt', np.array(plotdata))
                except FileNotFoundError as err:
                    print(err)

        if i % params['save_interval'] == 0 and i > 0:

            if not params['hyperparam_optim']:
                # Save model
                save_path = saver.save(session,save_dir)


        # stop hyperparam optim training it and return KL divergence as figure of merit
        if params['hyperparam_optim'] and i == params['hyperparam_optim_stop']:
            save_path = saver.save(session,save_dir)

            return np.array(plotdata)[-1,2], session, saver, save_dir

        if i % params['plot_interval'] == 0 and i>0:

            n_mode_weight_copy = 100 # must be a multiple of 50
            # just run the network on the test data
            for j in range(params['r']*params['r']):

                # The trained inverse model weights can then be used to infer a probability density of solutions given new measurements
                if params['reduce'] or params['n_filters_r1'] is not None:
                    XS, dt, _  = VICI_inverse_model.run(params, y_data_test[j].reshape([1,y_data_test.shape[1],y_data_test.shape[2]]), np.shape(x_data_test)[1],
                                                 y_normscale, 
                                                 "inverse_model_dir_%s/inverse_model.ckpt" % params['run_label'])
                else:
                    XS, dt, _  = VICI_inverse_model.run(params, y_data_test[j].reshape([1,-1]), np.shape(x_data_test)[1],
                                                 y_normscale, 
                                                 "inverse_model_dir_%s/inverse_model.ckpt" % params['run_label'])
                print('Runtime to generate {} samples = {} sec'.format(params['n_samples'],dt))            
                # Make corner plots
                # Get corner parnames to use in plotting labels
                parnames = []
                for k_idx,k in enumerate(params['rand_pars']):
                    if np.isin(k, params['inf_pars']):
                        parnames.append(params['cornercorner_parnames'][k_idx])

                defaults_kwargs = dict(
                    bins=50, smooth=0.9, label_kwargs=dict(fontsize=16),
                    title_kwargs=dict(fontsize=16),
                    truth_color='tab:orange', quantiles=[0.16, 0.84],
                    levels=(0.68,0.90,0.95), density=True,
                    plot_density=False, plot_datapoints=True,
                    max_n_ticks=3)

                figure = corner.corner(posterior_truth_test[j], **defaults_kwargs,labels=parnames,
                       color='tab:blue',truths=x_data_test[j,:],
                       show_titles=True)
                # compute weights, otherwise the 1d histograms will be different scales, could remove this
                corner.corner(XS,**defaults_kwargs,labels=parnames,
                       color='tab:red',
                       fill_contours=True,
                       show_titles=True, fig=figure)


                plt.savefig('%s/corner_plot_%s_%d-%d.png' % (params['plot_dir'],params['run_label'],i,j))
                plt.savefig('%s/latest_%s/corner_plot_%s_%d.png' % (params['plot_dir'],params['run_label'],params['run_label'],j))
                plt.close('all')
                print('Made corner plot %d' % j)

    return            
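
The KL ramp computed inline in the training loop above anneals the weight of
the KL term linearly in log10(iteration). A standalone, runnable version of
that logic follows (the helper name kl_ramp is ours); Example #4 builds the
same schedule as a TensorFlow op on the iteration placeholder idx.

import numpy as np

def kl_ramp(i, ramp_start, ramp_end, use_ramp=True):
    # 0 before ramp_start, 1 after ramp_end, linear in log10(i) in between
    if not use_ramp:
        return 1.0
    if i <= ramp_start:
        return 0.0
    if i > ramp_end:
        return 1.0
    return (np.log10(float(i)) - np.log10(ramp_start)) / (np.log10(ramp_end) - np.log10(ramp_start))

for i in [10, 100, 1000, 10000, 100000]:
    print(i, kl_ramp(i, ramp_start=100, ramp_end=10000))
# 10 and 100 give 0.0, 1000 gives 0.5, 10000 and 100000 give 1.0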
Example #4
def train(params,
          x_data,
          y_data,
          x_data_test,
          y_data_test,
          y_data_test_noisefree,
          y_normscale,
          save_dir,
          truth_test,
          bounds,
          fixed_vals,
          posterior_truth_test,
          snrs_test=None):
    """ Function for training the conditional variational autoencoder (CVAE)
    """

    # USEFUL SIZES
    xsh = np.shape(x_data)

    ysh = np.shape(y_data)[1]
    z_dimension = params['z_dimension']
    bs = params['batch_size']
    n_weights_r1 = params['n_weights_r1']
    n_weights_r2 = params['n_weights_r2']
    n_weights_q = params['n_weights_q']
    n_modes = params['n_modes']
    n_hlayers_r1 = len(params['n_weights_r1'])
    n_hlayers_r2 = len(params['n_weights_r2'])
    n_hlayers_q = len(params['n_weights_q'])
    n_conv_r1 = len(
        params['n_filters_r1']) if params['n_filters_r1'] is not None else None
    n_conv_r2 = len(
        params['n_filters_r2']) if params['n_filters_r2'] is not None else None
    n_conv_q = len(
        params['n_filters_q']) if params['n_filters_q'] is not None else None
    n_filters_r1 = params['n_filters_r1']
    n_filters_r2 = params['n_filters_r2']
    n_filters_q = params['n_filters_q']
    filter_size_r1 = params['filter_size_r1']
    filter_size_r2 = params['filter_size_r2']
    filter_size_q = params['filter_size_q']
    maxpool_r1 = params['maxpool_r1']
    maxpool_r2 = params['maxpool_r2']
    maxpool_q = params['maxpool_q']
    conv_strides_r1 = params['conv_strides_r1']
    conv_strides_r2 = params['conv_strides_r2']
    conv_strides_q = params['conv_strides_q']
    pool_strides_r1 = params['pool_strides_r1']
    pool_strides_r2 = params['pool_strides_r2']
    pool_strides_q = params['pool_strides_q']
    batch_norm = params['batch_norm']
    ysh_conv_r1 = int(ysh)
    ysh_conv_r2 = int(ysh)
    ysh_conv_q = int(ysh)
    drate = params['drate']
    ramp_start = params['ramp_start']
    ramp_end = params['ramp_end']
    num_det = len(fixed_vals['det'])

    # identify the indices of wrapped and non-wrapped parameters - clunky code
    wrap_mask, nowrap_mask, idx_mask = get_wrap_index(params)
    wrap_len = np.sum(wrap_mask)
    nowrap_len = np.sum(nowrap_mask)

    graph = tf.Graph()
    session = tf.Session(graph=graph)
    with graph.as_default():

        # PLACE HOLDERS
        bs_ph = tf.placeholder(dtype=tf.int64,
                               name="bs_ph")  # batch size placeholder
        x_ph = tf.placeholder(dtype=tf.float32,
                              shape=[None, xsh[1]],
                              name="x_ph")  # params placeholder
        if n_conv_r1 is not None:
            if params['by_channel']:
                y_ph = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, ysh, len(fixed_vals['det'])],
                    name="y_ph")  # data placeholder
            else:
                y_ph = tf.placeholder(
                    dtype=tf.float32,
                    shape=[None, len(fixed_vals['det']), ysh],
                    name="y_ph")
        else:
            y_ph = tf.placeholder(dtype=tf.float32,
                                  shape=[None, ysh],
                                  name="y_ph")  # data placeholder
        idx = tf.placeholder(tf.int32)

        # LOAD VICI NEURAL NETWORKS
        r2_xzy = VICI_decoder.VariationalAutoencoder(
            'VICI_decoder',
            wrap_mask,
            nowrap_mask,
            n_input1=z_dimension,
            n_input2=ysh_conv_r2,
            n_output=xsh[1],
            n_weights=n_weights_r2,
            n_hlayers=n_hlayers_r2,
            drate=drate,
            n_filters=n_filters_r2,
            filter_size=filter_size_r2,
            maxpool=maxpool_r2,
            n_conv=n_conv_r2,
            conv_strides=conv_strides_r2,
            pool_strides=pool_strides_r2,
            num_det=num_det,
            batch_norm=batch_norm,
            by_channel=params['by_channel'],
            weight_init=params['weight_init'])
        r1_zy = VICI_encoder.VariationalAutoencoder(
            'VICI_encoder',
            n_input=ysh_conv_r1,
            n_output=z_dimension,
            n_weights=n_weights_r1,
            n_modes=n_modes,
            n_hlayers=n_hlayers_r1,
            drate=drate,
            n_filters=n_filters_r1,
            filter_size=filter_size_r1,
            maxpool=maxpool_r1,
            n_conv=n_conv_r1,
            conv_strides=conv_strides_r1,
            pool_strides=pool_strides_r1,
            num_det=num_det,
            batch_norm=batch_norm,
            by_channel=params['by_channel'],
            weight_init=params['weight_init'])
        q_zxy = VICI_VAE_encoder.VariationalAutoencoder(
            'VICI_VAE_encoder',
            n_input1=xsh[1],
            n_input2=ysh_conv_q,
            n_output=z_dimension,
            n_weights=n_weights_q,
            n_hlayers=n_hlayers_q,
            drate=drate,
            n_filters=n_filters_q,
            filter_size=filter_size_q,
            maxpool=maxpool_q,
            n_conv=n_conv_q,
            conv_strides=conv_strides_q,
            pool_strides=pool_strides_q,
            num_det=num_det,
            batch_norm=batch_norm,
            by_channel=params['by_channel'],
            weight_init=params['weight_init'])  # used to sample from q(z|x,y)?
        tf.set_random_seed(np.random.randint(0, 10))

        # KL annealing: linear in log(iteration) between ramp_start and ramp_end
        # (constant endpoints computed with np.log, so integer-valued
        # ramp_start/ramp_end cannot trip tf.log's float-only dtype check)
        log_ramp_start = float(np.log(ramp_start))
        log_ramp_end = float(np.log(ramp_end))
        ramp = (tf.log(tf.dtypes.cast(idx, dtype=tf.float32)) -
                log_ramp_start) / (log_ramp_end - log_ramp_start)
        ramp = tf.minimum(tf.math.maximum(0.0, ramp), 1.0)

        if not params['ramp']:
            ramp = 1.0

        # pass the y data through unchanged (no size reduction applied here)
        y_conv = y_ph

        # GET r1(z|y)
        # run inverse autoencoder to generate mean and logvar of z given y data - these are the parameters for r1(z|y)
        r1_loc, r1_scale, r1_weight = r1_zy._calc_z_mean_and_sigma(y_conv)
        r1_scale = tf.sqrt(SMALL_CONSTANT + tf.exp(r1_scale))
        # get l1 loss term
        l1_loss_weight = ramp * 1e-3 * tf.reduce_sum(tf.math.abs(r1_weight), 1)
        r1_weight = ramp * tf.squeeze(r1_weight)
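        # NOTE: ramp is applied to r1_weight here and then again in the
        # Categorical logits below; l1_loss_weight is computed but never added
        # to COST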

        # define the r1(z|y) mixture model
        bimix_gauss = tfd.MixtureSameFamily(
            mixture_distribution=tfd.Categorical(logits=ramp * r1_weight),
            components_distribution=tfd.MultivariateNormalDiag(
                loc=r1_loc, scale_diag=r1_scale))

        # DRAW FROM r1(z|y) - given the Gaussian parameters generate z samples
        r1_zy_samp = bimix_gauss.sample()

        # GET q(z|x,y)
        q_zxy_mean, q_zxy_log_sig_sq = q_zxy._calc_z_mean_and_sigma(
            x_ph, y_conv)

        # DRAW FROM q(z|x,y)
        q_zxy_samp = q_zxy._sample_from_gaussian_dist(
            bs_ph, z_dimension, q_zxy_mean,
            tf.log(SMALL_CONSTANT + tf.exp(q_zxy_log_sig_sq)))

        # GET r2(x|z,y)
        reconstruction_xzy = r2_xzy.calc_reconstruction(q_zxy_samp, y_conv)
        r2_xzy_mean_nowrap = reconstruction_xzy[0]
        r2_xzy_log_sig_sq_nowrap = reconstruction_xzy[1]
        if np.sum(wrap_mask) > 0:
            r2_xzy_mean_wrap = reconstruction_xzy[2]
            r2_xzy_log_sig_sq_wrap = reconstruction_xzy[3]

        # COST FROM RECONSTRUCTION - Gaussian parts
        normalising_factor_x = -0.5 * tf.log(
            SMALL_CONSTANT + tf.exp(r2_xzy_log_sig_sq_nowrap)) - 0.5 * np.log(
                2.0 * np.pi)  # -0.5*log(sig^2) - 0.5*log(2*pi)
        square_diff_between_mu_and_x = tf.square(
            r2_xzy_mean_nowrap -
            tf.boolean_mask(x_ph, nowrap_mask, axis=1))  # (mu - x)^2

        inside_exp_x = -0.5 * tf.divide(
            square_diff_between_mu_and_x, SMALL_CONSTANT +
            tf.exp(r2_xzy_log_sig_sq_nowrap))  # -0.5*(mu - x)^2 / sig^2
        reconstr_loss_x = tf.reduce_sum(
            normalising_factor_x + inside_exp_x, axis=1, keepdims=True
        )  # sum_dim(-0.5*log(sig^2) - 0.5*log(2*pi) - 0.5*(mu - x)^2 / sig^2)

        # COST FROM RECONSTRUCTION - Von Mises parts
        if np.sum(wrap_mask) > 0:
            con = tf.reshape(
                tf.math.reciprocal(SMALL_CONSTANT +
                                   tf.exp(r2_xzy_log_sig_sq_wrap)),
                [-1, wrap_len
                 ])  # modelling wrapped scale output as log variance
            von_mises = tfp.distributions.VonMises(
                loc=2.0 * np.pi *
                (tf.reshape(r2_xzy_mean_wrap, [-1, wrap_len]) - 0.5),
                concentration=con)  # define p_vm(2*pi*mu,con=1/sig^2)
            reconstr_loss_vm = tf.reduce_sum(
                von_mises.log_prob(
                    2.0 * np.pi *
                    (tf.reshape(tf.boolean_mask(x_ph, wrap_mask, axis=1),
                                [-1, wrap_len]) - 0.5)),
                axis=1)  # 2pi is the von mises input range
            cost_R = -1.0 * tf.reduce_mean(
                reconstr_loss_x + reconstr_loss_vm)  # average over batch
            r2_xzy_mean = tf.gather(tf.concat(
                [r2_xzy_mean_nowrap, r2_xzy_mean_wrap], axis=1),
                                    tf.constant(idx_mask),
                                    axis=1)
            r2_xzy_scale = tf.gather(tf.concat(
                [r2_xzy_log_sig_sq_nowrap, r2_xzy_log_sig_sq_wrap], axis=1),
                                     tf.constant(idx_mask),
                                     axis=1)
        else:
            cost_R = -1.0 * tf.reduce_mean(reconstr_loss_x)
            r2_xzy_mean = r2_xzy_mean_nowrap
            r2_xzy_scale = r2_xzy_log_sig_sq_nowrap

        # compute montecarlo KL - first compute the analytic self entropy of q
        normalising_factor_kl = -0.5 * tf.log(
            SMALL_CONSTANT + tf.exp(q_zxy_log_sig_sq)) - 0.5 * np.log(
                2.0 * np.pi)  # -0.5*log(sig^2) - 0.5*log(2*pi)
        square_diff_between_qz_and_q = tf.square(q_zxy_mean -
                                                 q_zxy_samp)  # (mu - x)^2
        inside_exp_q = -0.5 * tf.divide(
            square_diff_between_qz_and_q, SMALL_CONSTANT +
            tf.exp(q_zxy_log_sig_sq))  # -0.5*(mu - x)^2 / sig^2
        log_q_q = tf.reduce_sum(
            normalising_factor_kl + inside_exp_q, axis=1, keepdims=True
        )  # sum_dim(-0.5*log(sig^2) - 0.5*log(2*pi) - 0.5*(mu - x)^2 / sig^2)
        log_r1_q = bimix_gauss.log_prob(
            q_zxy_samp)  # evaluate the log prob of r1 at the q samples
        KL = tf.reduce_mean(log_q_q - log_r1_q)  # average over batch
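        # as in Example #3, this is a single-sample Monte Carlo estimate of
        # KL(q(z|x,y)||r1(z|y)) = E_q[log q(z) - log r1(z)]; unbiased but noisier
        # than the closed-form KL of Examples #1 and #2, which is unavailable
        # against a mixture r1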

        # THE VICI COST FUNCTION
        COST = cost_R + ramp * KL

        # VARIABLES LISTS
        var_list_VICI = [
            var for var in tf.trainable_variables()
            if var.name.startswith("VICI")
        ]

        # DEFINE OPTIMISER (using ADAM here)
        optimizer = tf.train.AdamOptimizer(params['initial_training_rate'])
        minimize = optimizer.minimize(COST, var_list=var_list_VICI)

        # INITIALISE AND RUN SESSION
        init = tf.global_variables_initializer()
        session.run(init)
        saver = tf.train.Saver()

    print('Training Inference Model...')
    # START OPTIMISATION OF OELBO
    indices_generator = batch_manager.SequentialIndexer(
        params['batch_size'], xsh[0])
    plotdata = []
    if not params['by_channel']:
        y_data_test_new = []
        for sig in y_data_test:
            y_data_test_new.append(sig.T)
        y_data_test = np.array(y_data_test_new)
        del y_data_test_new

    load_chunk_it = 1
    for i in range(params['num_iterations']):

        next_indices = indices_generator.next_indices()

        # if load chunks true, load in data by chunks
        if params['load_by_chunks'] and i == int(
                params['load_iteration'] * load_chunk_it):
            x_data, y_data = load_chunk(params['train_set_dir'],
                                        params['inf_pars'], params, bounds,
                                        fixed_vals)
            load_chunk_it += 1

        # Make noise realizations and add to training data
        next_x_data = x_data[next_indices, :]
        if n_conv_r1 is not None:
            next_y_data = y_data[next_indices, :] + np.random.normal(
                0,
                1,
                size=(params['batch_size'], int(
                    params['ndata']), len(fixed_vals['det'])))
        else:
            next_y_data = y_data[next_indices, :] + np.random.normal(
                0,
                1,
                size=(params['batch_size'],
                      int(params['ndata'] * len(fixed_vals['det']))))
        next_y_data /= y_normscale  # required for fast convergence

        if not params['by_channel']:
            next_y_data_new = []
            for sig in next_y_data:
                next_y_data_new.append(sig.T)
            next_y_data = np.array(next_y_data_new)
            del next_y_data_new

        # restore session if wanted
        if params['resume_training'] and i == 0:
            print(save_dir)
            saver.restore(session, save_dir)

        # train to minimise the cost function
        session.run(minimize,
                    feed_dict={
                        bs_ph: bs,
                        x_ph: next_x_data,
                        y_ph: next_y_data,
                        idx: i
                    })

        # if we are in a report iteration extract cost function values
        if i % params['report_interval'] == 0 and i > 0:

            # get training loss
            cost, kl, AB_batch = session.run([cost_R, KL, r1_weight],
                                             feed_dict={
                                                 bs_ph: bs,
                                                 x_ph: next_x_data,
                                                 y_ph: next_y_data,
                                                 idx: i
                                             })

            # get validation loss on test set
            cost_val, kl_val = session.run(
                [cost_R, KL],
                feed_dict={
                    bs_ph: y_data_test.shape[0],
                    x_ph: x_data_test,
                    y_ph: y_data_test / y_normscale,
                    idx: i
                })
            plotdata.append(
                [cost, kl, cost + kl, cost_val, kl_val, cost_val + kl_val])

            try:
                # Make loss plot
                plt.figure()
                xvec = params['report_interval'] * np.arange(
                    np.array(plotdata).shape[0])
                plt.semilogx(xvec,
                             np.array(plotdata)[:, 0],
                             label='recon',
                             color='blue',
                             alpha=0.5)
                plt.semilogx(xvec,
                             np.array(plotdata)[:, 1],
                             label='KL',
                             color='orange',
                             alpha=0.5)
                plt.semilogx(xvec,
                             np.array(plotdata)[:, 2],
                             label='total',
                             color='green',
                             alpha=0.5)
                plt.semilogx(xvec,
                             np.array(plotdata)[:, 3],
                             label='recon_val',
                             color='blue',
                             linestyle='dotted')
                plt.semilogx(xvec,
                             np.array(plotdata)[:, 4],
                             label='KL_val',
                             color='orange',
                             linestyle='dotted')
                plt.semilogx(xvec,
                             np.array(plotdata)[:, 5],
                             label='total_val',
                             color='green',
                             linestyle='dotted')
                plt.ylim([-25, 15])
                plt.xlabel('iteration')
                plt.ylabel('cost')
                plt.legend()
                plt.savefig('%s/latest_%s/cost_%s.png' %
                            (params['plot_dir'], params['run_label'],
                             params['run_label']))
                plt.ylim([
                    np.min(
                        np.array(plotdata)[-int(0.9 *
                                                np.array(plotdata).shape[0]):,
                                           0]),
                    np.max(
                        np.array(plotdata)[-int(0.9 *
                                                np.array(plotdata).shape[0]):,
                                           1])
                ])
                plt.savefig('%s/latest_%s/cost_zoom_%s.png' %
                            (params['plot_dir'], params['run_label'],
                             params['run_label']))
                plt.close('all')

            except Exception:
                # plotting failures are non-fatal; skip this report's plots
                pass

            if params['print_values']:
                print(
                    '--------------------------------------------------------------'
                )
                print('Iteration:', i)
                print('Training -ELBO:', cost)
                print('Validation -ELBO:', cost_val)
                print('Training KL Divergence:', kl)
                print('Validation KL Divergence:', kl_val)
                print('Training Total cost:', kl + cost)
                print('Validation Total cost:', kl_val + cost_val)
                print()

                # terminate training if the cost has diverged to NaN
                if np.isnan(kl + cost) or np.isnan(kl_val + cost_val):
                    print('Network is returning NaN values')
                    print('Terminating network training')
                    if params['hyperparam_optim']:
                        save_path = saver.save(session, save_dir)
                        return 5000.0, session, saver, save_dir
                    else:
                        exit()
                try:
                    # Save loss plot data
                    np.savetxt(
                        save_dir.split('/')[0] + '/loss_data.txt',
                        np.array(plotdata))
                except FileNotFoundError as err:
                    print(err)

        if i % params['save_interval'] == 0 and i > 0:

            # Save model (skipped during hyperparameter optimisation)
            if not params['hyperparam_optim']:
                save_path = saver.save(session, save_dir)

        # stop hyperparameter-optimisation training early and return the total loss as the figure of merit
        if params['hyperparam_optim'] and i == params['hyperparam_optim_stop']:
            save_path = saver.save(session, save_dir)

            return np.array(plotdata)[-1, 2], session, saver, save_dir

        if i % params['plot_interval'] == 0 and i > 0:

            n_mode_weight_copy = 100  # must be a multiple of 50
            # just run the network on the test data
            for j in range(params['r'] * params['r']):

                # The trained inverse model weights can then be used to infer a probability density of solutions given new measurements
                if params['n_filters_r1'] is not None:
                    XS, loc, scale, dt, _ = VICI_inverse_model.run(
                        params, y_data_test[j].reshape(
                            [1, y_data_test.shape[1], y_data_test.shape[2]]),
                        np.shape(x_data_test)[1], y_normscale, save_dir)
                else:
                    XS, loc, scale, dt, _ = VICI_inverse_model.run(
                        params, y_data_test[j].reshape([1, -1]),
                        np.shape(x_data_test)[1], y_normscale, save_dir)
                print('Runtime to generate {} samples = {} sec'.format(
                    params['n_samples'], dt))

                # Get corner parnames to use in plotting labels
                parnames = []
                for k_idx, k in enumerate(params['rand_pars']):
                    if np.isin(k, params['inf_pars']):
                        parnames.append(params['cornercorner_parnames'][k_idx])

                defaults_kwargs = dict(bins=50,
                                       smooth=0.9,
                                       label_kwargs=dict(fontsize=16),
                                       title_kwargs=dict(fontsize=16),
                                       truth_color='tab:orange',
                                       quantiles=[0.16, 0.84],
                                       levels=(0.68, 0.90, 0.95),
                                       density=True,
                                       plot_density=False,
                                       plot_datapoints=True,
                                       max_n_ticks=3)

                figure = corner.corner(posterior_truth_test[j],
                                       **defaults_kwargs,
                                       labels=parnames,
                                       color='tab:blue',
                                       truths=x_data_test[j, :],
                                       show_titles=True)
                corner.corner(XS,
                              **defaults_kwargs,
                              labels=parnames,
                              color='tab:red',
                              fill_contours=True,
                              show_titles=True,
                              fig=figure)

                plt.savefig('%s/corner_plot_%s_%d-%d.png' %
                            (params['plot_dir'], params['run_label'], i, j))
                plt.savefig('%s/latest_%s/corner_plot_%s_%d.png' %
                            (params['plot_dir'], params['run_label'],
                             params['run_label'], j))
                plt.close('all')
                print('Made corner plot %d' % j)
    return
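The KL(q(z|x,y)||r(z|y)) term built above (cost_VAE_a and cost_VAE_b) is the closed-form KL divergence between two diagonal Gaussians. A minimal NumPy sketch of the same formula, with illustrative names that do not appear in the original code:

import numpy as np

def kl_diag_gaussians(mu_q, log_var_q, mu_r, log_var_r):
    # Closed-form KL(q||r) for diagonal Gaussians, summed over latent
    # dimensions, mirroring cost_VAE_a / cost_VAE_b above:
    #   KL = sum_d [ log sig_r - log sig_q
    #                + (var_q + (mu_q - mu_r)^2) / (2 var_r) - 1/2 ]
    var_q, var_r = np.exp(log_var_q), np.exp(log_var_r)
    per_dim = (0.5 * (log_var_r - log_var_q)
               + (var_q + (mu_q - mu_r) ** 2) / (2.0 * var_r) - 0.5)
    return np.sum(per_dim, axis=1)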
Example #5
def train(params, x_data, y_data, siz_high_res, save_dir, plotter, y_data_test, train_files, normscales, y_data_train_noisefree, samples, pos_test, y_normscale):

    x_data = x_data
    y_data_train_l = y_data

    # USEFUL SIZES
    xsh = np.shape(x_data)
    yshl1 = np.shape(y_data)[1]
    ysh1 = np.shape(y_data)[1]    
    print(xsh,yshl1,ysh1)
 
    #z_dimension_fm = params['z_dimensions_fw']
    #n_weights_fm = params['n_weights_fw']
    
    z_dimension = params['z_dimension']
    bs = params['batch_size']
    n_weights = params['n_weights']
    lam = 1
    
    graph = tf.Graph()
    session = tf.Session(graph=graph)
    with graph.as_default():
        tf.set_random_seed(np.random.randint(0,10))
        SMALL_CONSTANT = 1e-6
        
        # PLACE HOLDERS
        x_ph = tf.placeholder(dtype=tf.float32, shape=[None, xsh[1]], name="x_ph")
        bs_ph = tf.placeholder(dtype=tf.int64, name="bs_ph") # batch size placeholder
        yt_ph = tf.placeholder(dtype=tf.float32, shape=[None, ysh1], name="yt_ph")
        
        # LOAD FORWARD MODEL NEURAL NETWORKS
        #DEC_XYlZtoYh = OELBO_decoder_difference.VariationalAutoencoder("OELBO_decoder", ysh1, z_dimension_fm+yshl1+xsh[1], n_weights_fm) # p(Yh|X,Yl,Z)
        #ENC_XYltoZ = OELBO_encoder.VariationalAutoencoder("OELBO_encoder", yshl1+xsh[1], z_dimension_fm, n_weights_fm) # p(Z|X,Yl)
        #ENC_XYhYltoZ = VAE_encoder.VariationalAutoencoder("vae_encoder", xsh[1]+ysh1+yshl1, z_dimension_fm, n_weights_fm) # q(Z|X,Yl,Yh)
        
        # LOAD VICI NEURAL NETWORKS
        autoencoder = VICI_decoder.VariationalAutoencoder("VICI_decoder", xsh[1], z_dimension+ysh1, n_weights) # r(x|z,y)
        autoencoder_ENC = VICI_encoder.VariationalAutoencoder("VICI_encoder", ysh1, z_dimension, n_weights) # generates params for r(z|y)
        autoencoder_VAE = VICI_VAE_encoder.VariationalAutoencoder("VICI_VAE_encoder", xsh[1]+ysh1, z_dimension, n_weights) # used to sample from r(z|y)?
        
        # DEFINE MULTI-FIDELITY FORWARD MODEL
        #####################################################################################################################
        
        # NORMALISE INPUTS
        yl_ph_n = tf_normalise_dataset(yt_ph) # placeholder for normalised low-res y data
        x_ph_n = tf_normalise_dataset(x_ph)   # placeholder for normalised x data
        #yl_ph_n = yt_ph
        #x_ph_n = x_ph
        #yl_ph_n = tf.Print(yl_ph_n, [yl_ph_n], first_n=1, summarize=10, message="This is yl_ph_n: ")
        #x_ph_n = tf.Print(x_ph_n, [x_ph_n], first_n=1, summarize=10, message="This is x_ph_n: ")

        # GET p(Z|X,Yl) - takes in x data and low res y data and returns mean and logvar of Gaussian z distribution
        #zxyl_mean,zxyl_log_sig_sq = ENC_XYltoZ._calc_z_mean_and_sigma(tf.concat([x_ph_n,yl_ph_n],1))
        #zxyl_mean = tf.Print(zxyl_mean, [zxyl_mean], first_n=1, summarize=10, message="This is zxyl_mean: ")
        #zxyl_log_sig_sq = tf.Print(zxyl_log_sig_sq, [zxyl_log_sig_sq], first_n=1, summarize=10, message="This is zxyl_log_sig_sq: ")
        # then samples z from that distribution
        #rxyl_samp = ENC_XYhYltoZ._sample_from_gaussian_dist(tf.shape(x_ph_n)[0], z_dimension_fm, zxyl_mean, tf.log(tf.exp(zxyl_log_sig_sq)+SMALL_CONSTANT))
        #rxyl_samp = tf.Print(rxyl_samp, [rxyl_samp], first_n=1, summarize=10, message="This is rxyl_samp: ")

        # GET p(Yh|X,Yl,Z) FROM SAMPLES Z ~ p(Z|X,Yl)
        # then decodes back to high res y data 
        #reconstruction_yh = DEC_XYlZtoYh.calc_reconstruction(tf.concat([x_ph_n,yl_ph_n,rxyl_samp],1))
        #reconstruction_yh = tf.Print(reconstruction_yh, [reconstruction_yh], first_n=1, summarize=10, message="This is reconstruction_yh: ")
        #yh_diff = reconstruction_yh[0] # looks like the difference between the low res y data and the mean reconstructed high res y data
        #yh_diff = tf.Print(yh_diff, [yh_diff], first_n=1, summarize=10, message="This is yh_diff: ")
        #yh_mean = yl_ph_n+yh_diff      # the mean reconstructed high res data
        #yh_mean = tf.Print(yh_mean, [yh_mean], first_n=1, summarize=10, message="This is yh_mean: ")
        #yh_log_sig_sq = reconstruction_yh[1]  # the reconstructed high res y data logvar
        #yh_log_sig_sq = tf.Print(yh_log_sig_sq, [yh_log_sig_sq], first_n=1, summarize=10, message="This is yh_log_sig_sq: ")
        # then sample something? y doesn't seem to be used anywhere after this
        # this ends up being the reconstruction of the exact y data corresponding to each x data input
        #y = ENC_XYhYltoZ._sample_from_gaussian_dist(tf.shape(yt_ph)[0], tf.shape(yt_ph)[1], yh_mean, yh_log_sig_sq)
        #y = tf.Print(y, [y], first_n=1, summarize=10, message="This is y: ")

        # DRAW SYNTHETIC Ys TRAINING DATA
#        _, y = OM.forward_model(x_ph_amp,x_ph_ph)
        
        ##########################################################################################################################################
        
        # GET r(z|y)
        #y_ph = tf.placeholder(dtype=tf.float32, shape=[None, ysh1], name="y_ph")  # placeholder for y data
        #y_ph_n = tf_normalise_dataset(y_ph)                                       # placeholder for normalised y data
        #y_ph = tf.Print(y_ph, [y_ph], first_n=1, summarize=10, message="This is y_ph: ")
        #y_ph_n = y_ph
        # run inverse autoencoder to generate mean and logvar of z given y data - these are the parameters for r(z|y)
        #zy_mean,zy_log_sig_sq = autoencoder_ENC._calc_z_mean_and_sigma(y_ph_n) 
        zy_mean,zy_log_sig_sq = autoencoder_ENC._calc_z_mean_and_sigma(yl_ph_n)        

        # DRAW FROM r(z|y) - given the Gaussian parameters generate z samples
        rzy_samp = autoencoder_VAE._sample_from_gaussian_dist(bs_ph, z_dimension, zy_mean, zy_log_sig_sq)
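        # _sample_from_gaussian_dist is not shown in these examples; a
        # plausible reparameterised implementation (an assumption, not the
        # module's actual code) would be:
        #     eps = tf.random_normal([num_rows, num_cols], 0.0, 1.0)
        #     sample = mean + tf.sqrt(tf.exp(log_sig_sq)) * eps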
        
        # GET r(x|z,y) from r(z|y) samples
        #rzy_samp_y = tf.concat([rzy_samp,y_ph_n],1)
        rzy_samp_y = tf.concat([rzy_samp,yl_ph_n],1)
        reconstruction_xzy = autoencoder.calc_reconstruction(rzy_samp_y)
        x_mean = reconstruction_xzy[0]
        x_log_sig_sq = reconstruction_xzy[1]
        
        # KL(r(z|y)||p(z))
        #latent_loss = -0.5 * tf.reduce_sum(1 + zy_log_sig_sq - tf.square(zy_mean) - tf.exp(zy_log_sig_sq), 1)
        #KL = tf.reduce_mean(latent_loss)
       
        # GET q(z|x,y)
        #xy_ph = tf.concat([x_ph_n,y_ph_n],1)
        xy_ph = tf.concat([x_ph_n,yl_ph_n],1)
        zx_mean,zx_log_sig_sq = autoencoder_VAE._calc_z_mean_and_sigma(xy_ph)
 
        # DRAW FROM q(z|x,y)
        qzx_samp = autoencoder_VAE._sample_from_gaussian_dist(bs_ph, z_dimension, zx_mean, zx_log_sig_sq)
        
        # GET r(x|z,y)
        #qzx_samp_y = tf.concat([qzx_samp,y_ph_n],1)
        qzx_samp_y = tf.concat([qzx_samp,yl_ph_n],1)
        reconstruction_xzx = autoencoder.calc_reconstruction(qzx_samp_y)
        x_mean_vae = reconstruction_xzx[0]
        x_log_sig_sq_vae = reconstruction_xzx[1]
        
        # COST FROM RECONSTRUCTION
        normalising_factor_x_vae = - 0.5 * tf.log(SMALL_CONSTANT+tf.exp(x_log_sig_sq_vae)) - 0.5 * np.log(2 * np.pi)
        square_diff_between_mu_and_x_vae = tf.square(x_mean_vae - x_ph_n)
        inside_exp_x_vae = -0.5 * tf.div(square_diff_between_mu_and_x_vae,SMALL_CONSTANT+tf.exp(x_log_sig_sq_vae))
        reconstr_loss_x_vae = -tf.reduce_sum(normalising_factor_x_vae + inside_exp_x_vae, 1)
        cost_R_vae = tf.reduce_mean(reconstr_loss_x_vae)
        
        # KL(q(z|x,y)||r(z|y))
        v_mean = zy_mean #2
        aux_mean = zx_mean #1
        v_log_sig_sq = tf.log(tf.exp(zy_log_sig_sq)+SMALL_CONSTANT) #2
        aux_log_sig_sq = tf.log(tf.exp(zx_log_sig_sq)+SMALL_CONSTANT) #1
        v_log_sig = tf.log(tf.sqrt(tf.exp(v_log_sig_sq))) #2
        aux_log_sig = tf.log(tf.sqrt(tf.exp(aux_log_sig_sq))) #1
        cost_VAE_a = v_log_sig-aux_log_sig+tf.divide(tf.exp(aux_log_sig_sq)+tf.square(aux_mean-v_mean),2*tf.exp(v_log_sig_sq))-0.5
        cost_VAE_b = tf.reduce_sum(cost_VAE_a,1)                           
        KL_vae = tf.reduce_mean(cost_VAE_b)                               # computes the mean over all tensor elements
        
        # THE VICI COST FUNCTION
        lam_ph = tf.placeholder(dtype=tf.float32, name="lam_ph")
        COST_VAE = KL_vae+cost_R_vae
        COST = COST_VAE
        
        # VARIABLES LISTS
        var_list_VICI = [var for var in tf.trainable_variables() if var.name.startswith("VICI")]
        var_list_ELBO = [var for var in tf.trainable_variables() if var.name.startswith("ELBO")]
        
        # DEFINE OPTIMISER (using ADAM here)
        optimizer = tf.train.AdamOptimizer(params['initial_training_rate']) 
        minimize = optimizer.minimize(COST,var_list = var_list_VICI)
        
        # DRAW FROM q(x|y)
        qx_samp = autoencoder_ENC._sample_from_gaussian_dist(bs_ph, xsh[1], x_mean, SMALL_CONSTANT + tf.log(tf.exp(x_log_sig_sq)))
        
        # INITIALISE AND RUN SESSION
#        init = tf.variables_initializer(var_list_VICI)
        #init = tf.initialize_all_variables()
        init = tf.global_variables_initializer()
        session.run(init)
        #saver_ELBO = tf.train.Saver(var_list_ELBO)
        #saver_ELBO.restore(session,load_dir)
        saver = tf.train.Saver()
    
    KL_PLOT = np.zeros(int(np.round(params['num_iterations']/params['report_interval'])+1)) # vector to store test KL divergence values
    COST_PLOT = np.zeros(int(np.round(params['num_iterations']/params['report_interval'])+1)) # vector to store test reconstruction cost values

    print('Training Inference Model...')    
    # START OPTIMISATION OF OELBO
    indices_generator = batch_manager.SequentialIndexer(params['batch_size'], xsh[0])
    ni = -1
    test_n = 100
    olvec = []
    for i in range(params['num_iterations']):

        next_indices = indices_generator.next_indices()
        
        # run the session - input batchsize the x-data training batch and the y-data training batch 
        #yn = session.run(y, feed_dict={bs_ph:bs, x_ph:x_data[next_indices, :], yt_ph:y_data_train_l[next_indices, :]})
        #session.run(minimize, feed_dict={bs_ph:bs, x_ph:x_data[next_indices, :],  y_ph:yn, lam_ph:lam, yt_ph:y_data_train_l[next_indices, :]}) # minimising cost function

        # Add a fresh noise realisation to each training example in the batch
        if params['do_extra_noise']:
            x_data_train_l = x_data[next_indices,:]
            y_data_train_l = y_data_train_noisefree[next_indices,:] + np.random.normal(0,1,size=(params['batch_size'],params['ndata']))
            if params['do_normscale']:
                y_data_train_l /= y_normscale[0]
            #print('generated {} elements of new training data noise'.format(params['batch_size']))

            session.run(minimize, feed_dict={bs_ph:bs, x_ph:x_data_train_l, lam_ph:lam, yt_ph:y_data_train_l}) # minimising cost function
        else:
            session.run(minimize, feed_dict={bs_ph:bs, x_ph:x_data[next_indices, :], lam_ph:lam, yt_ph:y_data_train_l[next_indices, :]}) # minimising cost function

        if i % params['report_interval'] == 0 and i > 0:
            ni = ni+1
                
            #ynt = session.run(y, feed_dict={bs_ph:test_n, x_ph:x_data[0:test_n,:], yt_ph:y_data_train_l[0:test_n,:]})
            #cost_value_vae, KL_VAE = session.run([COST_VAE, KL_vae], feed_dict={bs_ph:test_n, x_ph:x_data[0:test_n,:], y_ph:ynt, lam_ph:lam, yt_ph:y_data_train_l[0:test_n,:]})
            # NOTE: evaluate on x_data; x_data_train_l only exists when do_extra_noise is on
            cost_value_vae, KL_VAE = session.run([cost_R_vae, KL_vae], feed_dict={bs_ph:test_n, x_ph:x_data[0:test_n,:], lam_ph:lam, yt_ph:y_data_train_l[0:test_n,:]})
            KL_PLOT[ni] = KL_VAE
            COST_PLOT[ni] = cost_value_vae

            # plot losses
            plotter.make_loss_plot(COST_PLOT[:ni+1],KL_PLOT[:ni+1],params['report_interval'],fwd=False)

            if params['print_values']:
                print('--------------------------------------------------------------')
                print('Iteration:',i)
                print('Training Set -ELBO:',cost_value_vae)
                print('KL Divergence:',KL_VAE)

        if i % params['plot_interval'] == 0 and i>0:
            # The trained inverse model weights can then be used to infer a probability density of solutions given new measurements
            _, _, XS, _, _  = VICI_inverse_model.run(params, y_data_test, np.shape(x_data)[1], "inverse_model_dir_%s/inverse_model.ckpt" % params['run_label'])

            # Convert XS back to unnormalized version
            if params['do_normscale']:
                for m in range(params['ndim_x']):
                    XS[:,m,:] = XS[:,m,:]*normscales[m]

            # Generate final results plots
            plotter = plots.make_plots(params,samples,XS,pos_test)

            # Make corner plots
            plotter.make_corner_plot(sampler='dynesty1')

            # Make KL plot
#            plotter.gen_kl_plots(VICI_inverse_model,y_data_test,x_data,normscales)

            # Make pp plot
#            plotter.plot_pp(VICI_inverse_model,y_data_train_l,x_data_train,0,normscales)
       
        if i % params['save_interval'] == 0 and i > 0:

            # Save model 
            save_path = saver.save(session,save_dir)
                
                
    return COST_PLOT, KL_PLOT, train_files 
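The reconstruction term cost_R_vae used in each example is a diagonal-Gaussian negative log-likelihood. A minimal NumPy sketch of that computation, assuming 2-D (batch, features) arrays; the function name is illustrative, not from the original code:

import numpy as np

def gaussian_recon_cost(x, mu, log_var, eps=1e-6):
    # Negative Gaussian log-likelihood per example, summed over features
    # and averaged over the batch; eps plays the role of SMALL_CONSTANT.
    var = eps + np.exp(log_var)
    log_norm = -0.5 * np.log(var) - 0.5 * np.log(2.0 * np.pi)
    log_exp = -0.5 * (x - mu) ** 2 / var
    return np.mean(-np.sum(log_norm + log_exp, axis=1))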
Example #6
def resume_training(params, x_data, y_data_l, siz_high_res, save_dir, train_files, normscales, y_data_train_noisefree, y_normscale, plotter, y_data_test_h, x_data_train):
    # plotter, y_data_test_h and x_data_train are referenced below but were
    # missing from the original signature, so they are passed in explicitly

    x_data = x_data
    y_data_train_l = y_data_l
    
    # USEFUL SIZES
    xsh = np.shape(x_data)
    yshl1 = np.shape(y_data_l)[1]
    ysh1 = siz_high_res
    
    #z_dimension_fm = params['z_dimensions_fw']
    #n_weights_fm = params['n_weights_fw']
    
    z_dimension = params['z_dimension']
    bs = params['batch_size']
    n_weights = params['n_weights']
    lam = 1
    
    # Allow GPU growth 
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    graph = tf.Graph()
    session = tf.Session(graph=graph,config=config)
    with graph.as_default():
        tf.set_random_seed(np.random.randint(0,10))
        SMALL_CONSTANT = 1e-6
        
        # PLACE HOLDERS
        x_ph = tf.placeholder(dtype=tf.float32, shape=[None, xsh[1]], name="x_ph")
        bs_ph = tf.placeholder(dtype=tf.int64, name="bs_ph") # batch size placeholder
        yt_ph = tf.placeholder(dtype=tf.float32, shape=[None, ysh1], name="yt_ph")
        
        # LOAD FORWARD MODEL NEURAL NETWORKS
        #DEC_XYlZtoYh = OELBO_decoder_difference.VariationalAutoencoder("OELBO_decoder", ysh1, z_dimension_fm+yshl1+xsh[1], n_weights_fm) # p(Yh|X,Yl,Z)
        #ENC_XYltoZ = OELBO_encoder.VariationalAutoencoder("OELBO_encoder", yshl1+xsh[1], z_dimension_fm, n_weights_fm) # p(Z|X,Yl)
        #ENC_XYhYltoZ = VAE_encoder.VariationalAutoencoder("vae_encoder", xsh[1]+ysh1+yshl1, z_dimension_fm, n_weights_fm) # q(Z|X,Yl,Yh)
        
        # LOAD VICI NEURAL NETWORKS
        autoencoder = VICI_decoder.VariationalAutoencoder("VICI_decoder", xsh[1], z_dimension+ysh1, n_weights)
        autoencoder_ENC = VICI_encoder.VariationalAutoencoder("VICI_encoder", ysh1, z_dimension, n_weights)
        autoencoder_VAE = VICI_VAE_encoder.VariationalAutoencoder("VICI_VAE_encoder", xsh[1]+ysh1, z_dimension, n_weights)
        
        # DEFINE MULTI-FIDELITY FORWARD MODEL
        #####################################################################################################################
        
        # NORMALISE INPUTS
        yl_ph_n = tf_normalise_dataset(yt_ph)
        x_ph_n = tf_normalise_dataset(x_ph)
        #yl_ph_n = yt_ph
        #x_ph_n = x_ph
        
        # GET p(Z|X,Yl)
        #zxyl_mean,zxyl_log_sig_sq = ENC_XYltoZ._calc_z_mean_and_sigma(tf.concat([x_ph_n,yl_ph_n],1))
        #rxyl_samp = ENC_XYhYltoZ._sample_from_gaussian_dist(tf.shape(x_ph_n)[0], z_dimension_fm, zxyl_mean, tf.log(tf.exp(zxyl_log_sig_sq)+SMALL_CONSTANT))
        
        # GET p(Yh|X,Yl,Z) FROM SAMPLES Z ~ p(Z|X,Yl)
        #reconstruction_yh = DEC_XYlZtoYh.calc_reconstruction(tf.concat([x_ph_n,yl_ph_n,rxyl_samp],1))
        #yh_diff = reconstruction_yh[0]
        #yh_mean = yl_ph_n+yh_diff
        #yh_log_sig_sq = reconstruction_yh[1]
        #y = ENC_XYhYltoZ._sample_from_gaussian_dist(tf.shape(yt_ph)[0], tf.shape(yt_ph)[1], yh_mean, yh_log_sig_sq)
        
        # DRAW SYNTHETIC Ys TRAINING DATA
#        _, y = OM.forward_model(x_ph_amp,x_ph_ph)
        
        ##########################################################################################################################################
        
        # GET r(z|y)
        #y_ph = tf.placeholder(dtype=tf.float32, shape=[None, ysh1], name="y_ph")
        #y_ph_n = tf_normalise_dataset(y_ph)
        #y_ph_n = y_ph
        zy_mean,zy_log_sig_sq = autoencoder_ENC._calc_z_mean_and_sigma(yl_ph_n)
        
        # DRAW FROM r(z|y)
        rzy_samp = autoencoder_VAE._sample_from_gaussian_dist(bs_ph, z_dimension, zy_mean, zy_log_sig_sq)
        
        # GET r(x|z,y) from r(z|y) samples
        rzy_samp_y = tf.concat([rzy_samp,yl_ph_n],1)
        reconstruction_xzy = autoencoder.calc_reconstruction(rzy_samp_y)
        x_mean = reconstruction_xzy[0]
        x_log_sig_sq = reconstruction_xzy[1]
        
        # KL(r(z|y)||p(z))
        latent_loss = -0.5 * tf.reduce_sum(1 + zy_log_sig_sq - tf.square(zy_mean) - tf.exp(zy_log_sig_sq), 1)
        KL = tf.reduce_mean(latent_loss)
        
        # GET q(z|x,y)
        xy_ph = tf.concat([x_ph_n,yl_ph_n],1)
        zx_mean,zx_log_sig_sq = autoencoder_VAE._calc_z_mean_and_sigma(xy_ph)
        
        # DRAW FROM q(z|x,y)
        qzx_samp = autoencoder_VAE._sample_from_gaussian_dist(bs_ph, z_dimension, zx_mean, zx_log_sig_sq)
        
        # GET r(x|z,y)
        qzx_samp_y = tf.concat([qzx_samp,yl_ph_n],1)
        reconstruction_xzx = autoencoder.calc_reconstruction(qzx_samp_y)
        x_mean_vae = reconstruction_xzx[0]
        x_log_sig_sq_vae = reconstruction_xzx[1]
        
        # COST FROM RECONSTRUCTION
        normalising_factor_x_vae = - 0.5 * tf.log(SMALL_CONSTANT+tf.exp(x_log_sig_sq_vae)) - 0.5 * np.log(2 * np.pi)
        square_diff_between_mu_and_x_vae = tf.square(x_mean_vae - x_ph_n)
        inside_exp_x_vae = -0.5 * tf.div(square_diff_between_mu_and_x_vae,SMALL_CONSTANT+tf.exp(x_log_sig_sq_vae))
        reconstr_loss_x_vae = -tf.reduce_sum(normalising_factor_x_vae + inside_exp_x_vae, 1) # Take sum along axis 1
        cost_R_vae = tf.reduce_mean(reconstr_loss_x_vae) # Take mean along the diagonal
        
        # KL(q(z|x,y)||r(z|y))
        v_mean = zy_mean #2
        aux_mean = zx_mean #1
        v_log_sig_sq = tf.log(tf.exp(zy_log_sig_sq)+SMALL_CONSTANT) #2
        aux_log_sig_sq = tf.log(tf.exp(zx_log_sig_sq)+SMALL_CONSTANT) #1
        v_log_sig = tf.log(tf.sqrt(tf.exp(v_log_sig_sq))) #2
        aux_log_sig = tf.log(tf.sqrt(tf.exp(aux_log_sig_sq))) #1
        cost_VAE_a = v_log_sig-aux_log_sig+tf.divide(tf.exp(aux_log_sig_sq)+tf.square(aux_mean-v_mean),2*tf.exp(v_log_sig_sq))-0.5
        cost_VAE_b = tf.reduce_sum(cost_VAE_a,1)
        KL_vae = tf.reduce_mean(cost_VAE_b)
        
        # THE VICI COST FUNCTION
        lam_ph = tf.placeholder(dtype=tf.float32, name="lam_ph")
        COST_VAE = KL_vae+cost_R_vae
        COST = COST_VAE
        
        # VARIABLES LISTS
        var_list_VICI = [var for var in tf.trainable_variables() if var.name.startswith("VICI")]
        var_list_ELBO = [var for var in tf.trainable_variables() if var.name.startswith("ELBO")]
        
        # DEFINE OPTIMISER (using ADAM here)
        optimizer = tf.train.AdamOptimizer(params['initial_training_rate']) 
        minimize = optimizer.minimize(COST,var_list = var_list_VICI)
        
        # DRAW FROM q(x|y)
        qx_samp = autoencoder_ENC._sample_from_gaussian_dist(bs_ph, xsh[1], x_mean, SMALL_CONSTANT + tf.log(tf.exp(x_log_sig_sq)))
        
        # INITIALISE AND RUN SESSION
#        init = tf.variables_initializer(var_list_VICI)
        init = tf.global_variables_initializer() # initialize_all_variables is deprecated
        session.run(init)
        #saver_ELBO = tf.train.Saver(var_list_ELBO)
        #saver_ELBO.restore(session,load_dir)
        saver = tf.train.Saver(var_list_VICI)
        saver.restore(session,save_dir)
    
    KL_PLOT = np.zeros(int(np.round(params['num_iterations']/params['report_interval'])+1)) # vector to store test KL divergence values
    COST_PLOT = np.zeros(int(np.round(params['num_iterations']/params['report_interval'])+1)) # vector to store test ELBO values
    
    print('Training Inference Model...')    
    # START OPTIMISATION OF OELBO
    indices_generator = batch_manager.SequentialIndexer(params['batch_size'], xsh[0])
    ni = -1
    test_n = 100
    for i in range(params['num_iterations']):
        
        next_indices = indices_generator.next_indices()

        # Add a fresh noise realisation to each training example in the batch
        if params['do_extra_noise']:
            x_data_train_l = x_data[next_indices,:]
            y_data_train_l = y_data_train_noisefree[next_indices,:] + np.random.normal(0,1,size=(params['batch_size'],params['ndata']))
            y_data_train_l /= y_normscale[0]
            #print('generated {} elements of new training data noise'.format(params['batch_size']))

            session.run(minimize, feed_dict={bs_ph:bs, x_ph:x_data_train_l, lam_ph:lam, yt_ph:y_data_train_l}) # minimising cost function
        else:       
            #yn = session.run(y, feed_dict={bs_ph:bs, x_ph:x_data[next_indices, :], yt_ph:y_data_train_l[next_indices, :]})
            #session.run(minimize, feed_dict={bs_ph:bs, x_ph:x_data[next_indices, :],  y_ph:yn, lam_ph:lam, yt_ph:y_data_train_l[next_indices, :]}) # minimising cost function
            session.run(minimize, feed_dict={bs_ph:bs, x_ph:x_data[next_indices, :], lam_ph:lam, yt_ph:y_data_train_l[next_indices, :]}) # minimising cost function
        
        if i % params['report_interval'] == 0:
            ni = ni+1

            #ynt = session.run(y, feed_dict={bs_ph:test_n, x_ph:x_data[0:test_n,:], yt_ph:y_data_train_l[0:test_n,:]})
            #cost_value_vae, KL_VAE = session.run([COST_VAE, KL_vae], feed_dict={bs_ph:test_n, x_ph:x_data[0:test_n,:], y_ph:ynt, lam_ph:lam, yt_ph:y_data_train_l[0:test_n,:]})
            cost_value_vae, KL_VAE = session.run([COST_VAE, KL_vae], feed_dict={bs_ph:test_n, x_ph:x_data[0:test_n,:], lam_ph:lam, yt_ph:y_data_train_l[0:test_n,:]})
            KL_PLOT[ni] = KL_VAE
            COST_PLOT[ni] = -cost_value_vae

            # plot losses
            plotter.make_loss_plot(COST_PLOT[:ni+1],KL_PLOT[:ni+1],params['report_interval'],fwd=False)

            if params['print_values']:
                print('--------------------------------------------------------------')
                print('Iteration:',i)
                print('Training Set ELBO:',-cost_value_vae)
                print('KL Divergence:',KL_VAE)
       
        if i % params['plot_interval'] == 0 and i>0:
            # The trained inverse model weights can then be used to infer a probability density of solutions given new measurements
            _, _, XS, _  = VICI_inverse_model.run(params, y_data_test_h, np.shape(x_data_train)[1], "inverse_model_dir_%s/inverse_model.ckpt" % params['run_label'])

            # Convert XS back to unnormalized version
            if params['do_normscale']:
                for m in range(params['ndim_x']):
                    XS[:,m,:] = XS[:,m,:]*normscales[m]

            # Make corner plots
            plotter.make_corner_plot(sampler='dynesty1')

            # Make KL plot
            plotter.gen_kl_plots(VICI_inverse_model,y_data_test_h,x_data_train,normscales)

            # Make pp plot
#            plotter.plot_pp(VICI_inverse_model,y_data_train_l,x_data_train,0,normscales)
            

        if i % params['save_interval'] == 0:

            save_path = saver.save(session,save_dir)
                
                
    return COST_PLOT, KL_PLOT, train_files
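Each training loop draws minibatches through batch_manager.SequentialIndexer, whose implementation is not included in these examples. A hypothetical minimal version with the same next_indices() interface, inferred only from how it is called above:

import numpy as np

class SequentialIndexer:
    # Hypothetical stand-in for batch_manager.SequentialIndexer: yields
    # consecutive blocks of batch_size indices, wrapping at the end of
    # the dataset.
    def __init__(self, batch_size, n_data):
        self.batch_size = batch_size
        self.n_data = n_data
        self.cursor = 0

    def next_indices(self):
        idx = np.arange(self.cursor, self.cursor + self.batch_size) % self.n_data
        self.cursor = (self.cursor + self.batch_size) % self.n_data
        return idx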
Example #7
def train(params, x_data, y_data_h, y_data_l, save_dir, plotter):
    
    # LOAD DATA
    x_data = x_data[0:np.shape(y_data_h)[0],:]
    y_data_train_h = y_data_h
    y_data_train_l = y_data_l[0:np.shape(y_data_h)[0],:]
    
    # USEFUL SIZES
    xsh = np.shape(x_data)
    ysh_h = np.shape(y_data_train_h)
    ysh_l = np.shape(y_data_train_l)
    
    z_dimension = params['z_dimensions_fw']
    n_weights = params['n_weights_fw']
    
    graph = tf.Graph()
    session = tf.Session(graph=graph)
    with graph.as_default():
        tf.set_random_seed(np.random.randint(0,10))
        
        # PLACE HOLDERS
        x_ph = tf.placeholder(dtype=tf.float32, shape=[None, xsh[1]], name="x_ph")
        yl_ph = tf.placeholder(dtype=tf.float32, shape=[None, ysh_l[1]], name="yl_ph")
        yh_ph = tf.placeholder(dtype=tf.float32, shape=[None, ysh_h[1]], name="yh_ph")
        
        # LOAD NEURAL NETWORKS
        DEC_XYlZtoYh = OELBO_decoder_difference.VariationalAutoencoder("OELBO_decoder", ysh_h[1], z_dimension+ysh_l[1]+xsh[1], n_weights) # p(Yh|X,Yl,Z)
        ENC_XYltoZ = OELBO_encoder.VariationalAutoencoder("OELBO_encoder", ysh_l[1]+xsh[1], z_dimension, n_weights) # p(Z|X,Yl)
        ENC_XYhYltoZ = VAE_encoder.VariationalAutoencoder("vae_encoder", xsh[1]+ysh_l[1]+ysh_h[1], z_dimension, n_weights) # q(Z|X,Yl,Yh)
        
        # DEFINE MULTI-FIDELITY OBJECTIVE FUNCTION
        #####################################################################################################################
        SMALL_CONSTANT = 1e-6
        
        # NORMALISE INPUTS
        yl_ph_n = tf_normalise_dataset(yl_ph)
        yh_ph_n = tf_normalise_dataset(yh_ph)
        #yl_ph_n = yl_ph
        #yh_ph_n = yh_ph
#        yh_ph_d = tf_normalise_dataset(yh_ph-y_mu)
        yh_ph_d = yh_ph_n-yl_ph_n
        x_ph_n = tf_normalise_dataset(x_ph)
        #x_ph_n = x_ph
        
        # GET p(Z|X,Yl)
        zxyl_mean,zxyl_log_sig_sq = ENC_XYltoZ._calc_z_mean_and_sigma(tf.concat([x_ph_n,yl_ph_n],1))
        rxyl_samp = ENC_XYhYltoZ._sample_from_gaussian_dist(tf.shape(x_ph_n)[0], z_dimension, zxyl_mean, tf.log(tf.exp(zxyl_log_sig_sq)+SMALL_CONSTANT))
        
        # GET p(Yh|X,Yl,Z) FROM SAMPLES Z ~ p(Z|X,Yl)
        reconstruction_yh = DEC_XYlZtoYh.calc_reconstruction(tf.concat([x_ph_n,yl_ph_n,rxyl_samp],1))
        yh_diff = reconstruction_yh[0]
        yh_mean = yl_ph_n+yh_diff
        yh_log_sig_sq = reconstruction_yh[1]
#        yh_log_sig_sq = -10*tf.ones(tf.shape(reconstruction_yh[1]))
        
        # GET q(Z|X,Yl,Yh)
        zq_mean,zq_log_sig_sq = ENC_XYhYltoZ._calc_z_mean_and_sigma(tf.concat([x_ph_n,yl_ph_n,yh_ph_d],1))
        qzq_samp = ENC_XYhYltoZ._sample_from_gaussian_dist(tf.shape(x_ph_n)[0], z_dimension, zq_mean, tf.log(tf.exp(zq_log_sig_sq)+SMALL_CONSTANT))
        
        # GET p(Yh|X,Yl,Z) FROM SAMPLES Z ~ q(Z|X,Yl,Yh)
        reconstruction_yhq = DEC_XYlZtoYh.calc_reconstruction(tf.concat([x_ph_n,yl_ph_n,qzq_samp],1))
        yh_mean_vae = reconstruction_yhq[0]
        yh_log_sig_sq_vae = reconstruction_yhq[1]
#        yh_log_sig_sq_vae = -10*tf.ones(tf.shape(reconstruction_yh[1]))
        
        # COST FROM RECONSTRUCTION
        normalising_factor_yh_vae = - 0.5 * tf.log(SMALL_CONSTANT+tf.exp(yh_log_sig_sq_vae)) - 0.5 * np.log(2 * np.pi)
        square_diff_between_mu_and_yh_vae = tf.square(yh_mean_vae - yh_ph_d)
        inside_exp_yh_vae = -0.5 * tf.div(square_diff_between_mu_and_yh_vae,SMALL_CONSTANT+tf.exp(yh_log_sig_sq_vae))
        reconstr_loss_yh_vae = -tf.reduce_sum(normalising_factor_yh_vae + inside_exp_yh_vae, 1)
        cost_R_vae = tf.reduce_mean(reconstr_loss_yh_vae)
        
        # KL(q(Z|X,Yl,Yh)||p(Z|X,Yl))
        v_mean = zxyl_mean #2
        aux_mean = zq_mean #1
        v_log_sig_sq = tf.log(tf.exp(zxyl_log_sig_sq)+SMALL_CONSTANT) #2
        aux_log_sig_sq = tf.log(tf.exp(zq_log_sig_sq)+SMALL_CONSTANT) #1
        v_log_sig = tf.log(tf.sqrt(tf.exp(v_log_sig_sq))) #2
        aux_log_sig = tf.log(tf.sqrt(tf.exp(aux_log_sig_sq))) #1
        cost_VAE_a = v_log_sig-aux_log_sig+tf.divide(tf.exp(aux_log_sig_sq)+tf.square(aux_mean-v_mean),2*tf.exp(v_log_sig_sq))-0.5
        cost_VAE_b = tf.reduce_sum(cost_VAE_a,1)
        KL_vae = tf.reduce_mean(cost_VAE_b)
        
        # THE VICI COST FUNCTION
        COST = KL_vae+cost_R_vae
        
        ######################################################################################################################
        
        # TRAIN MULTI-FIDELITY MODEL
        var_list_ELBO = [var for var in tf.trainable_variables() if var.name.startswith("ELBO")]
        optimizer = tf.train.AdamOptimizer(params['initial_training_rate_fw']) 
        minimize = optimizer.minimize(COST,var_list = var_list_ELBO)
        
        # INITIALISE AND RUN SESSION
        init = tf.global_variables_initializer() # initialize_all_variables is deprecated
        saver = tf.train.Saver()
        session.run(init)
    
    # OPTIMISATION OF MULTI-FIDELITY MODEL
    ##################################################################################################################################################
    COST_PLOT_MF = np.zeros(int(np.round(params['num_iterations_fw']/params['report_interval_fw'])+1))
    KL_PLOT_MF = np.zeros(int(np.round(params['num_iterations_fw']/params['report_interval_fw'])+1))
    print('Training Multi-Fidelity Model...')
    indices_generator = batch_manager.SequentialIndexer(params['batch_size_fw'], xsh[0])
    nif = -1
    for i in range(params['num_iterations_fw']):
        next_indices = indices_generator.next_indices()
        session.run(minimize, feed_dict={x_ph:x_data[next_indices, :], yl_ph:y_data_train_l[next_indices, :], yh_ph:y_data_train_h[next_indices, :]})
        if i % params['report_interval_fw'] == 0:
            nif = nif+1
            COST_MF, KL_div = session.run([COST,KL_vae], feed_dict={x_ph:x_data[0:100, :], yl_ph:y_data_train_l[0:100, :], yh_ph:y_data_train_h[0:100, :]})
            COST_PLOT_MF[nif] = COST_MF
            KL_PLOT_MF[nif] = KL_div

            if params['print_values']:
                print('--------------------------------------------------------------')
                print('Iteration:',i)
                print('Cost Multi-Fidelity Model:',COST_MF)
                print('Multi-Fidelity Model KL Divergence:',KL_div)
    
        if i % params['save_interval_fw'] == 0:
            save_path = saver.save(session,save_dir)

            # Generate forward estimation results
            plotter.plot_y_test(i)
            plotter.plot_y_dist(i)
    
    return COST_PLOT_MF, KL_PLOT_MF
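All of the examples normalise their placeholders with tf_normalise_dataset, which is also not shown. A plausible per-feature standardisation in the same TF1 style; this is an assumed implementation, not the original helper:

import tensorflow as tf

def tf_normalise_dataset(x_ph, eps=1e-6):
    # Assumed behaviour: standardise each feature to zero mean and unit
    # variance across the batch; eps guards against zero variance.
    mean, var = tf.nn.moments(x_ph, axes=[0])
    return (x_ph - mean) / tf.sqrt(var + eps)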