Example no. 1
def gain_test(data_test, sess, G_sample, X, M):
    data_m_test = 1 - np.isnan(data_test)

    no_test, dim_test = data_test.shape

    norm_data_t, norm_parameters_test = normalization(data_test)
    norm_data_test = np.nan_to_num(norm_data_t, 0)

    # Prepare data format
    Z_mb_test = uniform_sampler(0, 0.01, no_test, dim_test)
    M_mb_test = data_m_test
    X_mb_test = norm_data_test
    X_mb_test = M_mb_test * X_mb_test + (1 - M_mb_test) * Z_mb_test

    # Impute data test
    imputed_data_test = sess.run([G_sample],
                                 feed_dict={
                                     X: X_mb_test,
                                     M: M_mb_test
                                 })[0]
    imputed_data_test = data_m_test * norm_data_test + (
        1 - data_m_test) * imputed_data_test

    # Renormalization
    imputed_data_test = renormalization(imputed_data_test,
                                        norm_parameters_test)

    # Rounding
    imputed_data_test = rounding(imputed_data_test, data_test)

    return imputed_data_test
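
A minimal usage sketch for gain_test, assuming the GAIN graph has already been trained in a tf.compat.v1 session and that sess, G_sample and the X/M placeholders are the handles created during training (the file name below is hypothetical):

import numpy as np

# sess, G_sample, X, M are assumed to be the handles produced by the training routine.
test_with_missing = np.loadtxt('test_with_missing.csv', delimiter=',')  # NaNs mark missing cells
imputed_test = gain_test(test_with_missing, sess, G_sample, X, M)
print(imputed_test.shape)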
Example no. 2
def evaluation_step(generator, data_m, norm_data_x, data_x, ori_data_x,
                    normalizer):
    """
        The original paper implementation has no validation step.
        For convenience we use the RMSE value as the metric for early stopping
        and for monitoring the model's performance during training.
    """
    Z_mb = uniform_sampler(0, 0.01, data_m.shape[0], data_m.shape[1])
    Z_mb = Z_mb.astype('float32')
    M_mb = data_m
    M_mb = M_mb.astype('float32')
    X_mb = norm_data_x
    X_mb = X_mb.astype('float32')
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = generator.predict(
        tf.concat([X_mb.values, M_mb.values], axis=1))[0]
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = normalizer.denormalize(imputed_data)

    # Rounding
    imputed_data_values = rounding(imputed_data.values, data_x.values)

    rmse = rmse_loss(ori_data_x.values, imputed_data_values, data_m.values)
    imputed_and_rounded_df_to_use_for_downstream_task = pd.DataFrame(
        data=imputed_data_values, columns=imputed_data.columns)
    return rmse, imputed_and_rounded_df_to_use_for_downstream_task
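
A sketch of how evaluation_step could drive early stopping, assuming generator is the tf.keras model being trained and train_one_step is a hypothetical function that performs one training iteration on it:

import numpy as np

best_rmse, patience, wait = np.inf, 10, 0              # patience value is arbitrary
for epoch in range(1000):
    train_one_step(generator)                          # hypothetical training call
    rmse, imputed_df = evaluation_step(generator, data_m, norm_data_x,
                                       data_x, ori_data_x, normalizer)
    if rmse < best_rmse:
        best_rmse, wait = rmse, 0
    else:
        wait += 1
        if wait >= patience:
            break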
Example no. 3
def test(data_m, data_x, dim, generator, no, norm_data_x, norm_parameters, ori_data_x, test_index):
    # Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    imputed_data = generator(torch.Tensor(X_mb), torch.Tensor(M_mb)).detach().numpy()
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding
    imputed_data = rounding(imputed_data, data_x)
    rmse, rmse_mean = rmse_loss(ori_data_x[test_index], imputed_data[test_index], data_m[test_index])
    rmse_full, rmse_full_mean = rmse_loss(ori_data_x, imputed_data, data_m)
    print(f'RMSE Performance (mean): {rmse_mean:.4f} (test), {rmse_full_mean:.4f} (full).')
    # print(f'RMSE Performance: {rmse:.4f} (test), {rmse_full:.4f} (full).')
    return rmse
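
A call sketch for test, assuming generator is the trained torch.nn.Module and the remaining arrays come from the usual GAIN preprocessing; the test split below is hypothetical:

import numpy as np

test_index = np.arange(int(0.2 * no))                  # hypothetical 20% test split
rmse_test = test(data_m, data_x, dim, generator, no,
                 norm_data_x, norm_parameters, ori_data_x, test_index)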
Example no. 4
def cph(data_x, cph_parameters, data_image):
    '''Impute missing values in data_x

  Args:
    - data_x: original data with missing values
    - data_image: image-like tensor of shape (1, 483, dim, 3) fed to the CNN front-end
    - cph_parameters: CPH network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations

  Returns:
    - imputed_data: imputed data
  '''
    # Fix random seeds for reproducibility
    seed = 25
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = cph_parameters['batch_size']
    hint_rate = cph_parameters['hint_rate']
    alpha = cph_parameters['alpha']
    iterations = cph_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)
    #print(h_dim)

    # Normalization
    # NOTE: this variant keeps the data on its original scale: the normalized
    # matrix is computed but not used, and renormalization is skipped below.
    norm_data, norm_parameters = normalization(data_x)
    #norm_data_x = np.nan_to_num(norm_data, 0)
    norm_data_x = np.nan_to_num(data_x, 0)

    ## CPH architecture
    # Input placeholders
    # Image-like input of shape (1, 483, dim, 3); the row count 483 is hard-coded
    # to the dataset used here (see the reshape inside the generator below)
    X_pre = tf.placeholder(tf.float32, shape=[1, 483, dim, 3])
    # Data vector
    #X = tf.placeholder(tf.float32, shape = [None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    conv_filter_w1 = tf.Variable(tf.random_normal([1, 4, 3, 3]))
    conv_filter_b1 = tf.Variable(tf.random_normal([3]))

    conv_filter_w2 = tf.Variable(tf.random_normal([1, 4, 3, 1]))
    conv_filter_b2 = tf.Variable(tf.random_normal([1]))
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))

    theta_G = [
        G_W1, G_W2, G_W3, G_b1, G_b2, G_b3, conv_filter_w1, conv_filter_b1,
        conv_filter_w2, conv_filter_b2
    ]

    ## CPH functions
    # CNN + Generator
    def generator(x, m):
        relu_feature_maps1 = tf.nn.relu( \
          tf.nn.conv2d(x, conv_filter_w1, strides=[1, 1, 1, 1], padding='SAME') + conv_filter_b1)
        max_pool1 = tf.nn.max_pool(relu_feature_maps1,
                                   ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME')

        relu_feature_maps2 = tf.nn.relu( \
          tf.nn.conv2d(max_pool1, conv_filter_w2, strides=[1, 1, 1, 1], padding='SAME') + conv_filter_b2)
        max_pool2 = tf.nn.max_pool(relu_feature_maps2,
                                   ksize=[1, 1, 4, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME')

        x2 = tf.reshape(max_pool2, [483, dim])

        # Concatenate Mask and Data
        inputs = tf.concat(values=[x2, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## CPH structure
    # Generator
    G_sample = generator(X_pre, M)
    X2 = X_pre[0, :, :, 0]
    # Combine with observed data
    Hat_X = X2 * M + G_sample * (1 - M)

    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## CPH loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))

    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))

    MSE_loss = \
    tf.reduce_mean((M * X2 - M * G_sample)**2) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## CPH solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):

        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        #print(len(batch_idx))
        image_mb = data_image[:, batch_idx, :, :]
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)

        H_mb = M_mb * H_mb_temp
        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        image_mb[0, :, :, 0] = X_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X_pre: image_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss],
                 feed_dict = {X_pre: image_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    image_mb = data_image.copy()  # copy so the caller's array is not modified in place
    image_mb[0, :, :, 0] = X_mb

    imputed_data = sess.run([G_sample], feed_dict={
        X_pre: image_mb,
        M: M_mb
    })[0]

    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    #imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data
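
A call sketch for cph; the parameter values are hypothetical, and data_x must have exactly 483 rows because both the X_pre placeholder and the reshape inside the generator hard-code that size, with data_image being the matching (1, 483, dim, 3) tensor:

cph_parameters = {'batch_size': 128, 'hint_rate': 0.9,
                  'alpha': 100, 'iterations': 10000}   # hypothetical values
imputed = cph(data_x, cph_parameters, data_image)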
Example no. 5
def gain(data_x, feature_name, onehotencoder, ori_data_dim, gain_parameters):
    '''Impute missing values in data_x
  
  Args:
    - data_x: original data with missing values
    - feature_name: feature namelist of original data
    - onehotencoder: onehotencoder of this data
    - ori_data_dim: dimensions of original data    
    - gain_parameters: GAIN network parameters:
      - data_name: the file name of dataset
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations
      - onehot: the number of feature for onehot encoder (start from first feature)
      - predict: option for prediction mode
      
  Returns:
    - imputed_data: imputed data
  '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    data_name = gain_parameters['data_name']
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    onehot = gain_parameters['onehot']
    predict = gain_parameters['predict']

    # Model Path
    model_path = 'model/' + data_name

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim], name='X')
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim], name='M')
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim], name='H')

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]),
                       name='D_W1')  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b1')

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='D_W2')
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='D_b2')

    D_W3 = tf.Variable(xavier_init([h_dim, dim]), name='D_W3')
    D_b3 = tf.Variable(tf.zeros(shape=[dim]),
                       name='D_b3')  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='G_W1')
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b1')

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='G_W2')
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b2')

    G_W3 = tf.Variable(xavier_init([h_dim, dim]), name='G_W3')
    G_b3 = tf.Variable(tf.zeros(shape=[dim]), name='G_b3')

    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)

    # Combine with observed data
    Hat_X = X * M + G_sample * (1 - M)

    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))

    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))

    MSE_loss = \
    tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    saver = tf.train.Saver()
    if predict is True and os.path.exists(model_path + '.ckpt.meta'):
        print("Model Restore")
        saver.restore(sess, model_path + '.ckpt')
    else:
        sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):

        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp

        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X: X_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss],
                 feed_dict = {X: X_mb, M: M_mb, H: H_mb})
    if predict is False:
        save_path = saver.save(sess, model_path + '.ckpt')

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]

    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    # Reverse encoding
    if onehot > 0:
        imputed_data = reverse_encoding(imputed_data, feature_name,
                                        onehotencoder, onehot, ori_data_dim)

    return imputed_data
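
A sketch of the two modes this variant supports; parameter values are hypothetical, and the checkpoint is written to model/<data_name>.ckpt as in the code above:

gain_parameters = {'data_name': 'spam', 'batch_size': 128, 'hint_rate': 0.9,
                   'alpha': 100, 'iterations': 10000, 'onehot': 0, 'predict': False}
imputed_train = gain(data_x, feature_name, onehotencoder, ori_data_dim, gain_parameters)

# Later: initialize from the saved checkpoint instead of from random weights
gain_parameters['predict'] = True
imputed_new = gain(new_data_x, feature_name, onehotencoder, ori_data_dim, gain_parameters)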
Example no. 6
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x
  
  Args:
    - data_x: original data with missing values
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations
      
  Returns:
    - imputed_data: imputed data
  '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))

    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Generator
    G_sample = generator(X, M)

    # Combine with observed data
    Hat_X = X * M + G_sample * (1 - M)

    # Discriminator
    D_prob = discriminator(Hat_X, H)

    ## GAIN loss
    D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                  + (1-M) * tf.log(1. - D_prob + 1e-8))

    G_loss_temp = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8))

    MSE_loss = \
    tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    for it in tqdm(range(iterations)):

        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = data_m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
        # Sample hint vectors
        H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        H_mb = M_mb * H_mb_temp

        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      M: M_mb,
                                      X: X_mb,
                                      H: H_mb
                                  })
        _, G_loss_curr, MSE_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss],
                 feed_dict = {X: X_mb, M: M_mb, H: H_mb})

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]

    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data
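
The usual call pattern for this implementation; the values below follow the defaults of the reference GAIN code and should be treated only as a starting point:

gain_parameters = {'batch_size': 128, 'hint_rate': 0.9,
                   'alpha': 100, 'iterations': 10000}
imputed_data = gain(miss_data_x, gain_parameters)      # miss_data_x is a numpy array with NaNs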
Example no. 7
def egain(miss_data_x, gain_parameters):
    #def Egain(miss_data_x, gain_parameters):
    '''Impute missing values in data_x
  
  Args:
    - miss_data_x: missing data
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - alpha: Hyperparameter
      - iterations: Iterations
      
  Returns:
    - imputed_data: imputed data
  '''
    # Define mask matrix
    m = 1 - np.isnan(miss_data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    # hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    loss_type = ['trickLogD', 'minimax', 'ls']
    nloss = 3
    beta = 1.0
    ncandi = 1  # number of candidate generators (1 or 3)
    nbest = 1  # number of best generators kept (1 or 3)
    nD = 1  # number of discriminator updates for each generator update
    # Other parameters
    no, dim = miss_data_x.shape
    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(miss_data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    # Reset the default graph so repeated calls do not accumulate variables
    tf.compat.v1.reset_default_graph()
    # Input placeholders
    # Data vector
    X = tf1.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf1.placeholder(tf.float32, shape=[None, dim])
    # B vector
    B = tf1.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    # Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))

    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure
    # Hint vector
    H = B * M + 0.5 * (1 - B)  # 0.5 => 0.1
    # Generator
    G_sample = generator(X, M)
    D_prob_g = discriminator(X * M + G_sample * (1 - M), H)

    # Combine with observed data
    fake_X = tf1.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    Hat_X = X * M + fake_X * (1 - M)

    # D loss
    D_prob = discriminator(Hat_X, H)
    D_loss_temp = -tf.reduce_mean(
        (M * tf1.log(D_prob + 1e-8) + (1 - M) * tf1.log(1. - D_prob + 1e-8)))
    D_loss = D_loss_temp
    # Update parameters
    D_solver = tf1.train.AdamOptimizer(learning_rate=0.002,
                                       beta1=0.5,
                                       beta2=0.99).minimize(D_loss,
                                                            var_list=theta_D)

    # G loss
    #Update loss function
    G_loss_logD = -tf.reduce_mean((1 - M) * 1 / 2 * tf1.log(D_prob_g + 1e-8))
    G_loss_minimax = tf.reduce_mean(
        (1 - M) * 1 / 2 * tf1.log(1. - D_prob_g + 1e-8))
    G_loss_ls = tf1.reduce_mean((1 - M) * tf1.square(D_prob_g - 1))

    MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

    G_loss_logD_all = G_loss_logD + alpha * MSE_loss
    G_loss_minimax_all = G_loss_minimax + alpha * MSE_loss
    G_loss_ls_all = G_loss_ls + alpha * MSE_loss

    # Update parameters
    G_solver_logD = tf1.train.AdamOptimizer(learning_rate=0.002,
                                            beta1=0.5,
                                            beta2=0.99).minimize(
                                                G_loss_logD_all,
                                                var_list=theta_G)
    G_solver_minimax = tf1.train.AdamOptimizer(learning_rate=0.002,
                                               beta1=0.5,
                                               beta2=0.99).minimize(
                                                   G_loss_minimax_all,
                                                   var_list=theta_G)
    G_solver_ls = tf1.train.AdamOptimizer(learning_rate=0.002,
                                          beta1=0.5,
                                          beta2=0.99).minimize(
                                              G_loss_ls_all, var_list=theta_G)

    # Fitness function
    Fq_score = tf.reduce_mean((1 - M) * D_prob)
    Fd_score = -tf1.log(
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[0]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[1]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[2]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[3]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[4]))) +
        tf.reduce_sum(tf.square(tf.gradients(D_loss_temp, theta_D[5]))))

    ## Iterations
    sess = tf1.Session()
    # Start Iterations

    gen_new_params = []
    fitness_best = np.zeros(nbest)
    fitness_candi = np.zeros(ncandi)
    # for it in tqdm(range(iterations)):
    for it in tqdm(range(iterations)):
        # Train candidates G
        if it == 0:
            for can_i in range(0, ncandi):
                sess.run(tf1.global_variables_initializer())
                batch_idx = sample_batch_index(no, batch_size)
                X_mb = norm_data_x[batch_idx, :]
                M_mb = m[batch_idx, :]
                Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
                X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
                B_mb = sample_batch_binary(dim, batch_size)
                gen_samples = sess.run([G_sample],
                                       feed_dict={
                                           X: X_mb,
                                           M: M_mb
                                       })[0]
                fq_score, fd_score = sess.run([Fq_score, Fd_score],
                                              feed_dict={
                                                  X: X_mb,
                                                  M: M_mb,
                                                  fake_X: gen_samples,
                                                  B: B_mb
                                              })
                fitness = fq_score + beta * fd_score
                fitness_best[can_i] = fitness
                params = []
                for param in theta_G:
                    params.append(sess.run(param))
                gen_new_params.append(params)
            gen_best_params = copy.deepcopy(gen_new_params)
        else:
            # generate new candidate
            gen_old_params = copy.deepcopy(gen_new_params)
            # print(gen_old_params[0][-1])
            # print(it)
            for can_i in range(ncandi):
                for type_i in range(nloss):
                    batch_idx = sample_batch_index(no, batch_size)
                    X_mb = norm_data_x[batch_idx, :]
                    M_mb = m[batch_idx, :]
                    Z_mb = uniform_sampler(0.0, 0.01, batch_size,
                                           dim)  # update 1.0 ==> 0.01
                    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
                    B_mb = sample_batch_binary(dim, batch_size)
                    # Load and update weights
                    for i in range(len(theta_G)):
                        theta_G[i].load(gen_old_params[can_i][i], sess)
                    loss = loss_type[type_i]
                    if loss == 'trickLogD':
                        sess.run([G_solver_minimax],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })
                    elif loss == 'minimax':
                        sess.run([G_solver_logD],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })
                    elif loss == 'ls':
                        sess.run([G_solver_ls],
                                 feed_dict={
                                     X: X_mb,
                                     M: M_mb,
                                     B: B_mb
                                 })

                    # calculate fitness score
                    gen_samples = sess.run([G_sample],
                                           feed_dict={
                                               X: X_mb,
                                               M: M_mb
                                           })[0]
                    fq_score, fd_score = sess.run([Fq_score, Fd_score],
                                                  feed_dict={
                                                      X: X_mb,
                                                      M: M_mb,
                                                      fake_X: gen_samples,
                                                      B: B_mb
                                                  })
                    fitness = fq_score + beta * fd_score
                    # print(fitness)
                    gap = fitness_best - fitness
                    if min(gap) < 0:
                        idx_replace = np.argmin(gap)
                        params = []
                        for param in theta_G:
                            params.append(sess.run(param))
                        gen_best_params[idx_replace] = params
                        fitness_best[idx_replace] = fitness

                    if can_i * nloss + type_i < ncandi:
                        idx = can_i * nloss + type_i
                        params = []
                        for param in theta_G:
                            params.append(sess.run(param))
                        gen_new_params[idx] = params
                        fitness_candi[idx] = fitness
                    else:
                        gap = fitness_candi - fitness
                        if min(gap) < 0:
                            idx_replace = np.argmin(gap)
                            params = []
                            for param in theta_G:
                                params.append(sess.run(param))
                            gen_new_params[idx_replace] = params
                            fitness_candi[idx_replace] = fitness
        # Train D
        for i in range(nD):
            batch_idx = sample_batch_index(no, batch_size)
            X_mb = norm_data_x[batch_idx, :]
            M_mb = m[batch_idx, :]
            Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)  # 1.0 ==> 0.01
            X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
            B_mb = sample_batch_binary(dim, batch_size)
            # impute data for each candidate
            for can_i in range(ncandi):
                for w in range(len(theta_G)):
                    theta_G[w].load(gen_new_params[can_i][w], sess)
                if can_i == ncandi - 1:
                    gen_samples_cani = sess.run(
                        [G_sample],
                        feed_dict={
                            X: X_mb[can_i * batch_size // ncandi:],
                            M: M_mb[can_i * batch_size // ncandi:]
                        })[0]
                else:
                    gen_samples_cani = sess.run(
                        [G_sample],
                        feed_dict={
                            X:
                            X_mb[can_i * batch_size // ncandi:(can_i + 1) *
                                 batch_size // ncandi],
                            M:
                            M_mb[can_i * batch_size // ncandi:(can_i + 1) *
                                 batch_size // ncandi]
                        })[0]
                # print(gen_samples_cani.shape)
                if can_i == 0:
                    gen_samples = gen_samples_cani
                else:
                    gen_samples = np.append(gen_samples,
                                            gen_samples_cani,
                                            axis=0)
            sess.run([D_solver],
                     feed_dict={
                         X: X_mb,
                         M: M_mb,
                         fake_X: gen_samples,
                         B: B_mb
                     })

    ## Return imputed data
    idx = np.argmax(fitness_best)
    # print(idx)
    for i in range(len(theta_G)):
        theta_G[i].load(gen_best_params[idx][i], sess)

    Z_mb = uniform_sampler(0.0, 0.01, no, dim)
    M_mb = m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    sess.close()
    imputed_data = m * norm_data_x + (1 - m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, miss_data_x)

    return imputed_data
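
A call sketch for egain; unlike plain GAIN there is no hint_rate key, because the hint vector is built from the sampled B matrix (values below are hypothetical):

gain_parameters = {'batch_size': 128, 'alpha': 100, 'iterations': 10000}
imputed_data = egain(miss_data_x, gain_parameters)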
Example no. 8
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x

    Args:
    - data_x: original data with missing values
    - gain_parameters: GAIN network parameters:
        - batch_size: Batch size
        - hint_rate: Hint rate
        - alpha: Hyperparameter
        - iterations: Iterations
        
    Returns:
    - imputed_data: imputed data
    '''
    # Define mask matrix
    data_m = (1 - np.isnan(data_x)).astype(float)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    # parameter initialization
    X = tf.convert_to_tensor(norm_data_x)
    X = tf.dtypes.cast(X, tf.float32)
    M = tf.convert_to_tensor(data_m)
    M = tf.dtypes.cast(M, tf.float32)
    X_input = tf.concat(values=[X, M], axis=1)

    ## GAIN architecture
    # Generator
    class Generator(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')
            return

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            x = self.dense_output(x)
            return x

    # Discriminator
    class Discriminator(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.flatten = layers.Flatten(input_shape=[dim * 2])
            self.dense1 = layers.Dense(h_dim, activation='relu')
            self.dense2 = layers.Dense(h_dim, activation='relu')
            self.dense_output = layers.Dense(dim, activation='sigmoid')
            return

        def call(self, inputs, training=None):
            x = self.flatten(inputs)
            x = self.dense1(x)
            x = self.dense2(x)
            x = self.dense_output(x)
            return x

    ## GAIN loss
    # Generator
    def generator_loss(generator, discriminator, x, m):
        generator.trainable = True
        discriminator.trainable = False
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        MSE_loss = tf.reduce_mean(
            (m * x - m * G_sample)**2) / tf.reduce_mean(m)
        D_input = tf.concat(values=[G_sample, m], axis=1)
        D_prob = discriminator(D_input)
        G_loss_tmp = -tf.reduce_mean((1 - m) * tf.math.log(D_prob + 1e-8))
        return G_loss_tmp + alpha * MSE_loss

    # Discriminator
    def discriminator_loss(generator, discriminator, x, m, h):
        generator.trainable = False
        discriminator.trainable = True
        G_input = tf.concat(values=[x, m], axis=1)
        G_sample = generator(G_input)
        x_hat = x * m + G_sample * (1 - m)
        D_input = tf.concat(values=[x_hat, h], axis=1)
        D_prob = discriminator(D_input)
        return -tf.reduce_mean(m * tf.math.log(D_prob + 1e-8) \
                + (1-m) * tf.math.log(1. - D_prob + 1e-8))

    # Build
    generator = Generator()
    generator.build(input_shape=(None, 2 * dim))
    g_optimizer = tf.keras.optimizers.Adam()
    discriminator = Discriminator()
    discriminator.build(input_shape=(None, 2 * dim))
    d_optimizer = tf.keras.optimizers.Adam()

    # Training
    one_tensor = tf.constant(1., shape=(batch_size, dim), dtype=float)

    for _ in tqdm(range(iterations)):
        # Sample batch
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = tf.gather(X, batch_idx)
        M_mb = tf.gather(M, batch_idx)
        Z_mb = tf.convert_to_tensor(uniform_sampler(0, 0.01, batch_size, dim),
                                    dtype=float)
        H_mb_tmp = tf.convert_to_tensor(binary_sampler(hint_rate, batch_size,
                                                       dim),
                                        dtype=float)
        H_mb = tf.math.multiply(M_mb, H_mb_tmp)

        # Combine random vectors with observed vectors
        # X_mb = M_mb * X_mb + (1-M_mb) * Z_mb
        X_mb = tf.math.add(tf.math.multiply(M_mb, X_mb), \
                tf.math.multiply(tf.math.subtract(one_tensor, M_mb), Z_mb))

        # training Discriminator
        with tf.GradientTape() as tape:
            d_loss = discriminator_loss(generator, discriminator, X_mb, M_mb,
                                        H_mb)
        grads = tape.gradient(d_loss, discriminator.trainable_variables)
        d_optimizer.apply_gradients(
            zip(grads, discriminator.trainable_variables))

        # training Generator
        with tf.GradientTape() as tape:
            g_loss = generator_loss(generator, discriminator, X_mb, M_mb)
        grads = tape.gradient(g_loss, generator.trainable_variables)
        g_optimizer.apply_gradients(zip(grads, generator.trainable_variables))

    ## Return imputed data
    imputed_data = np.array([]).reshape(0, dim)
    train_data = tf.data.Dataset.from_tensor_slices(X_input).batch(batch_size)
    for batch in train_data:
        X_tmp = generator(batch).numpy()
        imputed_data = np.vstack([imputed_data, X_tmp])

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Recovery
    imputed_data = data_m * np.nan_to_num(data_x) + (1 - data_m) * imputed_data

    # Rounding
    imputed_data = rounding(imputed_data, data_x)

    return imputed_data
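
A call sketch for the TF2/Keras variant, followed by the usual benchmark-style check; it assumes the rmse_loss helper used in Example no. 2 and that ori_data_x, the complete ground-truth matrix, is available (only true in benchmark settings):

import numpy as np

gain_parameters = {'batch_size': 128, 'hint_rate': 0.9, 'alpha': 100, 'iterations': 10000}
imputed_data = gain(miss_data_x, gain_parameters)
rmse = rmse_loss(ori_data_x, imputed_data, 1 - np.isnan(miss_data_x))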
Example no. 9
def gain(data_x, gain_parameters):
    '''Impute missing values in data_x
  
  Args:
    - data_x: original data with missing values
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - hint_rate: Hint rate
      - alpha: Hyperparameter
      - iterations: Iterations
      
  Returns:
    - imputed_data: imputed data
  '''
    # Define mask matrix
    data_m = 1 - np.isnan(data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']
    checkpoint_dir = gain_parameters['checkpoint_dir']
    data_name = gain_parameters['data_name']

    # Other parameters
    no, dim = data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    # Input placeholders
    # Data vector
    X = tf.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    H = tf.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    D_W3 = tf.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]), name='G_W1')
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b1')

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]), name='G_W2')
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]), name='G_b2')

    G_W3 = tf.Variable(xavier_init([h_dim, dim]), name='G_W3')
    G_b3 = tf.Variable(tf.zeros(shape=[dim]), name='G_b3')

    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    # save models
    def save_model(sess, checkpoint_dir):
        model_name = "gain_model"
        model_dir = "%s" % (data_name)
        checkpoint_dir = os.path.join(checkpoint_dir, model_dir)

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        saver.save(sess, os.path.join(checkpoint_dir, model_name))

    # ## GAIN structure
    # Generator
    G_sample = generator(X, M)

    # # Combine with observed data
    # Hat_X = X * M + G_sample * (1-M)

    # # Discriminator
    # D_prob = discriminator(Hat_X, H)

    # ## GAIN loss
    # D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
    #                               + (1-M) * tf.log(1. - D_prob + 1e-8))

    # G_loss_temp = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))

    # MSE_loss = \
    # tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

    # D_loss = D_loss_temp
    # G_loss = G_loss_temp + alpha * MSE_loss

    # ## GAIN solver
    # D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    # G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    saver = tf.train.Saver(max_to_keep=1)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Start Iterations
    # for it in tqdm(range(iterations)):

    #   # Sample batch
    #   batch_idx = sample_batch_index(no, batch_size)
    #   X_mb = norm_data_x[batch_idx, :]
    #   M_mb = data_m[batch_idx, :]
    #   # Sample random vectors
    #   Z_mb = uniform_sampler(0, 0.01, batch_size, dim)
    #   # Sample hint vectors
    #   H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
    #   H_mb = M_mb * H_mb_temp

    #   # Combine random vectors with observed vectors
    #   X_mb = M_mb * X_mb + (1-M_mb) * Z_mb

    #   _, D_loss_curr = sess.run([D_solver, D_loss_temp],
    #                             feed_dict = {M: M_mb, X: X_mb, H: H_mb})
    #   _, G_loss_curr, MSE_loss_curr = \
    #   sess.run([G_solver, G_loss_temp, MSE_loss],
    #            feed_dict = {X: X_mb, M: M_mb, H: H_mb})
    # save_model(sess, checkpoint_dir)
    print('testing mode')
    # restore the model
    # G_sample = load(sess, checkpoint_dir)
    print(" [*] Reading checkpoint...")

    # model_dir = "%s" % (data_name)
    # checkpoint_dir = os.path.join(checkpoint_dir, model_dir)

    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        print('The model loaded successfully')
        ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
        saver.restore(sess, os.path.join(checkpoint_dir, ckpt_name))
        # print(sess.run(G_b1))
        G_W1 = sess.run(G_W1)
        G_b1 = sess.run(G_b1)

        G_W2 = sess.run(G_W2)
        G_b2 = sess.run(G_b2)

        G_W3 = sess.run(G_W3)
        G_b3 = sess.run(G_b3)
    else:
        print('failed to load the model, check model path')

    ## Return imputed data
    Z_mb = uniform_sampler(0, 0.01, no, dim)
    M_mb = data_m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb

    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]

    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, data_x)
    return imputed_data
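
This variant only restores a previously trained generator, so checkpoint_dir must already contain a checkpoint saved by the (commented-out) training code; paths and values below are hypothetical:

gain_parameters = {'batch_size': 128, 'hint_rate': 0.9, 'alpha': 100,
                   'iterations': 10000, 'checkpoint_dir': 'checkpoint',
                   'data_name': 'spam'}
imputed_data = gain(miss_data_x, gain_parameters)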
Example no. 10
def gain(miss_data_x, gain_parameters):
    '''Impute missing values in data_x
  
  Args:
    - miss_data_x: missing data
    - gain_parameters: GAIN network parameters:
      - batch_size: Batch size
      - alpha: Hyperparameter
      - iterations: Iterations
      
  Returns:
    - imputed_data: imputed data
  '''
    # Define mask matrix
    m = 1 - np.isnan(miss_data_x)

    # System parameters
    batch_size = gain_parameters['batch_size']
    # hint_rate = gain_parameters['hint_rate']
    alpha = gain_parameters['alpha']
    iterations = gain_parameters['iterations']

    # Other parameters
    no, dim = miss_data_x.shape

    # Hidden state dimensions
    h_dim = int(dim)

    # Normalization
    norm_data, norm_parameters = normalization(miss_data_x)
    norm_data_x = np.nan_to_num(norm_data, 0)

    ## GAIN architecture
    tf1.reset_default_graph()
    # Input placeholders
    # Data vector
    X = tf1.placeholder(tf.float32, shape=[None, dim])
    # Mask vector
    M = tf1.placeholder(tf.float32, shape=[None, dim])
    # # Hint vector
    # H = tf.placeholder(tf.float32, shape = [None, dim])
    # B vector
    B = tf1.placeholder(tf.float32, shape=[None, dim])

    # Discriminator variables
    D_W1 = tf1.Variable(xavier_init([dim * 2, h_dim]))  # Data + Hint as inputs
    D_b1 = tf1.Variable(tf.zeros(shape=[h_dim]))

    D_W2 = tf1.Variable(xavier_init([h_dim, h_dim]))
    D_b2 = tf1.Variable(tf.zeros(shape=[h_dim]))

    D_W3 = tf1.Variable(xavier_init([h_dim, dim]))
    D_b3 = tf1.Variable(tf.zeros(shape=[dim]))  # Multi-variate outputs

    theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]

    #Generator variables
    # Data + Mask as inputs (Random noise is in missing components)
    G_W1 = tf.Variable(xavier_init([dim * 2, h_dim]))
    G_b1 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
    G_b2 = tf.Variable(tf.zeros(shape=[h_dim]))

    G_W3 = tf.Variable(xavier_init([h_dim, dim]))
    G_b3 = tf.Variable(tf.zeros(shape=[dim]))

    theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]

    ## GAIN functions
    # Generator
    def generator(x, m):
        # Concatenate Mask and Data
        inputs = tf.concat(values=[x, m], axis=1)
        G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
        G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)
        # MinMax normalized output
        G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3)
        return G_prob

    # Discriminator
    def discriminator(x, h):
        # Concatenate Data and Hint
        inputs = tf.concat(values=[x, h], axis=1)
        D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)
        D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
        D_logit = tf.matmul(D_h2, D_W3) + D_b3
        D_prob = tf.nn.sigmoid(D_logit)
        return D_prob

    ## GAIN structure

    # Generator
    G_sample = generator(X, M)
    H = B * M + 0.5 * (1 - B)
    D_prob_g = discriminator(X * M + G_sample * (1 - M), H)

    fake_X = tf1.placeholder(tf.float32, shape=[None, dim])
    # Hint vector
    Hat_X = X * M + fake_X * (1 - M)
    # Discriminator
    D_prob = discriminator(Hat_X, H)

    # GAIN loss
    # D_loss_temp = -tf.reduce_mean((1-B)*(M * tf.log(D_prob + 1e-8) \
    #                               + (1-M) * tf.log(1. - D_prob + 1e-8))) \
    #                               / tf.reduce_mean(1-B)
    #
    # G_loss_temp = -tf.reduce_mean((1-B)*(1-M) * tf.log(D_prob + 1e-8)) / tf.reduce_mean(1-B)
    D_loss_temp = -tf.reduce_mean((M * tf1.log(D_prob + 1e-8) \
                                  + (1-M) * tf1.log(1. - D_prob + 1e-8)))

    G_loss_temp = -tf.reduce_mean((1 - M) * tf1.log(D_prob_g + 1e-8))
    MSE_loss = tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)

    D_loss = D_loss_temp
    G_loss = G_loss_temp + alpha * MSE_loss

    ## GAIN solver
    D_solver = tf1.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf1.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

    ## Iterations
    sess = tf1.Session()
    sess.run(tf1.global_variables_initializer())
    gen_new_params = []
    params = []
    for param in theta_G:
        params.append(sess.run(param))
    gen_new_params.append(params)

    for it in range(iterations):
        # for it in tqdm(range(iterations)):
        # Sample batch
        # print(sess.run(theta_G[-1]))
        gen_old_params = copy.deepcopy(gen_new_params)
        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
        # Sample hint vectors
        # H_mb_temp = binary_sampler(0.9, batch_size, dim)
        # H_mb = M_mb * H_mb_temp
        # H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        B_mb = sample_batch_binary(dim, batch_size)
        # H_mb = B_mb*M_mb + 0.5*(1-B_mb)

        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        f_mb = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
        # print(f_mb)
        for w in range(len(theta_G)):
            theta_G[w].load(gen_new_params[0][w], sess)
        _, D_loss_curr = sess.run([D_solver, D_loss_temp],
                                  feed_dict={
                                      X: X_mb,
                                      M: M_mb,
                                      fake_X: f_mb,
                                      B: B_mb
                                  })

        batch_idx = sample_batch_index(no, batch_size)
        X_mb = norm_data_x[batch_idx, :]
        M_mb = m[batch_idx, :]
        # Sample random vectors
        Z_mb = uniform_sampler(0.0, 0.01, batch_size, dim)
        # Sample hint vectors
        # H_mb_temp = binary_sampler(0.9, batch_size, dim)
        # H_mb = M_mb * H_mb_temp
        # H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
        B_mb = sample_batch_binary(dim, batch_size)
        # H_mb = B_mb*M_mb + 0.5*(1-B_mb)

        # Combine random vectors with observed vectors
        X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
        for w in range(len(theta_G)):
            theta_G[w].load(gen_old_params[0][w], sess)
        _, G_loss_curr, MSE_loss_curr = \
        sess.run([G_solver, G_loss_temp, MSE_loss],
                 feed_dict = {X: X_mb, M: M_mb, B: B_mb})
        params = []
        for param in theta_G:
            params.append(sess.run(param))
        gen_new_params[0] = params
    ## Return imputed data
    Z_mb = uniform_sampler(0.0, 0.01, no, dim)
    M_mb = m
    X_mb = norm_data_x
    X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb
    for w in range(len(theta_G)):
        theta_G[w].load(gen_new_params[0][w], sess)
    imputed_data = sess.run([G_sample], feed_dict={X: X_mb, M: M_mb})[0]
    sess.close()
    imputed_data = m * norm_data_x + (1 - m) * imputed_data

    # Renormalization
    imputed_data = renormalization(imputed_data, norm_parameters)

    # Rounding
    imputed_data = rounding(imputed_data, miss_data_x)

    return imputed_data
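
A tiny numpy illustration of the hint construction H = B*M + 0.5*(1-B) used in this variant and in Example no. 7: where B is 1 the discriminator is shown the true mask entry, and where B is 0 it only sees the uninformative value 0.5:

import numpy as np

B = np.array([[1., 0., 1.]])   # which entries carry a real hint
M = np.array([[1., 1., 0.]])   # mask: 1 = observed, 0 = missing
H = B * M + 0.5 * (1 - B)
print(H)                       # [[1.  0.5 0. ]]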