Example #1
def main(args, yy):
    '''Main function
  
  Args:
    - miss_rate: probability of missing components
    - batch_size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations
    
  Returns:
    - imputed_data_x: imputed data
    - rmse: Root Mean Squared Error
  '''

    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m, ward_nor_list = data_loader(miss_rate, yy)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    RMSE, MAE = test_loss(ori_data_x, imputed_data_x, ward_nor_list)

    return RMSE, MAE
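
Each main() in these examples reads its hyperparameters from an argparse namespace (miss_rate, batch_size, hint_rate, alpha, iterations). A minimal sketch of a parser that would produce a compatible args object follows; the flag names mirror the attributes accessed above, while the defaults are assumptions rather than values taken from any of the projects.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--miss_rate', type=float, default=0.2)   # probability of missing components
parser.add_argument('--batch_size', type=int, default=128)    # mini-batch size
parser.add_argument('--hint_rate', type=float, default=0.9)   # hint rate
parser.add_argument('--alpha', type=float, default=100)       # hyperparameter
parser.add_argument('--iterations', type=int, default=10000)  # training iterations
args = parser.parse_args()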
Example #2
def main(args):
    '''
  Args:
    - data_name: LossSight
    - batch_size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations
    
  Returns:
    - imputed_data_x: imputed data
    - rmse: Root Mean Squared Error
  '''
    data_name = args.data_name
    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    file_name = 'data/' + data_name + '.csv'
    miss_data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Save Result
    np.savetxt("result.csv", imputed_data_x, delimiter=',')

    return imputed_data_x
Example #3
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
      - data_name: letter or spam
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
      - rmse: Root Mean Squared Error
    '''

    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate,
                                                  args.mechanism)

    # Impute missing data
    if args.kfold:
        rmse_list = []
        for i, (train_index, test_index) in enumerate(
                KFold(shuffle=True, random_state=1).split(ori_data_x)):
            rmse = gain(miss_data_x, gain_parameters, ori_data_x, train_index,
                        test_index, args.mechanism)
            rmse_list.append(rmse)
        print(np.mean(rmse_list), np.std(rmse_list))
    else:
        train_index = test_index = range(len(ori_data_x))
        gain(miss_data_x, gain_parameters, ori_data_x, train_index, test_index,
             args.mechanism)
Example #4
def test_001_t(self):
    # set up fg
    src_data = (0, 1, -2, 5.5, -0.5)
    expected_result = (0, 2, -4, 11, -1)
    src = blocks.vector_source_f(src_data)
    op = gain(2)  # renamed from `gain` to avoid shadowing the block factory
    snk = blocks.vector_sink_f()
    self.tb.connect(src, op)
    self.tb.connect(op, snk)
    self.tb.run()
    # check data
    result_data = snk.data()
    self.assertFloatTuplesAlmostEqual(expected_result, result_data, 6)
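
The block under test multiplies every float sample by a constant (here 2, as expected_result shows). A minimal sketch of such a GNU Radio gain block is given below; the class name matches the test, but the implementation is an assumption, not the tested project's actual code.

import numpy as np
from gnuradio import gr

class gain(gr.sync_block):
    """Multiply each incoming float sample by a constant factor."""

    def __init__(self, factor):
        gr.sync_block.__init__(self, name='gain',
                               in_sig=[np.float32], out_sig=[np.float32])
        self.factor = factor

    def work(self, input_items, output_items):
        # Scale the input stream and report how many samples were produced.
        output_items[0][:] = input_items[0] * self.factor
        return len(output_items[0])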
Example #5
File: main.py Project: kiw9761/GAIN
def main(args):
    '''Main function
  
  Args:
    - data_name: the file name of dataset
    - miss_rate: probability of missing components
    - batch_size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations
    - onehot: the number of features for the one-hot encoder (starting from the first feature)
    - predict: option for prediction mode; no random mask is applied and the model is saved if on
    
  Returns:
    - imputed_data_x: imputed data
  '''

    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'data_name': args.data_name,
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
        'onehot': args.onehot,
        'predict': args.predict
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m, feature_name, onehotencoder, ori_data_dim = data_loader(
        data_name, miss_rate, args.onehot, args.predict)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, feature_name, onehotencoder,
                          ori_data_dim, gain_parameters)

    # Save imputed data
    pd.DataFrame(imputed_data_x, columns=feature_name).to_csv(
        'data_imputed/' + data_name + "_imputed.csv", index=False, header=True)

    return imputed_data_x
Example #6
def main(args):
    '''Main function for UCI letter and spam datasets.
  
  Args:
    - data_name: letter or spam
    - miss_rate: probability of missing components
    - batch_size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations
    
  Returns:
    - imputed_data_x: imputed data
    - rmse: Root Mean Squared Error
  '''

    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    mae = mae_loss(ori_data_x, imputed_data_x, data_m)

    print()
    print('RMSE Performance: ' + str(np.round(rmse, 4)))
    print('MAE Performance: ' + str(np.round(mae, 4)))

    return imputed_data_x, rmse
Example #7
def main(args):
    '''Main function for UCI letter and spam datasets.
  
  Args:
    - data_name: letter or spam
    - miss_rate: probability of missing components
    - batch_size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations
    
  Returns:
    - imputed_data_x: imputed data
    - rmse: Root Mean Squared Error
  '''

    data_path = args.data_path
    output_path = args.output_path

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and replace missing values with NaN
    miss_data_x = data_replace(data_path)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    np.savetxt(output_path, imputed_data_x, delimiter=',')

    # # Report the RMSE performance
    # rmse = rmse_loss (ori_data_x, imputed_data_x, data_m)

    # print()
    # print('RMSE Performance: ' + str(np.round(rmse, 4)))
    return imputed_data_x
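
data_replace is only characterized by the comment above it. A plausible sketch, assuming a headered CSV in which blank fields mark missing values (the project's actual missing-value marker may differ):

import numpy as np

def data_replace(data_path):
    # Read the CSV; genfromtxt parses empty fields as NaN.
    return np.genfromtxt(data_path, delimiter=',', skip_header=1)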
Example #8
def main():
    data_names = ['letter', 'spam']
    # data_names = ['breasttissue','glass', 'thyroid']
    # datasets with continuous features and no originally missing values

    #data_names = ['balance','banknote','blood','breasttissue', 'climate','connectionistvowel',
    #              'ecoli','glass','hillvalley','ionosphere', 'parkinsons','planning','seedst',
    #              'thyroid','vehicle','vertebral','wine','yeast']
    print(len(data_names))
    miss_rate = 0.2
    batch_size = 64
    alpha = 100
    iterations = 1000
    n_times = 3
    wb = xlwt.Workbook()
    sh_rmse = wb.add_sheet("GAIN_rmse")
    # sh_acc = wb.add_sheet("EGAIN_acc")
    sh_acc_dct = wb.add_sheet("GAIN_acc_dct")
    sh_acc_knn = wb.add_sheet("GAIN_acc_knn")
    sh_acc_nb = wb.add_sheet("GAIN_acc_nb")
    sh_acc_lr = wb.add_sheet("GAIN_acc_lr")

    for k in range(len(data_names)):

        data_name = data_names[k]
        gain_parameters = {
            'batch_size': batch_size,
            'alpha': alpha,
            'iterations': iterations
        }
        print("Dataset: ", data_name)
        rmse = []
        # acc_dct = []
        # acc_knn = []
        # acc_nb = []
        ori_data_x, y, miss_data_x, m = data_loader(data_name, miss_rate)
        sh_rmse.write(0, k, data_name)
        sh_acc_dct.write(0, k, data_name)
        sh_acc_knn.write(0, k, data_name)
        sh_acc_nb.write(0, k, data_name)
        sh_acc_lr.write(0, k, data_name)
        # sh_acc.write(0, 0, 'dct')
        # sh_acc.write(0, 1, 'knn')
        # sh_acc.write(0, 2, 'nb')
        for i in range(n_times):

            # Impute missing data
            imputed_data_x = gain(miss_data_x, gain_parameters)
            imputed_data_x, _ = normalization(imputed_data_x)

            # Calculate rmse
            rmse.append(rmse_loss(ori_data_x, imputed_data_x, m))

            print('{:2d}/{:2d}'.format(i + 1, n_times), end=':')
            print('RMSE = ' + str(np.round(rmse[-1], 4)))
            sh_rmse.write(i + 1, k, str(np.round(rmse[-1], 4)))
            if data_name in ['letter', 'spam']:
                continue
            scf = StratifiedShuffleSplit(n_splits=10)
            score_dct = cross_val_score(DecisionTreeClassifier(),
                                        imputed_data_x,
                                        y,
                                        cv=scf,
                                        scoring='accuracy')
            print(score_dct)
            # acc_dct.extend(score_dct)
            sh_acc_dct.write(i + 1, k, str(np.round(np.mean(score_dct), 4)))
            # for j in range(len(score_dct)):
            # sh_acc.write(i * 5 + j + 1, 0, str(np.round(score_dct[j], 4)))
            score_knn = cross_val_score(KNeighborsClassifier(),
                                        imputed_data_x,
                                        y,
                                        cv=scf,
                                        scoring='accuracy')
            print(score_knn)
            # acc_knn.extend(score_knn)
            sh_acc_knn.write(i + 1, k, str(np.round(np.mean(score_knn), 4)))
            # for j in range(len(score_knn)):
            # sh_acc.write(i * 5 + j + 1, 1, str(np.round(score_knn[j], 4)))
            score_nb = cross_val_score(GaussianNB(),
                                       imputed_data_x,
                                       y,
                                       cv=scf,
                                       scoring='accuracy')
            print(score_nb)
            # acc_nb.extend(score_nb)
            sh_acc_nb.write(i + 1, k, str(np.round(np.mean(score_nb), 4)))
            # for j in range(len(score_nb)):
            # sh_acc.write(i * 5 + j + 1, 2, str(np.round(score_nb[j], 4)))
            score_lr = cross_val_score(LogisticRegression(max_iter=1000),
                                       imputed_data_x,
                                       y,
                                       cv=scf,
                                       scoring='accuracy')
            print(score_lr)
            # acc_nb.extend(score_nb)
            sh_acc_lr.write(i + 1, k, str(np.round(np.mean(score_lr), 4)))
        # rmse = np.array(rmse)
        # acc_dct = np.array(acc_dct)
        # acc_knn = np.array(acc_knn)
        # acc_nb = np.array(acc_nb)
        # print("RMSE mean = {:.4f}; variance = {:.4f} ".format(np.mean(rmse), np.std(rmse)))
        # print("Acc mean = {:.4f}; variance = {:.4f} ".format(np.mean(acc_dct), np.std(acc_dct)))
        # print("Acc mean = {:.4f}; variance = {:.4f} ".format(np.mean(acc_knn), np.std(acc_knn)))
        # print("Acc mean = {:.4f}; variance = {:.4f} ".format(np.mean(acc_nb), np.std(acc_nb)))
        print("---------------------------")
    wb.save('GAIN_results_15.xls')
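
rmse_loss(ori_data_x, imputed_data_x, m) is the scoring helper used across these examples. A minimal sketch that evaluates the error only at the artificially removed entries (m == 0) is shown below; each project may additionally normalize the data before comparing, so treat this as an assumption about the general shape of the helper, not its exact code.

import numpy as np

def rmse_loss(ori_data, imputed_data, data_m):
    # Compare original and imputed values only where entries were removed (data_m == 0).
    diff = (1 - data_m) * (ori_data - imputed_data)
    return np.sqrt(np.sum(diff ** 2) / np.sum(1 - data_m))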
Example #9
    plt.tight_layout(rect=[0, 0.03, 0, 1.25])
    plt.subplots_adjust(hspace=1, wspace=0.35)
    plt.show()

X = np.asarray(all_values_for_variables).astype(float).transpose()

gain_parameters = {
    'batch_size': 128,
    'hint_rate': 0.9,
    'alpha': 1000,
    'iterations': 1000
}

#imputer = KNNImputer(n_neighbors=5)
#imputed_variable_values = imputer.fit_transform(X).transpose()
imputed_variable_values = gain(X, gain_parameters).transpose()

df = pd.DataFrame(imputed_variable_values)
patients_labels = []
patients_features = []

PLT_ALL = True

for i in range(0, len(selected_patient_ids)):

    patient_id = selected_patient_ids[i]
    patient_features = []

    for k in range(0, len(selected_variables)):
        variable_name = selected_variables[k]
        imputed_values_for_variable = df.values[k, :]
Example #10
def main(iterations=NUM_ITERATIONS,
         batch_size=128,
         hint_rate=0.5,
         miss_rate=0.3):

    gain_parameters = {
        'batch_size': batch_size,
        'hint_rate': hint_rate,
        'iterations': iterations
    }

    enable_transform = False
    remove_outliers = False
    n_time_points = 3

    data_x = pickle.load(open('./missing_data.sav', 'rb'))
    data_x = data_x.transpose().astype(float)[:, :]

    # Remove variables listed in remove_variables
    no, dim = data_x.shape
    removed = 0
    for d in range(0, dim):
        if variables[d - removed] in remove_variables:
            variables.remove(variables[d - removed])
            data_x = np.delete(data_x, d - removed, axis=1)
            removed += 1

    no, dim = data_x.shape

    if len(variables) != dim:
        print(len(variables), dim)
        print('Incompatible dimensions.')
        exit()

    no_total = no * dim
    no_nan = np.count_nonzero(np.isnan(data_x.flatten()) == True)
    no_not_nan = no_total - no_nan
    n_patients = int(no / n_time_points)

    miss_data_x = np.copy(data_x)

    print('Input shape', no, 'x', dim)
    print('NAN values:', no_nan, '/', no_total, \
      '%2.f%%' % (no_nan / no_total * 100))

    # Introduce missing data
    data_m = binary_sampler(1 - miss_rate, no, dim)
    miss_data_x[data_m == 0] = np.nan

    transformer = RobustScaler()
    miss_data_x = transformer.fit_transform(miss_data_x)

    no_nan = np.count_nonzero(np.isnan(miss_data_x.flatten()) == True)
    no_not_nan = no_total - no_nan

    print('After removal, NAN values:', no_nan, '/', no_total, \
      '%2.f%%' % (no_nan / no_total * 100))

    real_miss_rate = (no_nan / no_total * 100)

    miss_data_x_gan_tmp = np.zeros((n_patients, dim * n_time_points))

    # Swap (one row per time point) to (one column per time point)
    for i in range(0, n_patients):
        for j in range(0, dim):
            for n in range(0, n_time_points):
                miss_data_x_gan_tmp[i, n * dim +
                                    j] = miss_data_x[i * n_time_points + n, j]

    imputed_data_x_gan_tmp = gain(miss_data_x_gan_tmp, gain_parameters)

    imputed_data_x_gan = np.copy(miss_data_x)

    ## Swap (one column per time point) to (one row per time point)
    for i in range(0, n_patients):
        for j in range(0, dim):
            for n in range(0, n_time_points):
                imputed_data_x_gan[i * n_time_points + n,
                                   j] = imputed_data_x_gan_tmp[i, n * dim + j]

    imputer = KNNImputer(n_neighbors=5)
    imputed_data_x_knn = imputer.fit_transform(miss_data_x)

    imputer = IterativeImputer(verbose=True)
    imputed_data_x_mice = imputer.fit_transform(miss_data_x)

    imputed_data_x_gan = transformer.inverse_transform(imputed_data_x_gan)
    imputed_data_x_knn = transformer.inverse_transform(imputed_data_x_knn)
    imputed_data_x_mice = transformer.inverse_transform(imputed_data_x_mice)

    # Save imputed data to disk
    pickle.dump(imputed_data_x_gan, open('./filled_data.sav', 'wb'))

    # Get residuals for computation of stats
    distances_gan = np.zeros((dim, n_time_points * n_patients))
    distances_knn = np.zeros((dim, n_time_points * n_patients))
    distances_mice = np.zeros((dim, n_time_points * n_patients))
    distributions = {'deleted': [], 'gan': [], 'knn': [], 'mice': []}

    from scipy.stats import iqr

    for j in range(0, dim):

        nn_values = data_x[:, j].flatten()
        nn_values = nn_values[~np.isnan(nn_values)]

        dim_iqr = np.mean(nn_values)  # iqr(nn_values)

        for i in range(0, n_patients):
            variable_name = variables[j]
            i_start = int(i * n_time_points)
            i_stop = int(i * n_time_points + n_time_points)

            original_tuple = data_x[i_start:i_stop, j]
            corrupted_tuple = miss_data_x[i_start:i_stop, j]
            imputed_tuple_gan = imputed_data_x_gan[i_start:i_stop, j]
            imputed_tuple_knn = imputed_data_x_knn[i_start:i_stop, j]
            imputed_tuple_mice = imputed_data_x_mice[i_start:i_stop, j]

            #if i == 1 or i == 2:
            #  print(original_tuple, corrupted_tuple, imputed_tuple_gan, imputed_tuple_knn)

            for k in range(0, n_time_points):
                a, b, c, d = original_tuple[k], imputed_tuple_gan[k], \
                             imputed_tuple_knn[k], imputed_tuple_mice[k]
                if np.isnan(a) or data_m[i_start + k, j] != 0: continue
                #if i % 10 == 0: print(variable_name, a,b,c,d, b-a)
                distances_gan[j, i * k] = (b - a)
                distances_knn[j, i * k] = (c - a)
                distances_mice[j, i * k] = (d - a)

    # Compute distance statistics
    all_stats = {}

    for j in range(0, dim):

        print('%d. Imputed variable: %s' % (j, variables[j]))

        current_stats = {'gan': {}, 'knn': {}, 'mice': {}}  # make a copy

        # Stats for original data
        dim_mean = np.mean([x for x in data_x[:, j] if not np.isnan(x)])
        dim_max = np.max([x for x in data_x[:, j] if not np.isnan(x)])
        dim_iqr = iqr([x for x in data_x[:, j] if not np.isnan(x)])

        # Indices for removed data
        ind = (data_m[:, j]
               == 0).flatten() & (~np.isnan(data_x[:, j])).flatten()

        # Stats for GAN
        current_stats['gan']['bias'] = np.mean(distances_gan[j])
        current_stats['gan']['rmse'] = np.sqrt(np.mean(distances_gan[j]**2))
        current_stats['gan']['nrmse'] = current_stats['gan']['rmse'] / dim_iqr
        current_stats['gan']['mape'] = np.mean(np.abs(distances_gan[j]))
        current_stats['gan']['wd'] = wasserstein_distance(
            data_x[ind, j].flatten(), imputed_data_x_gan[ind, j].flatten())

        # Stats for KNN
        current_stats['knn']['bias'] = np.mean(distances_knn[j])
        current_stats['knn']['rmse'] = np.sqrt(np.mean(distances_knn[j]**2))
        current_stats['knn']['nrmse'] = current_stats['knn']['rmse'] / dim_iqr
        current_stats['knn']['mape'] = np.mean(np.abs(distances_knn[j]))
        current_stats['knn']['wd'] = wasserstein_distance(
            data_x[ind, j].flatten(), imputed_data_x_knn[ind, j].flatten())

        # Stats for MICE
        current_stats['mice']['bias'] = np.mean(distances_mice[j])
        current_stats['mice']['rmse'] = np.sqrt(np.mean(distances_mice[j]**2))
        current_stats['mice'][
            'nrmse'] = current_stats['mice']['rmse'] / dim_iqr
        current_stats['mice']['mape'] = np.mean(np.abs(distances_mice[j]))
        current_stats['mice']['wd'] = wasserstein_distance(
            data_x[ind, j].flatten(), imputed_data_x_mice[ind, j].flatten())

        for model_name in current_stats:
            model = current_stats[model_name]
            print('... %s - bias: %.3f, RMSE: %.3f, ME: %.3f, WD: %.3f' % \
              (model_name, model['bias'], model['rmse'], model['mape'], model['wd']))

        all_stats[variables[j]] = dict(current_stats)

        print()

    n_fig_rows, n_fig_cols = 6, 6
    n_fig_total = n_fig_rows * n_fig_cols

    if dim > n_fig_total: print('Warning: not all variables plotted')

    all_fig_axes = [
        plt.subplots(n_fig_rows, n_fig_cols, figsize=(15, 15))
        for _ in range(0, 3)
    ]

    for j in range(0, dim):

        dim_not_nan = np.count_nonzero(~np.isnan(data_x[:, j]))
        deleted_no = np.count_nonzero(
            np.isnan(miss_data_x[:, j]) & ~np.isnan(data_x[:, j]))
        ax_title = variables[j] + (' (%d of %d observed)' %
                                   (deleted_no, dim_not_nan))

        dim_axes = [
            fig_axes[1][int(j / n_fig_cols), j % n_fig_cols]
            for fig_axes in all_fig_axes
        ]

        [
            ax.set_title(ax_title,
                         fontdict={
                             'fontsize': 7,
                             'fontweight': 'bold'
                         }) for ax in dim_axes
        ]

        input_arrays = [
            data_x, imputed_data_x_gan, imputed_data_x_knn, imputed_data_x_mice
        ]

        output_arrays = [
          np.asarray([input_arr[ii,j] for ii in range(0, no) if \
            (not np.isnan(data_x[ii,j]) and \
            data_m[ii,j] == 0)]) for input_arr in input_arrays
        ]

        deleted_values, imputed_values_gan, imputed_values_knn, imputed_values_mice = output_arrays

        plot_distribution_densities(output_arrays, all_stats, variables[j],
                                    dim_axes[0])
        plot_distribution_residuals(output_arrays, dim_axes[1])
        plot_distribution_summaries(output_arrays, dim_axes[2])

        # Make QQ plot of original and deleted values vs. normal distribution
        #dist_max = np.max(np.concatenate([imputed_values_gan, deleted_values]))
        #qqplot_1sample((data_x[~np.isnan(data_x[:,j]),j] - dist_min) / dist_max, ax=ax3, color='b')
        #qqplot_1sample((data_x[data_m[:,j] == 0,j] - dist_min) / dist_max, ax=ax3, color='r',draw_line=False)

    # Figure 1
    fig1 = all_fig_axes[0][0]
    top_title = 'Kernel density estimation for erased and predicted values, for each imputation method'
    fig1.suptitle(top_title, fontsize=8)

    fig1.tight_layout(rect=[0, 0.03, 0, 1.25])
    fig1.subplots_adjust(hspace=1, wspace=0.35)

    # Figure 2
    fig2 = all_fig_axes[1][0]
    top_title = 'Q-Q plot of erased vs. imputed values, for each imputation method'
    fig2.suptitle(top_title, fontsize=8)

    fig2.tight_layout(rect=[0, 0.03, 0, 1.25])
    fig2.subplots_adjust(hspace=1, wspace=0.35)

    # Figure 3
    fig3 = all_fig_axes[2][0]
    top_title = 'Bayesian confidence intervals for the mean and standard deviation, for erased values and imputed values'
    fig3.suptitle(top_title, fontsize=8)

    fig3.tight_layout(rect=[0, 0.03, 0, 1.25])
    fig3.subplots_adjust(hspace=1, wspace=0.35)

    # Figure 4
    fig5, ax5 = plt.subplots(1, 1)
    top_title = 'Distribution of normalized RMSEs for each imputation method'
    fig5.suptitle(top_title, fontsize=8)
    plot_error_distributions(all_stats, fig5, ax5)
    ax5.set_ylabel('Probability density', fontsize=6)
    ax5.set_xlabel('NRMSE (normalized to IQR)', fontsize=6)
    ax5.legend(fontsize=6)
    fig5.tight_layout(rect=[0, 0.03, 0, 1.25])
    fig5.subplots_adjust(hspace=1, wspace=0.35)

    plt.show()

    for model_name in ['gan', 'knn', 'mice']:
        wds = [
            all_stats[variable_name][model_name]['wd']
            for variable_name in all_stats
        ]
        nrmses = [
            all_stats[variable_name][model_name]['nrmse']
            for variable_name in all_stats
        ]
        mwd = np.round(np.asarray(wds).mean(), 2)
        mnrmse = np.round(np.asarray(nrmses).mean(), 2)
        print('Model: %s - average WD = %.2f, average NRMSE = %.2f ' %
              (model_name, mwd, mnrmse))

    return all_stats
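
This example and Example #17 erase entries with binary_sampler(1 - miss_rate, no, dim) before imputing. Below is a sketch of such a sampler, consistent with how the mask is used above (1 keeps a value, 0 removes it); the projects' own utility may differ in detail.

import numpy as np

def binary_sampler(p, rows, cols):
    # Each entry is 1 (kept) with probability p, otherwise 0 (to be set to NaN).
    return (np.random.uniform(0., 1., size=(rows, cols)) < p).astype(int)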
Example #11
def main(args):
    '''Main function for UCI letter and spam datasets.
  
  Args:
    - data_name: letter or spam
    - miss_rate: probability of missing components
    - batch_size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations
    
  Returns:
    - imputed_data_x: imputed data
    - rmse: Root Mean Squared Error
  '''

    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m, data_y = data_loader(data_name, miss_rate)

    imputed_data_x = gain(miss_data_x, gain_parameters)

    #pd.DataFrame(data_y, imputed_data_x, axis = 1)

    # Step - create data_m using test data
    # Step - combine train and missing_test_data
    # Step - return total missing and original data_m
    # Step - while calculating RMSE
    # use original as test_original
    # fetch testing imputed dataset 934 to last
    # data_m as missing_test_data

    if data_name == 'vals_test_df':
        imputed_data_x = imputed_data_x[range(918, 1311), :]
    elif data_name == 'vals_test_df_test_type1':
        imputed_data_x = imputed_data_x[range(495, 1311), :]
    elif data_name == 'vals_test_df_test_type2':
        imputed_data_x = imputed_data_x[range(816, 1311), :]
    else:
        imputed_data_x = imputed_data_x

    imputed_data_x_df = pd.DataFrame(imputed_data_x)
    data_y_df = pd.DataFrame(data_y)
    imputed_data_df = pd.concat([data_y_df, imputed_data_x_df],
                                ignore_index=True,
                                axis=1)
    imputed_data_df.to_csv("GAN_imputated_catalogueData1.csv", index=False)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)

    print()
    print('RMSE Performance: ' + str(np.round(rmse, 4)))

    return imputed_data_x, rmse
Example #12
def main():
    data_names = ['letter', 'spam']
    # data_names = ['breasttissue','glass', 'thyroid']
    # datasets with continuous features and no originally missing values

    # data_names = ['balance','banknote','blood','breasttissue', 'climate','connectionistvowel',
    #               'ecoli','glass','hillvalley','ionosphere', 'parkinsons','planning','seedst',
    #               'thyroid','vehicle','vertebral','wine','yeast']
    print(len(data_names))
    miss_rate = 0.2
    batch_size = 64
    alpha = 100
    iterations = 1000
    n_times = 30

    wb_gain = xlwt.Workbook()
    sh_rmse_gain = wb_gain.add_sheet("GAIN_rmse")
    sh_acc_dct_gain = wb_gain.add_sheet("GAIN_acc_dct")
    sh_acc_knn_gain = wb_gain.add_sheet("GAIN_acc_knn")
    sh_acc_nb_gain = wb_gain.add_sheet("GAIN_acc_nb")
    sh_acc_lr_gain = wb_gain.add_sheet("GAIN_acc_lr")

    wb_egain = xlwt.Workbook()
    sh_rmse_egain = wb_egain.add_sheet("EGAIN_rmse")
    sh_acc_dct_egain = wb_egain.add_sheet("EGAIN_acc_dct")
    sh_acc_knn_egain = wb_egain.add_sheet("EGAIN_acc_knn")
    sh_acc_nb_egain = wb_egain.add_sheet("EGAIN_acc_nb")
    sh_acc_lr_egain = wb_egain.add_sheet("EGAIN_acc_lr")

    wb_mean = xlwt.Workbook()
    sh_rmse_mean = wb_mean.add_sheet("MEAN_rmse")
    sh_acc_dct_mean = wb_mean.add_sheet("MEAN_acc_dct")
    sh_acc_knn_mean = wb_mean.add_sheet("MEAN_acc_knn")
    sh_acc_nb_mean = wb_mean.add_sheet("MEAN_acc_nb")
    sh_acc_lr_mean = wb_mean.add_sheet("MEAN_acc_lr")

    wb_knn = xlwt.Workbook()
    sh_rmse_knn = wb_knn.add_sheet("KNN_rmse")
    sh_acc_dct_knn = wb_knn.add_sheet("KNN_acc_dct")
    sh_acc_knn_knn = wb_knn.add_sheet("KNN_acc_knn")
    sh_acc_nb_knn = wb_knn.add_sheet("KNN_acc_nb")
    sh_acc_lr_knn = wb_knn.add_sheet("KNN_acc_lr")

    for k in range(len(data_names)):

        data_name = data_names[k]
        gain_parameters = {
            'batch_size': batch_size,
            'alpha': alpha,
            'iterations': iterations
        }
        ori_data_x, y, miss_data_x, m = data_loader(data_name, miss_rate)

        print("Dataset: ", data_name)

        ###########################Mean imputation#################################
        print('Mean imputation')
        sh_rmse_mean.write(0, k, data_name)
        sh_acc_dct_mean.write(0, k, data_name)
        sh_acc_knn_mean.write(0, k, data_name)
        sh_acc_nb_mean.write(0, k, data_name)
        sh_acc_lr_mean.write(0, k, data_name)

        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        imputed_data_x = imp.fit_transform(miss_data_x)

        sh_rmse_mean.write(
            1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))

        # Normalize data and classification
        # imputed_data_x, _ = normalization(imputed_data_x)
        #
        # scf = StratifiedShuffleSplit(n_splits=10)
        # # DCT classifier
        # score_dct_mean = cross_val_score(DecisionTreeClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_dct_mean.write(1, k, np.round(np.mean(score_dct_mean), 4))
        # # KNN classifier
        # score_knn_mean = cross_val_score(KNeighborsClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_knn_mean.write(1, k, np.round(np.mean(score_knn_mean), 4))
        # # NB classifier
        # score_nb_mean = cross_val_score(GaussianNB(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_nb_mean.write(1, k, np.round(np.mean(score_nb_mean), 4))
        # # LR classifier
        # score_lr_mean = cross_val_score(LogisticRegression(max_iter=1000), imputed_data_x, y, cv=scf,
        #                                 scoring='accuracy')
        # sh_acc_lr_mean.write(1, k, np.round(np.mean(score_lr_mean), 4))

        ###########################KNN imputation#################################
        print('KNN imputation')
        sh_rmse_knn.write(0, k, data_name)
        sh_acc_dct_knn.write(0, k, data_name)
        sh_acc_knn_knn.write(0, k, data_name)
        sh_acc_nb_knn.write(0, k, data_name)
        sh_acc_lr_knn.write(0, k, data_name)

        imp = KNNImputer(missing_values=np.nan)
        imputed_data_x = imp.fit_transform(miss_data_x)

        sh_rmse_knn.write(
            1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m), 4))

        # # Normalize data and classification
        # imputed_data_x, _ = normalization(imputed_data_x)
        #
        # scf = StratifiedShuffleSplit(n_splits=10)
        # # DCT classifier
        # score_dct_knn = cross_val_score(DecisionTreeClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_dct_knn.write(1, k, np.round(np.mean(score_dct_knn), 4))
        # # KNN classifier
        # score_knn_knn = cross_val_score(KNeighborsClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_knn_knn.write(1, k, np.round(np.mean(score_knn_knn), 4))
        # # NB classifier
        # score_nb_knn = cross_val_score(GaussianNB(), imputed_data_x, y, cv=scf, scoring='accuracy')
        # sh_acc_nb_knn.write(1, k, np.round(np.mean(score_nb_knn), 4))
        # # LR classifier
        # score_lr_knn = cross_val_score(LogisticRegression(max_iter=1000), imputed_data_x, y, cv=scf,
        #                                 scoring='accuracy')
        # sh_acc_lr_knn.write(1, k, np.round(np.mean(score_lr_knn), 4))

        ###########################GAIN imputation#################################
        print('GAIN imputation')
        sh_rmse_gain.write(0, k, data_name)
        sh_acc_dct_gain.write(0, k, data_name)
        sh_acc_knn_gain.write(0, k, data_name)
        sh_acc_nb_gain.write(0, k, data_name)
        sh_acc_lr_gain.write(0, k, data_name)
        for i in tqdm(range(n_times)):
            # Impute missing data
            imputed_data_x = gain(miss_data_x, gain_parameters)
            sh_rmse_gain.write(
                i + 1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m),
                                   4))

            # #Normalize data and classification
            # imputed_data_x,_ = normalization(imputed_data_x)
            #
            # scf = StratifiedShuffleSplit(n_splits=10)
            # #DCT classifier
            # score_dct_gain = cross_val_score(DecisionTreeClassifier(),imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_dct_gain.write(i+1, k, np.round(np.mean(score_dct_gain), 4))
            # #KNN classifier
            # score_knn_gain = cross_val_score(KNeighborsClassifier(),imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_knn_gain.write(i+1, k, np.round(np.mean(score_knn_gain), 4))
            # #NB classifier
            # score_nb_gain = cross_val_score(GaussianNB(),imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_nb_gain.write(i+1, k, np.round(np.mean(score_nb_gain), 4))
            # #LR classifier
            # score_lr_gain = cross_val_score(LogisticRegression(max_iter=1000),imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_lr_gain.write(i+1, k, np.round(np.mean(score_lr_gain), 4))

        ###########################EGAIN imputation#################################
        print('EGAIN imputation')
        sh_rmse_egain.write(0, k, data_name)
        sh_acc_dct_egain.write(0, k, data_name)
        sh_acc_knn_egain.write(0, k, data_name)
        sh_acc_nb_egain.write(0, k, data_name)
        sh_acc_lr_egain.write(0, k, data_name)

        for i in tqdm(range(n_times)):

            imputed_data_x = Egain(miss_data_x, gain_parameters)
            sh_rmse_egain.write(
                i + 1, k, np.round(rmse_loss(ori_data_x, imputed_data_x, m),
                                   4))

            # Normalize data and classification
            # imputed_data_x, _ = normalization(imputed_data_x)
            #
            # scf = StratifiedShuffleSplit(n_splits=10)
            # # DCT classifier
            # score_dct_egain = cross_val_score(DecisionTreeClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_dct_egain.write(i + 1, k, np.round(np.mean(score_dct_egain), 4))
            # # KNN classifier
            # score_knn_egain = cross_val_score(KNeighborsClassifier(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_knn_egain.write(i + 1, k, np.round(np.mean(score_knn_egain), 4))
            # # NB classifier
            # score_nb_egain = cross_val_score(GaussianNB(), imputed_data_x, y, cv=scf, scoring='accuracy')
            # sh_acc_nb_egain.write(i + 1, k, np.round(np.mean(score_nb_egain), 4))
            # # LR classifier
            # score_lr_egain = cross_val_score(LogisticRegression(max_iter=1000), imputed_data_x, y, cv=scf,
            #                                 scoring='accuracy')
            # sh_acc_lr_egain.write(i + 1, k, np.round(np.mean(score_lr_egain), 4))

    wb_gain.save('GAIN_test.xls')
    wb_egain.save('EGAIN_test.xls')
    wb_mean.save('MEAN_test.xls')
    wb_knn.save('KNN_test.xls')
Example #13
    [VOLUME, G4],
    [0, C5],
    [VOLUME, G4],
    [VOLUME, C5],
    [VOLUME, F5],
    [VOLUME, E5],
    [VOLUME, C5],
]

from fade import fade
from gain import gain
from repeat import repeat
from square import square_wave

all_samples = []
quarter_second = 44100 // 4
for volume, frequency in notes:
    samples = square_wave(int(44100 / frequency // 2))
    samples = gain(samples, volume)
    samples = repeat(samples, quarter_second)
    samples = fade(samples, quarter_second)
    all_samples.extend(samples)

all_samples = [int(sample) for sample in all_samples]

w = wave.open('music.wav', 'wb')
w.setnchannels(1)
w.setsampwidth(2)
w.setframerate(44100)
w.writeframes(struct.pack('<' + 'h' * len(all_samples), *all_samples))
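
The gain() imported from gain.py here presumably scales each audio sample by the note's volume before the samples are repeated, faded, and packed into the WAV file. A minimal sketch under that assumption (not the project's actual module):

def gain(samples, volume):
    # Scale every sample by the requested volume factor.
    return [sample * volume for sample in samples]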
Example #14
def main(args):
    '''Main function for UCI letter and spam datasets.
  
  Args:
    - data_name: letter or spam
    - miss_rate: probability of missing components
    - batch_size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations
    
  Returns:
    - imputed_data_x: imputed data
    - rmse: Root Mean Squared Error
  '''

    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    print()
    mi_data = miss_data_x.astype(float)
    no, dim = imputed_data_x.shape
    miss_data = np.reshape(mi_data, (no, dim))
    np.savetxt("data/missing_data.csv", mi_data, delimiter=',', fmt='%1.2f')
    print('Shape of miss data: ', miss_data.shape)
    print('Save results in missing_data.csv')

    print()
    print('=== GAIN RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse, 6)))
    #print('Shape of the output file: ', imputed_data_x.shape)
    np.savetxt("data/imputed_data.csv",
               imputed_data_x,
               delimiter=',',
               fmt='%d')
    print('Save results in Imputed_data.csv')

    # MissForest

    print()
    print('=== MissForest RMSE ===')
    data = miss_data_x
    imp_mean = MissForest(max_iter=5)
    miss_f = imp_mean.fit_transform(data)
    #miss_f = pd.DataFrame(imputed_train_df)
    rmse_MF = rmse_loss(ori_data_x, miss_f, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MF, 6)))
    np.savetxt("data/imputed_data_MF.csv", miss_f, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MF.csv')

    # MICE From Auto Impute
    print()
    print('=== MICE of Auto Impute RMSE ===')
    data_mice = pd.DataFrame(miss_data_x)
    mi = MiceImputer(k=1,
                     imp_kwgs=None,
                     n=1,
                     predictors='all',
                     return_list=True,
                     seed=None,
                     strategy='default predictive',
                     visit='default')
    mice_out = mi.fit_transform(data_mice)
    c = [list(x) for x in mice_out]
    c1 = c[0]
    c2 = c1[1]
    c3 = np.asarray(c2)
    mice_x = c3
    #print('here :', mice_x, miss_f, miss_f.shape)
    rmse_MICE = rmse_loss(ori_data_x, mice_x, data_m)
    print('=== MICE of Auto Impute RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse_MICE, 6)))
    np.savetxt("data/imputed_data_MICE.csv", mice_x, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MICE.csv')

    return imputed_data_x, rmse
Example #15
for k in range(0, input_array_tmp.shape[0]):
  n_filled = np.sum([1 \
   for x in input_array_tmp[k,:] \
   if x is not None])
  n_total = input_array_tmp.shape[1]
  print(n_filled / n_total)
  if n_filled / n_total < 0.5: continue
  input_array.append(input_array_tmp[k,:])
  
input_array = np.asarray(input_array)
input_shape = input_array.shape

print('Feature array shape: %s' % str(feature_shape))
print('Input shape for imputer: %s' % str(input_shape))

imputed_array = gain(\
  input_array.astype(float), gain_parameters)

from fancyimpute import KNN

imp_mean = KNN(5) # IterativeImputer()
imp_mean.fit_transform(input_array) 
#imp_mean.transform(input_array)

df = pd.DataFrame(imputed_array)

plt.matshow(df.corr())
#plt.title('Correlations between laboratory parameters over time', y=-0.01)
plt.xticks(range(0,len(feature_rows)), feature_rows, rotation='vertical',fontsize='6')
plt.yticks(range(0,len(feature_rows)), feature_rows, fontsize='6')
plt.gca().xaxis.set_ticks_position('top')
plt.colorbar()
Example #16
def main():
    # data_names = ['letter', 'spam','credit','breast']
    # data_names = ['spam', 'letter','breast','banknote']
    data_names = ['parkinsons']
    # data_names = ['breast','banknote','connectionistvowel']
    # data_names = ['connectionistvowel']
    # data_names = ['parkinsons','seedst']
    # data_names = ['breasttissue','glass', 'thyroid']
    # data_names = ['credit', 'breast', 'balance','banknote','blood','climate','connectionistvowel',
    #               'ecoli','glass','hillvalley','ionosphere', 'parkinsons','planning','seedst',
    #               'thyroid','vehicle','vertebral','wine','yeast']
    # data_names = ['parkinsons']
    # data_names = ['balance','banknote','blood','connectionistvowel','vehicle','yeast']
    miss_rate = 0.1
    batch_size = 128
    alpha = 100
    iterations = 10000
    n_times = 30
    gain_parameters = {
        'batch_size': batch_size,
        'alpha': alpha,
        'iterations': iterations
    }
    wb = xlwt.Workbook()
    sh_dct_mgain = wb.add_sheet("DCT_mgain")
    sh_mlp_mgain = wb.add_sheet("MLP_mgain")
    sh_dct_gain = wb.add_sheet("DCT_gain")
    sh_mlp_gain = wb.add_sheet("MLP_gain")

    for k in range(len(data_names)):
        data_name = data_names[k]
        sh_dct_mgain.write(0, k, data_name)
        sh_mlp_mgain.write(0, k, data_name)

        sh_dct_gain.write(0, k, data_name)
        sh_mlp_gain.write(0, k, data_name)

        print("Dataset: ", data_name)
        ori_data_x, y = data_loader(data_name)
        train_idx, test_idx = train_test_split(range(len(y)),
                                               test_size=0.3,
                                               stratify=y,
                                               random_state=42)
        for i in tqdm(range(n_times)):
            miss_data_x, m = make_missing_data(ori_data_x, miss_rate, seed=i)

            # Impute missing data
            # 20 imputed data for MGAIN
            # 1 imputed data for GAIN
            imputed_data_xs = gain(miss_data_x, gain_parameters, n_times=21)
            imputed_data_xs = np.array(imputed_data_xs)

            # Normalize data
            imputed_data_xs = np.array(
                [normalization(data)[0] for data in imputed_data_xs])

            #classify
            num_cores = multiprocessing.cpu_count()

            # Training with MLP
            # 20 for ensemble learning MGAIN
            # 1 for GAIN
            clfs = Parallel(n_jobs=num_cores)(
                delayed(train_MLP)(imputed_data_xs[i][train_idx], y[train_idx])
                for i in range(imputed_data_xs.shape[0]))

            # Testing
            # MGAIN
            x_test = np.mean(imputed_data_xs[1:, test_idx, :], axis=0)
            ys = Parallel(n_jobs=num_cores)(delayed(predict)(clf, x_test)
                                            for clf in clfs[1:])
            ys = np.array(ys)
            # voting
            y_test = [
                max(set(list(ys[:, i])), key=list(ys[:, i]).count)
                for i in range(ys.shape[1])
            ]
            score = accuracy_score(y[test_idx], y_test)
            sh_mlp_mgain.write(i + 1, k, np.round(score, 4))

            score = accuracy_score(
                y[test_idx], predict(clfs[0], imputed_data_xs[0][test_idx]))
            sh_mlp_gain.write(i + 1, k, np.round(score, 4))

            # # Training with LR
            # # 20 for ensemble learning MGAIN
            # # 1 for GAIN
            # clfs = Parallel(n_jobs=num_cores)(delayed(train_LR)(imputed_data_xs[i][train_idx], y[train_idx])
            #                                   for i in range(imputed_data_xs.shape[0]))
            #
            # # Testing
            # # MGAIN
            # x_test = np.mean(imputed_data_xs[1:,test_idx,:],axis=0)
            # ys = Parallel(n_jobs=num_cores)(delayed(predict)(clf, x_test)
            #                                 for clf in clfs[1:])
            # ys = np.array(ys)
            # # voting
            # y_test = [max(set(list(ys[:,i])),key=list(ys[:,i]).count) for i in range(ys.shape[1])]
            # score = accuracy_score(y[test_idx],y_test)
            # sh_lr_mgain.write(i+1,k,np.round(score,4))
            #
            # score = accuracy_score(y[test_idx], predict(clfs[0],imputed_data_xs[0][test_idx]))
            # sh_lr_gain.write(i + 1, k, np.round(score, 4))

            # Training with DCT
            # 20 for ensemble learning MGAIN
            # 1 for GAIN
            clfs = Parallel(n_jobs=num_cores)(
                delayed(train_DCT)(imputed_data_xs[i][train_idx], y[train_idx])
                for i in range(imputed_data_xs.shape[0]))

            # Testing
            # MGAIN
            x_test = np.mean(imputed_data_xs[1:, test_idx, :], axis=0)
            ys = Parallel(n_jobs=num_cores)(delayed(predict)(clf, x_test)
                                            for clf in clfs[1:])
            ys = np.array(ys)
            # voting
            y_test = [
                max(set(list(ys[:, i])), key=list(ys[:, i]).count)
                for i in range(ys.shape[1])
            ]
            score = accuracy_score(y[test_idx], y_test)
            sh_dct_mgain.write(i + 1, k, np.round(score, 4))

            score = accuracy_score(
                y[test_idx], predict(clfs[0], imputed_data_xs[0][test_idx]))
            sh_dct_gain.write(i + 1, k, np.round(score, 4))

    wb.save("./final_results/Mgain_10_parkinsons.xls")
Example #17
def main (alpha=1000, batch_size=128, hint_rate=0.5, 
  iterations=900, miss_rate=0.3):
  
  gain_parameters = {'batch_size': batch_size,
                     'hint_rate': hint_rate,
                     'alpha': alpha,
                     'iterations': iterations}
  
  # Load data and introduce missingness
  #file_name = 'data/spam.csv'
  #data_x = np.loadtxt(file_name, delimiter=",", skiprows=1)
  
  enable_transform = False
  remove_outliers = False
  n_time_points = 3
  
  data_x = pickle.load(open('./missing_data.sav', 'rb'))
  data_x = data_x.transpose().astype(float)[:,:]
  print(data_x.shape)
  # if remove_outliers:
  #  data_x = pickle.load(open('./missing_data.sav', 'rb'))
  #  data_x = data_x.transpose().astype(np.float)
  # else:
  #  data_x = pickle.load(open('./denoised_missing_data.sav', 'rb')) 

  signed_variables = ['base_excess']
  no, dim = data_x.shape
  
  data_x_encoded = np.copy(data_x)
  miss_data_x = np.copy(data_x)
  miss_data_x_enc = np.copy(data_x)
  
  scalers = []
  
  for i in range(0, dim):
      variable, var_x = variables[i], np.copy(data_x[:,i])
      encoder_model = encoders[i]
      # Exclude outliers based on error
      nn_indices = ~np.isnan(data_x_encoded[:,i])
      nn_values = data_x[:,i][nn_indices]

      scaler = MinMaxScaler()
      var_x_scaled = scaler.fit_transform(var_x.reshape((-1,1)))
    
      enc_x_scaled = encoder_model.predict(var_x_scaled)
      enc_x_unscaled = scaler.inverse_transform(enc_x_scaled)
      data_x_encoded[:,i] = enc_x_unscaled.flatten()
      
      scalers.append(scaler)
      
      if remove_outliers:
        print('Excluding outliers...')
        mse = np.mean(np.power(var_x.reshape((-1,1)) - enc_x_unscaled, 2),axis=1)
      
        x = np.ma.array(mse, mask=np.isnan(mse))
        y = np.ma.array(var_x, mask=np.isnan(var_x))
        outlier_indices = (x / np.max(y)) > 2
        
        outlier_values = var_x[outlier_indices]
        
        print('... %d outlier(s) excluded' % \
          len(outlier_values), outlier_values)
        
        miss_data_x[outlier_indices == True,i] = np.nan
        miss_data_x_enc[outlier_indices == True,i] = np.nan
      
      #print(var_x, '----', enc_x_scaled, '----', enc_x_unscaled.flatten())
      print('Loaded model for %s...' % variable)
  
  no_total = no * dim
  no_nan = np.count_nonzero(np.isnan(data_x.flatten()) == True)
  no_not_nan = no_total - no_nan
  print('Input shape', no, 'x', dim)
  print('NAN values:', no_nan, '/', no_total, \
    '%2.f%%' % (no_nan / no_total * 100))

  n_patients = int(no/n_time_points)

  if len(variables) != dim:
    print(len(variables), dim)
    print('Incompatible dimensions.')
    exit()
  
  if enable_transform:  
    print('Applying transformation...')
    transformer = MinMaxScaler()
    transformer.fit(data_x)
  
    #data_x = transformer.transform(data_x)
    #miss_data_x = transformer.transform(miss_data_x)
    miss_data_x_enc = transformer.transform(data_x_encoded)
  
  # Introduce missing data
  data_m = binary_sampler(1-miss_rate, no, dim)

  miss_data_x[data_m == 0] = np.nan
  miss_data_x_enc[data_m == 0] = np.nan

  no_nan = np.count_nonzero(np.isnan(miss_data_x.flatten()) == True)
  no_not_nan = no_total - no_nan

  print('After removal, NAN values:', no_nan, '/', no_total, \
    '%2.f%%' % (no_nan / no_total * 100))
  
  real_miss_rate = (no_nan / no_total * 100)
  
  imputed_data_x_gan = gain(
    miss_data_x_enc, gain_parameters)
  
  # n_gans = 3
  # idxg_combined = []
  # 
  # for  n_gan in range(0, n_gans):
  #   np.random.seed(n_gan + 1)
  #   idxg_combined.append(gain(miss_data_x_enc, gain_parameters))
  # 
  # idxg_combined = np.concatenate(idxg_combined)
  #   
  # idxg_combined_final = gain(
  #   miss_data_x_enc, gain_parameters)
  # 
  # for j in range(0, dim):
  #   idxg_combined_tmp = np.copy(idxg_combined)
  #   
  #   for i in range(0, n_patients * n_time_points):
  #     if np.isnan(miss_data_x[i,j]) and data_m[i,j] != 0:
  #       idxg_combined_tmp[i,j] = np.nan
  # 
  #   imputer = IterativeImputer() # KNNImputer(n_neighbors=5)
  #   idxg_knn = imputer.fit_transform(idxg_combined_tmp)
  #   idxg_combined_final[:,j] = idxg_knn[0:n_patients*n_time_points,j]
  #   print('Done KNN imputation #%d' % j)
  # 
  # imputed_data_x_gan = idxg_combined_final

  imputer = KNNImputer(n_neighbors=5)
  imputed_data_x_knn = imputer.fit_transform(miss_data_x)
  
  imputer = IterativeImputer()
  imputed_data_x_mice = imputer.fit_transform(miss_data_x)
  
  if enable_transform:
    #data_x = transformer.inverse_transform(data_x)
    #miss_data_x = transformer.inverse_transform(miss_data_x)
    imputed_data_x_gan = transformer.inverse_transform(imputed_data_x_gan)
    #imputed_data_x_knn = transformer.inverse_transform(imputed_data_x_knn)
    #imputed_data_x_mice = transformer.inverse_transform(imputed_data_x_mice)
  
  # Save imputed data to disk
  pickle.dump(imputed_data_x_gan,open('./filled_data.sav', 'wb'))
  
  # Get residuals for computation of stats
  distances_gan = np.zeros((dim, n_time_points*n_patients))
  distances_knn = np.zeros((dim, n_time_points*n_patients))
  distances_mice = np.zeros((dim, n_time_points*n_patients))

  for i in range(0, n_patients):
    for j in range(0, dim):
      variable_name = variables[j]
      i_start = int(i*n_time_points)
      i_stop = int(i*n_time_points+n_time_points)
      
      original_tuple = data_x[i_start:i_stop,j]
      corrupted_tuple = miss_data_x[i_start:i_stop,j]
      imputed_tuple_gan = imputed_data_x_gan[i_start:i_stop,j]
      imputed_tuple_knn = imputed_data_x_knn[i_start:i_stop,j]
      imputed_tuple_mice = imputed_data_x_mice[i_start:i_stop,j]
      
      if i == 1 or i == 2:
        print(original_tuple, corrupted_tuple, imputed_tuple_gan, imputed_tuple_knn)
      for k in range(0, n_time_points):
        a, b, c, d = original_tuple[k], imputed_tuple_gan[k], imputed_tuple_knn[k], imputed_tuple_mice[k]
        if np.isnan(a) or data_m[i_start+k,j] != 0: continue
        if i % 10 == 0: print(variable_name, a,b,c,d, b-a)
        distances_gan[j,i*k] = (b - a)
        distances_knn[j,i*k] = c - a
        distances_mice[j,i*k] = d - a
  
  # Compute distance statistics
  rrmses_gan, mean_biases, median_biases, bias_cis = [], [], [], []
  rrmses_knn, mean_biases_knn, median_biases_knn, bias_cis_knn = [], [], [], []
  rrmses_mice = []

  for j in range(0, dim):
    
    # Stats for original data
    dim_mean = np.mean([x for x in data_x[:,j] if not np.isnan(x)])
    dim_max = np.max([x for x in data_x[:,j] if not np.isnan(x)])

    dists_gan = distances_gan[j]
    dists_knn = distances_knn[j]
    dists_mice = distances_mice[j]
    
    #dists_gan /= dim_max
    #dists_knn /= dim_max
    #dists_mice /= dim_max
    
    # Stats for GAN
    mean_bias = np.round(np.mean(dists_gan), 4)
    median_bias = np.round(np.median(dists_gan), 4)
    mean_ci_95 = mean_confidence_interval(dists_gan)
    rmse = np.sqrt(np.mean(dists_gan**2))
    rrmse = np.round(rmse / dim_mean * 100, 2)
    
    bias_cis.append([mean_ci_95[1], mean_ci_95[2]])
    mean_biases.append(mean_bias)
    median_biases.append(median_bias)
    rrmses_gan.append(rrmse)
    
    # Stats for KNN
    rmse_knn = np.sqrt(np.mean(dists_knn**2))
    rrmses_knn.append(np.round(rmse_knn / dim_mean * 100, 2))
    
    # Stats for MICE
    rmse_mice = np.sqrt(np.mean(dists_mice**2))
    rrmses_mice.append(np.round(rmse_mice / dim_mean * 100, 2))
    
    print(variables[j], ' - rrmse: ', rrmse, 'median bias: %.2f' % median_bias,
      '%%, bias: %.2f (95%% CI, %.2f to %.2f)' % mean_ci_95)

  n_fig_rows = 6
  n_fig_cols = 6

  n_fig_total = n_fig_rows * n_fig_cols

  if dim > n_fig_total:
    print('Warning: not all variables plotted')

  fig, axes = plt.subplots(\
    n_fig_rows, n_fig_cols, figsize=(15,15))
  fig2, axes2 = plt.subplots(\
    n_fig_rows, n_fig_cols, figsize=(15,15))

  for j in range(0, dim):
    
    ax_title = variables[j]
    ax = axes[int(j/n_fig_cols), j % n_fig_cols]
    ax2 = axes2[int(j/n_fig_cols), j % n_fig_cols]
    ax.set_title(ax_title,fontdict={'fontsize':6})

    input_arrays = [data_x, imputed_data_x_gan, imputed_data_x_knn, imputed_data_x_mice]
    
    output_arrays = [
      np.asarray([input_arr[ii,j] for ii in range(0, no) if \
        (not np.isnan(data_x[ii,j]) and \
        data_m[ii,j] == 0)]) for input_arr in input_arrays
    ]
    
    deleted_values, imputed_values_gan, imputed_values_knn, imputed_values_mice = output_arrays
    
    # Make KDE
    low_ci, high_ci = bias_cis[j]
    xlabel = 'mean bias = %.2f (95%% CI, %.2f to %.2f)' % \
      (mean_biases[j], low_ci, high_ci)
      
    ax.set_xlabel(xlabel, fontsize=6)
    ax.set_ylabel('$p(x)$',fontsize=6)
    
    range_arrays = np.concatenate([deleted_values, imputed_values_gan])
    
    x_range = (np.min(range_arrays), 
      np.min([
        np.mean(range_arrays) + 3 * np.std(range_arrays), 
        np.max(range_arrays)
      ])
    )
    
    kde_kws = { 'shade': False, 'bw':'scott', 'clip': x_range }
    
    sns.distplot(imputed_values_gan, hist=False,
      kde_kws={**{ 'color': 'r'}, **kde_kws}, ax=ax)
    
    sns.distplot(imputed_values_knn, hist=False,
      kde_kws={**{ 'color': 'b', 'alpha': 0.5 }, **kde_kws},ax=ax)

    sns.distplot(imputed_values_mice, hist=False,
      kde_kws={**{ 'color': 'g', 'alpha': 0.5 }, **kde_kws},ax=ax)

    sns.distplot(deleted_values, hist=False,
      kde_kws={**{ 'color': '#000000'}, **kde_kws},ax=ax)

    # Make QQ plot
    qqplot(deleted_values, imputed_values_gan, ax=ax2, color='r')
    qqplot(deleted_values, imputed_values_knn, ax=ax2, color='b')
    qqplot(deleted_values, imputed_values_mice, ax=ax2, color='g')
    
  top_title = 'KDE plot of original data (black) and data imputed using GAN (red) and KNN (blue)'
  fig.suptitle(top_title, fontsize=8)
  fig.legend(labels=['GAN', 'KNN', 'MICE', 'Observed'])

  fig.tight_layout(rect=[0,0.03,0,1.25])
  fig.subplots_adjust(hspace=1, wspace=0.35)

  top_title = 'Q-Q plot of observed vs. predicted values'
  fig2.suptitle(top_title, fontsize=8)

  fig2.tight_layout(rect=[0,0.03,0,1.25])
  fig2.subplots_adjust(hspace=1, wspace=0.35)
  
  plt.show()

  print()
  mrrmse_gan = np.round(np.asarray(rrmses_gan).mean(), 2)
  print('Average RMSE (GAN): ', mrrmse_gan, '%')

  print()
  mrrmse_knn = np.round(np.asarray(rrmses_knn).mean(), 2)
  print('Average RMSE (KNN): ', mrrmse_knn, '%')

  print()
  mrrmse_mice = np.round(np.asarray(rrmses_mice).mean(), 2)
  print('Average RMSE (MICE): ', mrrmse_mice, '%')
  
  return real_miss_rate, mrrmse_gan, mrrmse_knn, mrrmse_mice