Esempio n. 1
0
def main_ide(base_path, params):
    '''Run GAIN, median, and EM imputation on a UCI dataset and report RMSE.

    Args:
      - base_path: root directory the data loader reads the dataset from
      - params: dict with keys
          'data_name'  : dataset name (e.g. letter or spam)
          'miss_rate'  : probability of missing components
          'batch_size' : batch size
          'hint_rate'  : hint rate
          'alpha'      : GAIN loss hyperparameter
          'iterations' : training iterations

    Returns:
      - imputed_data_x: (dict) imputed data keyed by method name
      - data_m: missingness mask produced by the loader
      - rmse_dict: (dict) RMSE per method, same keys as imputed_data_x
    '''
    data_name = params['data_name']
    miss_rate = params['miss_rate']

    # Hyperparameters forwarded to the GAIN imputer.
    gain_parameters = {
        'batch_size': params['batch_size'],
        'hint_rate': params['hint_rate'],
        'alpha': params['alpha'],
        'iterations': params['iterations']
    }

    # Load data and introduce missingness.
    ori_data_x, miss_data_x, data_m = data_loader(base_path, data_name,
                                                  miss_rate)

    # Impute missing data with each method.
    imputed_data_x = {
        'GAIN': gain(miss_data_x, gain_parameters),
        'Median': Impute_med(miss_data_x),
        'EM': Impute_EM(miss_data_x),
    }

    # Score every method against the original data on the masked entries.
    rmse_dict = {
        name: rmse_loss(ori_data_x, imputed, data_m)
        for name, imputed in imputed_data_x.items()
    }

    print()
    print('Parameters:')
    print(params)
    print('RMSE Performance:')
    print(rmse_dict)

    return imputed_data_x, data_m, rmse_dict
Esempio n. 2
0
def evaluation_step(generator, data_m, norm_data_x, data_x, ori_data_x,
                    normalizer):
    """Impute with the current generator and compute the validation RMSE.

    The original paper implementation has no validation schema; the RMSE
    value is used here as a metric for early stopping and for monitoring
    during-training performance of the model.
    """
    # Noise to fill the unobserved entries before feeding the generator.
    noise = uniform_sampler(0, 0.01, data_m.shape[0], data_m.shape[1])
    noise = noise.astype('float32')
    mask = data_m.astype('float32')
    observed = norm_data_x.astype('float32')
    # Observed entries keep their value; missing entries receive noise.
    gen_input = mask * observed + (1 - mask) * noise

    imputed_data = generator.predict(
        tf.concat([gen_input.values, mask.values], axis=1))[0]
    # Only the originally-missing entries take the generator's output.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data

    # Renormalization back to the original value range.
    imputed_data = normalizer.denormalize(imputed_data)

    # Rounding against the raw data (see the `rounding` helper).
    imputed_values = rounding(imputed_data.values, data_x.values)

    rmse = rmse_loss(ori_data_x.values, imputed_values, data_m.values)
    imputed_and_rounded_df_to_use_for_downstream_task = pd.DataFrame(
        data=imputed_values, columns=imputed_data.columns)
    return rmse, imputed_and_rounded_df_to_use_for_downstream_task
Esempio n. 3
0
def main(args):
    '''5-fold cross-validated PC-GAIN imputation, printing RMSE per fold.

    Args:
      - args: parsed CLI namespace with data_name, miss_rate, batch_size,
        hint_rate, alpha, beta, lambda_, k, iterations, cluster_species

    Side effects:
      - prints the rounded RMSE of each fold
    '''
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force CPU execution
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {'batch_size': args.batch_size,
                       'hint_rate': args.hint_rate,
                       'alpha': args.alpha,
                       'beta': args.beta,
                       'lambda_': args.lambda_,
                       'k': args.k,
                       'iterations': args.iterations,
                       'cluster_species': args.cluster_species}

    # Load data and introduce missingness.
    data_x, miss_data_x, data_M = data_loader(data_name, miss_rate)

    row, col = miss_data_x.shape
    five_len = row // 5  # rows per fold

    def drop_fold(arr, i):
        """Rows of `arr` with fold i (rows [i*five_len, (i+1)*five_len)) removed."""
        return np.vstack((arr[0:i * five_len, :], arr[(i + 1) * five_len:row, :]))

    # 5-fold cross validation: impute with each fold's rows held out.
    for i in range(5):
        incomplete_data_x = drop_fold(miss_data_x, i)
        complete_data_x = drop_fold(data_x, i)
        data_m = drop_fold(data_M, i)

        imputed_data = PC_GAIN(incomplete_data_x, gain_parameters, data_m)

        rmse = str(np.round(rmse_loss(complete_data_x, imputed_data, data_m), 4))
        print('RMSE Performance: ', rmse)
Esempio n. 4
0
def test(data_m, data_x, dim, generator, no, norm_data_x, norm_parameters, ori_data_x, test_index):
    """Impute with the trained generator and print test/full RMSE.

    Returns the RMSE computed on the held-out `test_index` rows.
    """
    # Noise fills the unobserved entries of the generator input.
    noise = uniform_sampler(0, 0.01, no, dim)
    gen_input = data_m * norm_data_x + (1 - data_m) * noise
    imputed_data = generator(torch.Tensor(gen_input),
                             torch.Tensor(data_m)).detach().numpy()
    # Keep observed values; use generator output only where data was missing.
    imputed_data = data_m * norm_data_x + (1 - data_m) * imputed_data
    # Renormalization back to the original scale.
    imputed_data = renormalization(imputed_data, norm_parameters)
    # Rounding against the raw data (see the `rounding` helper).
    imputed_data = rounding(imputed_data, data_x)
    rmse, rmse_mean = rmse_loss(ori_data_x[test_index],
                                imputed_data[test_index],
                                data_m[test_index])
    rmse_full, rmse_full_mean = rmse_loss(ori_data_x, imputed_data, data_m)
    print(f'RMSE Performance (mean): {rmse_mean:.4f} (test), {rmse_full_mean:.4f} (full).')
    return rmse
Esempio n. 5
0
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
      - args: parsed CLI namespace with
          data_name : letter or spam
          miss_rate : probability of missing components
          batch_size: batch size
          hint_rate : hint rate
          alpha     : GAIN loss hyperparameter
          iterations: training iterations

    Returns:
      - imputed_data_x: imputed data
      - rmse: Root Mean Squared Error on the originally-missing entries

    Side effects:
      - prints the rounded RMSE and MAE
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness.
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)

    # Impute missing data.
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # Score the imputation against the original data on the masked entries.
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    mae = mae_loss(ori_data_x, imputed_data_x, data_m)

    print()
    print('RMSE Performance: ' + str(np.round(rmse, 4)))
    print('MAE Performance: ' + str(np.round(mae, 4)))

    return imputed_data_x, rmse
Esempio n. 6
0
def main(args):
    '''Benchmark GAIN and EGAIN against classic imputers on UCI datasets.

    For every dataset listed in ``args.data_name`` (multiple names may be
    joined with '+'), runs ``args.time`` repetitions of: load data with
    induced missingness, impute with GAIN, EGAIN, mean, KNN, MICE and a
    missForest-style imputer, then record imputation error (RMSE, RMSPE)
    and downstream classifier accuracy (MLP, decision tree, logistic
    regression, SVC, SGD).

    Args:
      - args: parsed CLI namespace with data_name, miss_rate, seed, time,
        batch_size, hint_rate, alpha, iterations

    Returns:
      - imputed_data_mf: last missForest-style imputation computed
      - rmse_mf: its RMSE

    Side effects:
      - writes per-repetition matrices under data/
      - writes the accumulated result table to result.csv and result.xlsx
      - prints progress and the result table
    '''
    data_name = args.data_name
    miss_rate = args.miss_rate
    time = args.time  # number of repetitions per dataset

    requested_datasets = data_name.split("+")
    print(requested_datasets)

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
        'time': args.time
    }

    def mean_std(values):
        """Format a list of scores as 'mean ± std' (2 / 4 decimals)."""
        return f"{round(np.mean(values), 2)} ± {round(np.std(values), 4)}"

    # Header columns of the result table: metric group and method name.
    miss_rate_caption = "{}% Missing".format(int(miss_rate * 100))
    col1 = [
        miss_rate_caption, 'RMSE', 'RMSE', 'RMSE', 'RMSE', 'RMSE', 'RMSE',
        'RMSPE', 'RMSPE', 'RMSPE', 'RMSPE', 'RMSPE', 'RMSPE', '', 'MLP', 'MLP',
        'D.Tree', 'D.Tree', 'LogisticR', 'LogisticR', 'LogisticR', 'LogisticR',
        'LogisticR', 'LogisticR', 'SVC', 'SVC', 'SVC', 'SVC', 'SVC', 'SVC',
        'SGD', 'SGD', 'SGD', 'SGD', 'SGD', 'SGD'
    ]
    col2 = [
        'Method', 'EGAIN', 'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE', 'EGAIN',
        'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE', '', 'EGAIN', 'GAIN', 'EGAIN',
        'GAIN', 'EGAIN', 'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE', 'EGAIN',
        'GAIN', 'MEAN', 'KNN', 'MICE', 'M.FORE', 'EGAIN', 'GAIN', 'MEAN',
        'KNN', 'MICE', 'M.FORE'
    ]
    result = [col1, col2]

    known_datasets = [
        'obesity', 'hepatitisC', 'audit', 'letter', 'spam', 'breast',
        'credit', 'news', 'blood', 'vowel', 'ecoli', 'ionosphere',
        'parkinsons', 'seedst', 'vehicle', 'vertebral', 'wine', 'banknote',
        'balance', 'yeast', 'bean', 'shill', 'phishing', 'firewall',
        'iBeacon', 'steel'
    ]

    for data_train in requested_datasets:
        data_name = data_train
        if data_name not in known_datasets:
            print("Wrong name: {} Dataset. Skip this datasets".format(
                data_train))
            # BUG FIX: was `break`, which aborted every remaining dataset;
            # `continue` skips only the unknown one, as the message says.
            continue

        col3 = [data_name]
        print("****** {} Dataset ******".format(data_train))

        # Per-repetition metric accumulators.
        gan_rs, egain_rs, gan_mlp, gan_dt, egan_mlp, egan_dt = [], [], [], [], [], []
        gan_svc, egan_svc, gan_lr, egan_lr, gan_sgd, egan_sgd = [], [], [], [], [], []
        knn_rmse, mean_rmse, miss_rmse, mice_rmse = [], [], [], []
        gan_rmspe, egan_rmspe, knn_rmspe, mean_rmspe, miss_rmspe, mice_rmspe = [], [], [], [], [], []
        knn_lr, knn_svc, knn_sgd, mean_lr, mean_svc, mean_sgd = [], [], [], [], [], []
        miss_lr, miss_svc, miss_sgd, mice_lr, mice_svc, mice_sgd = [], [], [], [], [], []

        for i in range(time):
            # Load data and introduce missingness.  The loader seed is
            # fixed at 42, so every repetition imputes the same missing
            # pattern; only the train/test split varies with i.
            ori_data_x, miss_data_x, data_m, y = data_loader3(
                data_name, miss_rate, 42)
            train_idx, test_idx = train_test_split(
                range(len(y)), test_size=0.2, stratify=y, random_state=i)
            miss_data_x2 = miss_data_x
            if i % 5 == 0:
                print('=== Working on {}/{} ==='.format(i, time))

            # GAN-based imputation.
            imputed_data_x = gain(miss_data_x2, gain_parameters)
            imputed_data_x_e = egain(miss_data_x2, gain_parameters)

            # Classic baselines; outputs rounded to integer-valued features.
            imp_MEAN = SimpleImputer(missing_values=np.nan, strategy='mean')
            imputed_data_x_mean = imp_MEAN.fit_transform(miss_data_x).round()

            imp_KNN = KNNImputer(missing_values=np.nan, n_neighbors=3)
            imputed_data_x_knn = imp_KNN.fit_transform(miss_data_x).round()

            # ExtraTreesRegressor: similar to missForest in R.
            imp_mf = IterativeImputer(estimator=ExtraTreesRegressor(),
                                      max_iter=1,
                                      initial_strategy="constant",
                                      n_nearest_features=1,
                                      imputation_order='descending')
            imputed_data_mf = imp_mf.fit_transform(miss_data_x).round()

            imp_mice = IterativeImputer(estimator=BayesianRidge(),
                                        max_iter=1,
                                        initial_strategy='constant',
                                        n_nearest_features=1,
                                        imputation_order='descending')
            imputed_data_mice = imp_mice.fit_transform(miss_data_x).round()

            # RMSE of each imputer on the masked entries.
            rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
            rmse_e = rmse_loss(ori_data_x, imputed_data_x_e, data_m)
            rmse_mean = rmse_loss(ori_data_x, imputed_data_x_mean, data_m)
            rmse_knn = rmse_loss(ori_data_x, imputed_data_x_knn, data_m)
            rmse_mf = rmse_loss(ori_data_x, imputed_data_mf, data_m)
            rmse_mice = rmse_loss(ori_data_x, imputed_data_mice, data_m)

            gan_rs.append(rmse)
            egain_rs.append(rmse_e)
            mean_rmse.append(rmse_mean)
            knn_rmse.append(rmse_knn)
            mice_rmse.append(rmse_mice)
            miss_rmse.append(rmse_mf)

            # RMSPE of each imputer.
            gan_rmspe.append(rmspe_loss(ori_data_x, imputed_data_x, data_m))
            egan_rmspe.append(rmspe_loss(ori_data_x, imputed_data_x_e, data_m))
            mean_rmspe.append(rmspe_loss(ori_data_x, imputed_data_x_mean, data_m))
            knn_rmspe.append(rmspe_loss(ori_data_x, imputed_data_x_knn, data_m))
            miss_rmspe.append(rmspe_loss(ori_data_x, imputed_data_mf, data_m))
            mice_rmspe.append(rmspe_loss(ori_data_x, imputed_data_mice, data_m))

            # Persist this repetition's matrices for later inspection.
            mi_data = miss_data_x.astype(float)
            np.savetxt("data/{}missing_data.csv".format(i),
                       mi_data,
                       delimiter=',',
                       fmt='%1.2f')
            np.savetxt("data/{}imputed_data_gain.csv".format(i),
                       imputed_data_x,
                       delimiter=',',
                       fmt='%d')
            np.savetxt("data/{}imputed_data_egain.csv".format(i),
                       imputed_data_x_e,
                       delimiter=',',
                       fmt='%d')

            # Normalize all imputations before downstream classification.
            imputed_data_x, _ = normalization(imputed_data_x)
            imputed_data_x_e, _ = normalization(imputed_data_x_e)
            imputed_data_x_mean, _ = normalization(imputed_data_x_mean)
            imputed_data_x_knn, _ = normalization(imputed_data_x_knn)
            imputed_data_mf, _ = normalization(imputed_data_mf)
            imputed_data_mice, _ = normalization(imputed_data_mice)

            # Downstream classifier accuracy.  Call order is preserved from
            # the original in case the classifiers consume a shared RNG.
            gan_mlp.append(clf_MLP(imputed_data_x, y, train_idx, test_idx))
            egan_mlp.append(clf_MLP(imputed_data_x_e, y, train_idx, test_idx))

            gan_dt.append(clf_DT(imputed_data_x, y, train_idx, test_idx))
            egan_dt.append(clf_DT(imputed_data_x_e, y, train_idx, test_idx))

            gan_lr.append(clf_LR(imputed_data_x, y, train_idx, test_idx))
            egan_lr.append(clf_LR(imputed_data_x_e, y, train_idx, test_idx))
            mean_lr.append(clf_LR(imputed_data_x_mean, y, train_idx, test_idx))
            knn_lr.append(clf_LR(imputed_data_x_knn, y, train_idx, test_idx))
            miss_lr.append(clf_LR(imputed_data_mf, y, train_idx, test_idx))
            mice_lr.append(clf_LR(imputed_data_mice, y, train_idx, test_idx))

            mean_svc.append(clf_SVC(imputed_data_x_mean, y, train_idx, test_idx))
            knn_svc.append(clf_SVC(imputed_data_x_knn, y, train_idx, test_idx))
            miss_svc.append(clf_SVC(imputed_data_mf, y, train_idx, test_idx))
            mice_svc.append(clf_SVC(imputed_data_mice, y, train_idx, test_idx))
            gan_svc.append(clf_SVC(imputed_data_x, y, train_idx, test_idx))
            egan_svc.append(clf_SVC(imputed_data_x_e, y, train_idx, test_idx))

            mean_sgd.append(clf_SGD(imputed_data_x_mean, y, train_idx, test_idx))
            knn_sgd.append(clf_SGD(imputed_data_x_knn, y, train_idx, test_idx))
            miss_sgd.append(clf_SGD(imputed_data_mf, y, train_idx, test_idx))
            mice_sgd.append(clf_SGD(imputed_data_mice, y, train_idx, test_idx))
            gan_sgd.append(clf_SGD(imputed_data_x, y, train_idx, test_idx))
            egan_sgd.append(clf_SGD(imputed_data_x_e, y, train_idx, test_idx))

        print()
        print("Datasets: ", data_name)
        # Append this dataset's column in the order declared by col1/col2:
        # RMSE block, RMSPE block, separator, then MLP/DT/LR/SVC/SGD blocks.
        for scores in (egain_rs, gan_rs, mean_rmse, knn_rmse, mice_rmse,
                       miss_rmse, egan_rmspe, gan_rmspe, mean_rmspe,
                       knn_rmspe, mice_rmspe, miss_rmspe):
            col3.append(mean_std(scores))
        col3.append([])
        for scores in (egan_mlp, gan_mlp, egan_dt, gan_dt,
                       egan_lr, gan_lr, mean_lr, knn_lr, mice_lr, miss_lr,
                       egan_svc, gan_svc, mean_svc, knn_svc, mice_svc,
                       miss_svc, egan_sgd, gan_sgd, mean_sgd, knn_sgd,
                       mice_sgd, miss_sgd):
            col3.append(mean_std(scores))

        result.append(col3)
        # dtype=object is required: the rows mix strings with the empty-list
        # separator, and numpy >= 1.24 raises on such inhomogeneous input.
        my_array = np.asarray(result, dtype=object)
        df_result = pd.DataFrame(my_array)
        df_result_tran = df_result.transpose()
        print(df_result_tran.to_string(index=False, header=False))
        df_result_tran.to_csv("result.csv", index=False, header=False)
        # BUG FIX: DataFrame.to_excel no longer accepts `encoding=`, and the
        # legacy .xls (xlwt) writer was removed from pandas, so the table is
        # written as .xlsx instead.
        df_result_tran.to_excel("result.xlsx", index=False, header=False)

    # NOTE(review): these names are bound inside the repetition loop; the
    # return raises NameError if no known dataset was processed, matching
    # the original behavior.
    return imputed_data_mf, rmse_mf
Esempio n. 7
0
def main(args):
    '''Main function for UCI letter and spam datasets.
  
  Args:
    - data_name: letter or spam
    - miss_rate: probability of missing components
    - batch:size: batch size
    - hint_rate: hint rate
    - alpha: hyperparameter
    - iterations: iterations
    
  Returns:
    - imputed_data_x: imputed data
    - rmse: Root Mean Squared Error
  '''

    data_name = args.data_name
    miss_rate = args.miss_rate
    random = args.seed
    time = args.time

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations,
        'time': args.time
    }
    # Load data and introduce missingness

    #ori_data_x, miss_data_x, data_m = data_loader2(data_name, miss_rate,random)


    gan_rs, egain_rs, mice_rs,miss_rs, gan_mlp, gan_dt, egan_mlp, egan_dt = [],[],[],[],[],[],[],[]
    gan_svc, egan_svc, gan_lr, egan_lr, gan_sgd, egan_sgd, gan_gau, egan_gau = [],[],[],[],[],[],[],[]
    knn_rmse, mean_rmse, miss_rmse, mice_rmse = [], [], [], []
    knn_lr, knn_svc, knn_sgd, mean_lr, mean_svc, mean_sgd =    [],[],[],[],[],[]
    miss_lr, miss_svc, miss_sgd, mice_lr, mice_svc, mice_sgd = [],[],[],[],[],[]

    for i in range(time):
        # Load data and introduce missingness
        ori_data_x, miss_data_x, data_m, y = data_loader3(
            data_name, miss_rate, i)
        train_idx, test_idx = train_test_split(range(len(y)),
                                               test_size=0.3,
                                               stratify=y,
                                               random_state=42)
        miss_data_x2 = miss_data_x * 10000
        if i % 5 == 0:
            print('=== Working on {}/{} ==='.format(i, time))

        # Impute missing data
        imputed_data_x1 = gain(miss_data_x2, gain_parameters)
        imputed_data_x_e1 = egain(miss_data_x2, gain_parameters)
        imputed_data_x = imputed_data_x1 * 1 / 10000
        imputed_data_x_e = imputed_data_x_e1 * 1 / 10000

        imp_MEAN = SimpleImputer(missing_values=np.nan, strategy='mean')
        imputed_data_x_mean = imp_MEAN.fit_transform(miss_data_x)
        #imputed_data_x_mean = imp_MEAN.fit_transform(miss_data_x2)  *1/10000

        imp_KNN = KNNImputer(missing_values=np.nan)
        imputed_data_x_knn = imp_KNN.fit_transform(miss_data_x)  # *1/10000

        imp_mf = IterativeImputer(estimator=DecisionTreeRegressor(),
                                  max_iter=3)  #20
        imputed_data_mf = imp_mf.fit_transform(miss_data_x)  #*1/10000

        imp_mice = IterativeImputer(estimator=BayesianRidge(), max_iter=3)  #20
        imputed_data_mice = imp_mice.fit_transform(miss_data_x)  #*1/10000

        # Report the RMSE performance
        rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
        rmse_e = rmse_loss(ori_data_x, imputed_data_x_e, data_m)
        rmse_mean = rmse_loss(ori_data_x, imputed_data_x_mean, data_m)
        rmse_knn = rmse_loss(ori_data_x, imputed_data_x_knn, data_m)
        rmse_mf = rmse_loss(ori_data_x, imputed_data_mf, data_m)
        rmse_mice = rmse_loss(ori_data_x, imputed_data_mice, data_m)

        gan_rs.append(rmse)
        egain_rs.append(rmse_e)

        mean_rmse.append(rmse_mean)
        knn_rmse.append(rmse_knn)
        mice_rmse.append(rmse_mice)
        miss_rmse.append(rmse_mf)

        mi_data = miss_data_x.astype(float)
        no, dim = imputed_data_mice.shape
        miss_data = np.reshape(mi_data, (no, dim))
        np.savetxt("data/missing_data.csv",
                   mi_data,
                   delimiter=',',
                   fmt='%1.2f')
        np.savetxt("data/imputed_data_gain.csv",
                   imputed_data_x,
                   delimiter=',',
                   fmt='%d')
        np.savetxt("data/imputed_data_egain.csv",
                   imputed_data_x_e,
                   delimiter=',',
                   fmt='%d')

        imputed_data_x, _ = normalization(imputed_data_x)
        imputed_data_x_e, _ = normalization(imputed_data_x_e)

        imputed_data_x_mean, _ = normalization(imputed_data_x_mean)
        imputed_data_x_knn, _ = normalization(imputed_data_x_knn)
        imputed_data_mf, _ = normalization(imputed_data_mf)
        imputed_data_mice, _ = normalization(imputed_data_mice)

        #gan_score_mlp  = clf_MLP(imputed_data_x  , y, train_idx, test_idx)
        #egan_score_mlp = clf_MLP(imputed_data_x_e, y, train_idx, test_idx)
        #gan_mlp.append(gan_score_mlp)
        #egan_mlp.append(egan_score_mlp)

        #gan_score_dt   = clf_DT(imputed_data_x    , y, train_idx, test_idx)
        #egan_score_dt  = clf_DT(imputed_data_x_e  , y, train_idx, test_idx)
        #gan_dt.append(gan_score_dt)
        #egan_dt.append(egan_score_dt)

        gan_score_lr = clf_LR(imputed_data_x, y, train_idx, test_idx)
        egan_score_lr = clf_LR(imputed_data_x_e, y, train_idx, test_idx)

        mean_score_lr = clf_LR(imputed_data_x_mean, y, train_idx, test_idx)
        knn_score_lr = clf_LR(imputed_data_x_knn, y, train_idx, test_idx)
        miss_score_lr = clf_LR(imputed_data_mf, y, train_idx, test_idx)
        mice_score_lr = clf_LR(imputed_data_mice, y, train_idx, test_idx)

        gan_lr.append(gan_score_lr)
        egan_lr.append(egan_score_lr)
        mean_lr.append(mean_score_lr)
        knn_lr.append(knn_score_lr)
        miss_lr.append(miss_score_lr)
        mice_lr.append(mice_score_lr)

        mean_score_svc = clf_SVC(imputed_data_x_mean, y, train_idx, test_idx)
        knn_score_svc = clf_SVC(imputed_data_x_knn, y, train_idx, test_idx)
        miss_score_svc = clf_SVC(imputed_data_mf, y, train_idx, test_idx)
        mice_score_svc = clf_SVC(imputed_data_mice, y, train_idx, test_idx)
        mean_svc.append(mean_score_svc)
        knn_svc.append(knn_score_svc)
        miss_svc.append(miss_score_svc)
        mice_svc.append(mice_score_svc)

        gan_score_svc = clf_SVC(imputed_data_x, y, train_idx, test_idx)
        egan_score_svc = clf_SVC(imputed_data_x_e, y, train_idx, test_idx)
        gan_svc.append(gan_score_svc)
        egan_svc.append(egan_score_svc)

        mean_score_sgd = clf_SGD(imputed_data_x_mean, y, train_idx, test_idx)
        knn_score_sgd = clf_SGD(imputed_data_x_knn, y, train_idx, test_idx)
        miss_score_sgd = clf_SGD(imputed_data_mf, y, train_idx, test_idx)
        mice_score_sgd = clf_SGD(imputed_data_mice, y, train_idx, test_idx)
        mean_sgd.append(mean_score_sgd)
        knn_sgd.append(knn_score_sgd)
        miss_sgd.append(miss_score_sgd)
        mice_sgd.append(mice_score_sgd)

        gan_score_sgd = clf_SGD(imputed_data_x, y, train_idx, test_idx)
        egan_score_sgd = clf_SGD(imputed_data_x_e, y, train_idx, test_idx)
        gan_sgd.append(gan_score_sgd)
        egan_sgd.append(egan_score_sgd)

        #gan_score_gau   = clf_GAU(imputed_data_x    , y, train_idx, test_idx)
        #egan_score_gau  = clf_GAU(imputed_data_x_e  , y, train_idx, test_idx)
        #gan_gau.append(gan_score_gau)
        #egan_gau.append(egan_score_gau)

    print()
    print("Datasets: ", data_name)
    #print(gan_rs,egain_rs, mice_rs,miss_rs)
    print('RMSE  GAIN: {} ± {}'.format(round(np.mean(gan_rs) * 1, 2),
                                       round(np.std(gan_rs), 4)))
    print('RMSE EGAIN: {} ± {}'.format(round(np.mean(egain_rs) * 1, 2),
                                       round(np.std(egain_rs), 4)))
    print('RMSE  MEAN: {} ± {}'.format(round(np.mean(mean_rmse) * 1, 2),
                                       round(np.std(mean_rmse), 4)))
    print('RMSE   KNN: {} ± {}'.format(round(np.mean(knn_rmse) * 1, 2),
                                       round(np.std(knn_rmse), 4)))
    print('RMSE  MICE: {} ± {}'.format(round(np.mean(mice_rmse) * 1, 2),
                                       round(np.std(mice_rmse), 4)))
    print('RMSE MFORE: {} ± {}'.format(round(np.mean(miss_rmse) * 1, 2),
                                       round(np.std(miss_rmse), 4)))
    #print()
    #print('MLP   GAIN: {} ± {}'.format(round(np.mean(gan_mlp)*1,2), round(np.std(gan_mlp),4)))
    #print('MLP  EGAIN: {} ± {}'.format(round(np.mean(egan_mlp)*1,2), round(np.std(egan_mlp),4)))
    #print()
    #print('DT    GAIN: {} ± {}'.format(round(np.mean(gan_dt)*1,2), round(np.std(gan_dt),4)))
    #print('DT   EGAIN: {} ± {}'.format(round(np.mean(egan_dt)*1,2), round(np.std(egan_dt),4)))
    print()
    print('LR    GAIN: {} ± {}'.format(round(np.mean(gan_lr) * 1, 2),
                                       round(np.std(gan_lr), 4)))
    print('LR   EGAIN: {} ± {}'.format(round(np.mean(egan_lr) * 1, 2),
                                       round(np.std(egan_lr), 4)))
    print('LR    MEAN: {} ± {}'.format(round(np.mean(mean_lr) * 1, 2),
                                       round(np.std(mean_lr), 4)))
    print('LR     KNN: {} ± {}'.format(round(np.mean(knn_lr) * 1, 2),
                                       round(np.std(knn_lr), 4)))
    print('LR    MICE: {} ± {}'.format(round(np.mean(mice_lr) * 1, 2),
                                       round(np.std(mice_lr), 4)))
    print('LR MISSFOR: {} ± {}'.format(round(np.mean(miss_lr) * 1, 2),
                                       round(np.std(miss_lr), 4)))
    print()
    print('SVC   GAIN: {} ± {}'.format(round(np.mean(gan_svc) * 1, 2),
                                       round(np.std(gan_svc), 4)))
    print('SVC  EGAIN: {} ± {}'.format(round(np.mean(egan_svc) * 1, 2),
                                       round(np.std(egan_svc), 4)))
    print('SVC   MEAN: {} ± {}'.format(round(np.mean(mean_svc) * 1, 2),
                                       round(np.std(mean_svc), 4)))
    print('SVC    KNN: {} ± {}'.format(round(np.mean(knn_svc) * 1, 2),
                                       round(np.std(knn_svc), 4)))
    print('SVC   MICE: {} ± {}'.format(round(np.mean(mice_svc) * 1, 2),
                                       round(np.std(mice_svc), 4)))
    print('SVC   MISS: {} ± {}'.format(round(np.mean(miss_svc) * 1, 2),
                                       round(np.std(miss_svc), 4)))
    print()
    print('SGD   GAIN: {} ± {}'.format(round(np.mean(gan_sgd) * 1, 2),
                                       round(np.std(gan_sgd), 4)))
    print('SGD  EGAIN: {} ± {}'.format(round(np.mean(egan_sgd) * 1, 2),
                                       round(np.std(egan_sgd), 4)))
    print('SGD   MEAN: {} ± {}'.format(round(np.mean(mean_sgd) * 1, 2),
                                       round(np.std(mean_sgd), 4)))
    print('SGD    KNN: {} ± {}'.format(round(np.mean(knn_sgd) * 1, 2),
                                       round(np.std(knn_sgd), 4)))
    print('SGD   MICE: {} ± {}'.format(round(np.mean(mice_sgd) * 1, 2),
                                       round(np.std(mice_sgd), 4)))
    print('SGD   MISS: {} ± {}'.format(round(np.mean(miss_sgd) * 1, 2),
                                       round(np.std(miss_sgd), 4)))
    #print()
    #print('GAU   GAIN: {} ± {}'.format(round(np.mean(gan_gau)*1,2), round(np.std(gan_dt),4)))
    #print('GAU  EGAIN: {} ± {}'.format(round(np.mean(egan_gau)*1,2), round(np.std(egan_dt),4)))

    # MissForest

    #print()
    #print('=== MissForest RMSE ===')
    #data = miss_data_x
    #imp_mean = MissForest(max_iter = 1)
    #miss_f = imp_mean.fit_transform(data)
    #miss_f = pd.DataFrame(imputed_train_df)
    #rmse_MF = rmse_loss (ori_data_x, miss_f, data_m)
    #print('RMSE Performance: ' + str(np.round(rmse_MF, 6)))
    #np.savetxt("data/imputed_data_MF.csv",miss_f, delimiter=',',  fmt='%d')
    #print( 'Save results in Imputed_data_MF.csv')

    # MICE From Auto Impute
    #print()
    #print('=== MICE of Auto Impute RMSE ===')
    #data_mice = pd.DataFrame(miss_data_x)
    #mi = MiceImputer(k=1, imp_kwgs=None, n=1, predictors='all', return_list=True,
    #      seed=None, strategy='interpolate', visit='default')
    #mice_out = mi.fit_transform(data_mice)
    #c = [list(x) for x in mice_out]
    #c1= c[0]
    #c2=c1[1]
    #c3=np.asarray(c2)
    #mice_x=c3
    #print('here :', mice_x, miss_f, miss_f.shape)
    #rmse_MICE = rmse_loss (ori_data_x, mice_x, data_m)
    #print('=== MICE of Auto Impute RMSE ===')
    #print('RMSE Performance: ' + str(np.round(rmse_MICE, 6)))
    #np.savetxt("data/imputed_data_MICE.csv",mice_x, delimiter=',',  fmt='%d')
    #print( 'Save results in Imputed_data_MICE.csv')

    return imputed_data_mf, rmse_mf
Esempio n. 8
0
def main():
    """Benchmark GAIN imputation on UCI datasets and log results to Excel.

    Each dataset is imputed ``n_times`` times; per-run RMSE (and, for the
    smaller datasets, cross-validated classifier accuracies) is written to
    ``GAIN_results_15.xls``.
    """
    data_names = ['letter', 'spam']
    # data_names = ['breasttissue','glass', 'thyroid']
    # data with continuous feature and not originally missing
    print(len(data_names))

    miss_rate = 0.2
    n_times = 3
    gain_parameters = {'batch_size': 64, 'alpha': 100, 'iterations': 1000}

    wb = xlwt.Workbook()
    sh_rmse = wb.add_sheet("GAIN_rmse")
    sh_acc_dct = wb.add_sheet("GAIN_acc_dct")
    sh_acc_knn = wb.add_sheet("GAIN_acc_knn")
    sh_acc_nb = wb.add_sheet("GAIN_acc_nb")
    sh_acc_lr = wb.add_sheet("GAIN_acc_lr")
    acc_sheets = (sh_acc_dct, sh_acc_knn, sh_acc_nb, sh_acc_lr)

    for k, data_name in enumerate(data_names):
        print("Dataset: ", data_name)
        rmse = []
        ori_data_x, y, miss_data_x, m = data_loader(data_name, miss_rate)

        # Column header: dataset name on row 0 of every sheet.
        for sheet in (sh_rmse,) + acc_sheets:
            sheet.write(0, k, data_name)

        for i in range(n_times):
            # Impute missing data and record this run's RMSE.
            imputed_data_x = gain(miss_data_x, gain_parameters)
            imputed_data_x, _ = normalization(imputed_data_x)
            rmse.append(rmse_loss(ori_data_x, imputed_data_x, m))

            print('{:2d}/{:2d}'.format(i + 1, n_times), end=':')
            print('RMSE = ' + str(np.round(rmse[-1], 4)))
            sh_rmse.write(i + 1, k, str(np.round(rmse[-1], 4)))

            # The two large datasets skip the (slow) classification study.
            if data_name in ['letter', 'spam']:
                continue

            scf = StratifiedShuffleSplit(n_splits=10)
            classifiers = (DecisionTreeClassifier(),
                           KNeighborsClassifier(),
                           GaussianNB(),
                           LogisticRegression(max_iter=1000))
            # Same evaluation order as before: DCT, KNN, NB, LR — one
            # result sheet per classifier.
            for clf, sheet in zip(classifiers, acc_sheets):
                scores = cross_val_score(clf, imputed_data_x, y,
                                         cv=scf, scoring='accuracy')
                print(scores)
                sheet.write(i + 1, k, str(np.round(np.mean(scores), 4)))

        print("---------------------------")
    wb.save('GAIN_results_15.xls')
Esempio n. 9
0
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
      - data_name: letter or spam
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
      - rmse: Root Mean Squared Error
    '''

    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data and introduce missingness
    ori_data_x, miss_data_x, data_m, data_y = data_loader(data_name, miss_rate)

    # Impute missing data
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # For the test datasets, keep only the imputed *test* rows (the tail of
    # the combined train+test matrix); the start row depends on where the
    # test portion begins in each file.
    # NOTE(review): assumes the imputed matrix has exactly 1311 rows for
    # these datasets — confirm against the data files.
    test_start = {
        'vals_test_df': 918,
        'vals_test_df_test_type1': 495,
        'vals_test_df_test_type2': 816,
    }.get(data_name)
    if test_start is not None:
        # Plain slicing replaces the original fancy indexing with
        # ``range(start, 1311)`` — same rows, no intermediate index array.
        imputed_data_x = imputed_data_x[test_start:1311, :]

    # Prepend the label column and persist the imputed dataset.
    imputed_data_df = pd.concat(
        [pd.DataFrame(data_y), pd.DataFrame(imputed_data_x)],
        ignore_index=True,
        axis=1)
    imputed_data_df.to_csv("GAN_imputated_catalogueData1.csv", index=False)

    # Report the RMSE performance
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)

    print()
    print('RMSE Performance: ' + str(np.round(rmse, 4)))

    return imputed_data_x, rmse
Esempio n. 10
0
def main():
    """Compare Mean/KNN/GAIN/EGAIN imputation on two UCI datasets.

    Mean and KNN imputation are deterministic and run once per dataset;
    GAIN and EGAIN are stochastic and repeated ``n_times``. Every run's
    RMSE goes into a per-method Excel workbook (``<METHOD>_test.xls``).
    The per-classifier accuracy sheets are created for layout parity but
    are currently left empty (the classification study was disabled).
    """

    def _make_book(tag):
        # One workbook per imputation method, one sheet per metric.
        # sheets[0] is the RMSE sheet; the rest are classifier-accuracy
        # placeholders (dct, knn, nb, lr), matching the original layout.
        wb = xlwt.Workbook()
        sheets = [wb.add_sheet("{}_{}".format(tag, metric))
                  for metric in ('rmse', 'acc_dct', 'acc_knn',
                                 'acc_nb', 'acc_lr')]
        return wb, sheets

    def _label(sheets, col, name):
        # Column header: dataset name on row 0 of every sheet.
        for sh in sheets:
            sh.write(0, col, name)

    data_names = ['letter', 'spam']
    # data with continuous feature and not originally missing
    print(len(data_names))

    miss_rate = 0.2
    gain_parameters = {'batch_size': 64, 'alpha': 100, 'iterations': 1000}
    n_times = 30

    wb_gain, gain_sheets = _make_book('GAIN')
    wb_egain, egain_sheets = _make_book('EGAIN')
    wb_mean, mean_sheets = _make_book('MEAN')
    wb_knn, knn_sheets = _make_book('KNN')

    for k, data_name in enumerate(data_names):
        ori_data_x, y, miss_data_x, m = data_loader(data_name, miss_rate)
        print("Dataset: ", data_name)

        # --------------------- Mean imputation ---------------------
        print('Mean imputation')
        _label(mean_sheets, k, data_name)
        imputed = SimpleImputer(missing_values=np.nan,
                                strategy='mean').fit_transform(miss_data_x)
        mean_sheets[0].write(
            1, k, np.round(rmse_loss(ori_data_x, imputed, m), 4))

        # --------------------- KNN imputation ----------------------
        print('KNN imputation')
        _label(knn_sheets, k, data_name)
        imputed = KNNImputer(missing_values=np.nan).fit_transform(miss_data_x)
        knn_sheets[0].write(
            1, k, np.round(rmse_loss(ori_data_x, imputed, m), 4))

        # --------------------- GAIN imputation ---------------------
        print('GAIN imputation')
        _label(gain_sheets, k, data_name)
        for i in tqdm(range(n_times)):
            imputed = gain(miss_data_x, gain_parameters)
            gain_sheets[0].write(
                i + 1, k, np.round(rmse_loss(ori_data_x, imputed, m), 4))

        # --------------------- EGAIN imputation --------------------
        print('EGAIN imputation')
        _label(egain_sheets, k, data_name)
        for i in tqdm(range(n_times)):
            imputed = Egain(miss_data_x, gain_parameters)
            egain_sheets[0].write(
                i + 1, k, np.round(rmse_loss(ori_data_x, imputed, m), 4))

    wb_gain.save('GAIN_test.xls')
    wb_egain.save('EGAIN_test.xls')
    wb_mean.save('MEAN_test.xls')
    wb_knn.save('KNN_test.xls')
Esempio n. 11
0
def main(args):
    '''Main function for UCI letter and spam datasets.

    Args:
      - data_name: letter or spam
      - miss_rate: probability of missing components
      - batch_size: batch size
      - hint_rate: hint rate
      - alpha: hyperparameter
      - iterations: iterations

    Returns:
      - imputed_data_x: imputed data
      - rmse: Root Mean Squared Error
    '''

    data_name = args.data_name
    miss_rate = args.miss_rate

    gain_parameters = {
        'batch_size': args.batch_size,
        'hint_rate': args.hint_rate,
        'alpha': args.alpha,
        'iterations': args.iterations
    }

    # Load data, introduce missingness and impute with GAIN.
    ori_data_x, miss_data_x, data_m = data_loader(data_name, miss_rate)
    imputed_data_x = gain(miss_data_x, gain_parameters)

    # GAIN RMSE against the original (pre-missingness) data.
    rmse = rmse_loss(ori_data_x, imputed_data_x, data_m)
    print()

    # Persist the matrix with missing entries for inspection.
    mi_data = miss_data_x.astype(float)
    no, dim = imputed_data_x.shape
    miss_data = np.reshape(mi_data, (no, dim))
    np.savetxt("data/missing_data.csv", mi_data, delimiter=',', fmt='%1.2f')
    print('Shape of miss data: ', miss_data.shape)
    print('Save results in missing_data.csv')

    print()
    print('=== GAIN RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse, 6)))
    np.savetxt("data/imputed_data.csv",
               imputed_data_x,
               delimiter=',',
               fmt='%d')
    print('Save results in Imputed_data.csv')

    # MissForest baseline on the same missing data.
    print()
    print('=== MissForest RMSE ===')
    miss_f = MissForest(max_iter=5).fit_transform(miss_data_x)
    rmse_MF = rmse_loss(ori_data_x, miss_f, data_m)
    print('RMSE Performance: ' + str(np.round(rmse_MF, 6)))
    np.savetxt("data/imputed_data_MF.csv", miss_f, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MF.csv')

    # MICE (autoimpute) baseline.
    print()
    print('=== MICE of Auto Impute RMSE ===')
    mi = MiceImputer(k=1,
                     imp_kwgs=None,
                     n=1,
                     predictors='all',
                     return_list=True,
                     seed=None,
                     strategy='default predictive',
                     visit='default')
    mice_out = mi.fit_transform(pd.DataFrame(miss_data_x))
    # Unpack the single imputation: each element of the returned list is
    # presumably an (index, DataFrame) pair — take the frame from the first.
    mice_x = np.asarray(list(mice_out[0])[1])
    rmse_MICE = rmse_loss(ori_data_x, mice_x, data_m)
    print('=== MICE of Auto Impute RMSE ===')
    print('RMSE Performance: ' + str(np.round(rmse_MICE, 6)))
    np.savetxt("data/imputed_data_MICE.csv", mice_x, delimiter=',', fmt='%d')
    print('Save results in Imputed_data_MICE.csv')

    return imputed_data_x, rmse
Esempio n. 12
0
# np.loadtxt(os.path.join(os.path.join(os.getcwd(), '[10] data/' + data_name + '_miss' + '.csv'), delimiter=",", skiprows=1)


# Keep the GAIN RMSE computed earlier for later comparison against the
# baseline imputers below.
# NOTE(review): `rmse`, `miss_data_x`, `ori_data_x` and `data_m` are assumed
# to be defined earlier in this script — confirm before running standalone.
rmse_gain = rmse


#%%
# Mean imputation
from sklearn.impute import SimpleImputer

# NOTE(review): despite the section title, the strategy used here is
# *median*, not mean.
med_imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
med_imputer = med_imputer.fit(miss_data_x)
imputed_data_med = med_imputer.transform(miss_data_x)

# Report the RMSE performance
rmse_med = rmse_loss(ori_data_x, imputed_data_med, data_m)

print()
print('RMSE Performance: ' + str(np.round(rmse_med, 4)))


#%%
# EM imputation
import impyute as impy

# `data_missing` is built but not used below; impy.em runs on the raw array.
data_missing = pd.DataFrame(miss_data_x)
em_imputed = impy.em(miss_data_x)

rmse_em = rmse_loss(ori_data_x, em_imputed, data_m)

print()