Example #1
def randu(bound=(0, 10), shape=(5, 5), missingness="mcar", thr=0.2, dtype="int"):
    """ Return randomly generated dataset of numbers with uniformly
    distributed values between bound[0] and bound[1]

    Parameters
    ----------
    bound: tuple (start, stop)
        Determines the range of values in the matrix. Index 0 for start
        value and index 1 for stop value. Start is inclusive, stop is
        exclusive.
    shape: tuple, optional
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float in [0, 1]
        Fraction of missing data in the generated dataset
    dtype: string in ('int', 'float')
        Type of data

    Returns
    -------
    numpy.ndarray
    """
    if dtype == "int":
        data = np.random.randint(bound[0], bound[1], size=shape).astype(float)
    elif dtype == "float":
        data = np.random.uniform(bound[0], bound[1], size=shape)
    corruptor = Corruptor(data, thr=thr)
    raw_data = getattr(corruptor, missingness)()
    return raw_data
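A minimal usage sketch (hypothetical; assumes numpy is imported as np and randu is in scope):

# Generate a 4x3 matrix of uniform integers in [0, 10) with ~20% of the
# entries removed completely at random (MCAR).
import numpy as np
data = randu(bound=(0, 10), shape=(4, 3), missingness="mcar", thr=0.2)
print(np.isnan(data).mean())  # roughly 0.2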
Example #2
def randn(theta=(0, 1), shape=(5, 5), missingness="mcar", thr=0.2, dtype="float"):
    """ Return randomly generated dataset of numbers with normally
    distributed values with given and sigma.

    Parameters
    ----------
    theta: tuple (mu, sigma)
        Mean and standard deviation of the generated values
    shape: tuple, optional
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float in [0, 1]
        Fraction of missing data in the generated dataset
    dtype: string in ('int', 'float')
        Type of data

    Returns
    -------
    numpy.ndarray
    """
    mean, sigma = theta
    data = np.random.normal(mean, sigma, size=shape)
    if dtype == "int":
        data = np.round(data)
    elif dtype == "float":
        pass
    corruptor = Corruptor(data, thr=thr)
    raw_data = getattr(corruptor, missingness)()
    return raw_data
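An analogous sketch for randn (hypothetical call):

# Standard-normal data (mu=0, sigma=1) with ~20% MCAR missingness.
data = randn(theta=(0, 1), shape=(100, 4), missingness="mcar", thr=0.2)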
Example #3
def impute_parameter_adjustment(method, param_grid, impute_radio, x_init,
                                y_init, reference_x, reference_y):
    model = joblib.load('..\\models\\vote_model_hard.joblib')
    markers = ['o', '*', '1', 's', '2']
    I = 20
    for radio, marker in zip(impute_radio, markers):
        acc_1 = {i: 0 for i in param_grid}
        acc_2 = {i: 0 for i in param_grid}
        for m in range(I):
            corruptor = Corruptor(x_init, radio)
            x_miss = getattr(corruptor, "mcar")()
            for n in param_grid:
                if method == 'knn':
                    x_impute = fancyimpute.KNN(k=n).fit_transform(
                        np.vstack(
                            (x_miss, reference_x)))[range(x_init.shape[0])]
                elif method == 'mice':
                    data_impute_list = []
                    for i in range(n):
                        imputer = fancyimpute.IterativeImputer(
                            n_iter=13, sample_posterior=True, random_state=i)
                        data_impute_list.append(
                            imputer.fit_transform(
                                np.vstack(
                                    (x_miss,
                                     reference_x)))[range(x_init.shape[0])])
                    x_impute = np.mean(data_impute_list, 0)
                    print(radio, m, n)
                elif method == 'em':
                    x_impute = em(np.vstack((x_miss, reference_x)),
                                  loops=n)[range(x_init.shape[0])]
                elif method == 'som':
                    x_impute = impute_SOM(x_miss, n)[range(x_init.shape[0])]
                y_pred1 = model.predict(x_impute)
                y_pred2 = model.predict(x_init)
                acc_1[n] += 1 - accuracy_score(y_pred1, y_pred2)
                acc_2[n] += 1 - accuracy_score(y_pred1, y_init)
        acc_1 = {i: (j / I) for i, j in acc_1.items()}
        acc_2 = {i: (j / I) for i, j in acc_2.items()}
        plt.subplot(121)
        plt.plot(list(acc_1.keys()),
                 list(acc_1.values()),
                 marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and prediction')
        plt.subplot(122)
        plt.plot(list(acc_2.keys()),
                 list(acc_2.values()),
                 marker=marker,
                 label='%.1f%%' % (radio * 100))
        plt.xlabel('K')
        plt.ylabel('CER between imputation and real label')
        plt.legend(loc=0, bbox_to_anchor=(0.3, -0.05), ncol=5)
    plt.show()
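A hypothetical invocation, reusing the joblib objects loaded in the __main__ block of Example #7 below; the parameter grid shown is illustrative only:

impute_parameter_adjustment('knn', param_grid=[1, 3, 5, 7, 11],
                            impute_radio=[0.05, 0.1, 0.2, 0.3, 0.4],
                            x_init=x_init, y_init=y_init,
                            reference_x=reference_x, reference_y=reference_y)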
Example #4
def randc(nlevels=5, shape=(5, 5), missingness="mcar", thr=0.2):
    """ Return randomly generated dataset with uniformly distributed categorical data (alphabetic character)

    Parameters
    ----------
    nlevels: int
        Specify the number of different categories in the dataset
    shape: tuple, optional
        Size of the randomly generated data
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float between [0,1]
        Percentage of missing data in generated data

    Returns
    -------
    numpy.ndarray
    """
    if shape[0] * shape[1] < nlevels:
        raise error.BadInputError(
            "nlevels exceeds the size of the desired dataset. Please decrease nlevels or increase the shape"
        )

    length = len(string.ascii_lowercase)
    n_fold = int(math.floor(math.log(nlevels, length)))
    cat_pool = list(string.ascii_lowercase)

    # when nlevels > 26, single alphabetic characters are used up, so extra multi-character strings are generated as categories
    if n_fold > 0:
        for i in range(2, n_fold + 2):
            pool_candidate = list(
                itertools.product(string.ascii_lowercase, repeat=i))
            cat_pool.extend([''.join(w) for w in pool_candidate])
            if len(cat_pool) > nlevels:
                break

    cat = random.sample(cat_pool, nlevels)
    data = np.random.choice(cat, shape, replace=True)

    # make sure the data frame has nlevel different categories
    while len(np.unique(data)) != nlevels:
        data = np.random.choice(cat, shape, replace=True)

    corruptor = Corruptor(data, thr=thr, dtype=str)
    raw_data = getattr(corruptor, missingness)()
    return raw_data
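A minimal usage sketch (hypothetical):

# Three distinct categories in a 4x4 matrix, ~20% MCAR missingness.
data = randc(nlevels=3, shape=(4, 4), missingness="mcar", thr=0.2)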
Example #5
def mnist(missingness="mcar", thr=0.2):
    """ Loads corrupted MNIST

    Parameters
    ----------
    missingness: string in ('mcar', 'mar', 'mnar')
        Type of missingness you want in your dataset
    thr: float in [0, 1]
        Fraction of missing data in the generated dataset

    Returns
    -------
    numpy.ndarray
    """
    # fetch_mldata was removed from scikit-learn; fetch_openml is its replacement
    from sklearn.datasets import fetch_openml
    dataset = fetch_openml('mnist_784', version=1, as_frame=False)
    corruptor = Corruptor(dataset.data, thr=thr)
    data = getattr(corruptor, missingness)()
    return {"X": data, "Y": dataset.target}
Example #6
    CER_result_1 = pd.DataFrame(data=None,
                                columns=['MEAN', 'MICE', 'EM', 'KNN_3', "SOM"],
                                index=[5, 10, 20, 30, 40])

    for impute_radio in [0.05, 0.1, 0.2, 0.3, 0.4]:
        mae_df = pd.DataFrame(data=None,
                              columns=['MEAN', 'MICE', 'EM', 'KNN_3', "SOM"],
                              index=np.arange(0, 20))
        cer_df = pd.DataFrame(data=None,
                              columns=['MEAN', 'MICE', 'EM', 'KNN_3', "SOM"],
                              index=np.arange(0, 20))
        cer_df_1 = pd.DataFrame(data=None,
                                columns=['MEAN', 'MICE', 'EM', 'KNN_3', "SOM"],
                                index=np.arange(0, 20))
        for j in range(20):
            corruptor = Corruptor(x_init, impute_radio)
            x_miss = getattr(corruptor, "mcar")()
            print(np.isnan(x_miss).sum())  # number of corrupted entries
            x_miss = np.vstack((x_miss, reference_x))
            col_num = 0
            # mean and MICE imputation
            for i in ['mean', 'mice']:
                print(i, 'imputation ---------------------------------')
                x_impute = impute(x_miss, i)[range(x_init.shape[0])]
                mae_df.iloc[j, col_num] = sum(sum(np.abs(x_impute - x_init)))
                cer_df.iloc[j, col_num] = CER(x_impute, x_init, y_init, 1)
                cer_df_1.iloc[j, col_num] = CER(x_impute, x_init, y_init, 2)
                col_num += 1
            # KNN imputation (n = 3, 5, 10)
            for n in [11]:
                print('KNN-', n, 'imputation ---------------------------------')
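The snippet is cut off here; a hypothetical continuation of the KNN branch, modeled on the fancyimpute.KNN call in Example #3:

                x_impute = fancyimpute.KNN(k=n).fit_transform(
                    x_miss)[range(x_init.shape[0])]
                mae_df.iloc[j, col_num] = sum(sum(np.abs(x_impute - x_init)))
                cer_df.iloc[j, col_num] = CER(x_impute, x_init, y_init, 1)
                cer_df_1.iloc[j, col_num] = CER(x_impute, x_init, y_init, 2)
                col_num += 1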
Example #7
def MLP_impute(x, y):
    # For every missing cell (i, j), build a training set from the rows where
    # all remaining features and the label are observed.
    for i, j in np.argwhere(np.isnan(x)):
        temp = np.delete(x, [j], axis=1)
        temp = np.array(pd.DataFrame(np.hstack([temp, y.reshape(-1, 1)])).dropna(axis=0))
        y_train = temp[:, -1]
        x_train = np.delete(temp, [-1], axis=1)
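MLP_impute stops after assembling the training set. A self-contained sketch of one way the idea could be finished, using scikit-learn's MLPRegressor (hypothetical, not the author's implementation; assumes each incomplete row is missing only one value):

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor

def mlp_impute_sketch(x, y):
    # Hypothetical sketch: for each missing cell (i, j), learn to predict
    # column j from the remaining features plus the label, then fill the cell.
    x = x.copy()
    for i, j in np.argwhere(np.isnan(x)):
        others = np.delete(x, [j], axis=1)                       # predictors
        full = np.hstack([others, y.reshape(-1, 1), x[:, [j]]])  # target last
        train = pd.DataFrame(full).dropna(axis=0).to_numpy()
        model = MLPRegressor(hidden_layer_sizes=(32,), max_iter=500)
        model.fit(train[:, :-1], train[:, -1])
        row = np.append(others[i], y[i]).reshape(1, -1)          # assumes row i
        x[i, j] = model.predict(row)[0]                          # has no other NaNs
    return x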





if __name__ == '__main__':
    os.chdir("..\\datas")
    x_init = joblib.load('imputation_x.joblib')
    y_init = joblib.load('imputation_y.joblib')
    reference_x = joblib.load('reference_x.joblib')
    reference_y = joblib.load('reference_y.joblib')

    corruptor = Corruptor(x_init, 0.1)
    x_miss = getattr(corruptor, "mcar")()
    x_miss = np.vstack((x_miss, reference_x))
    y_miss = pd.concat((y_init, reference_y), axis=0)

    data_df = pd.concat((pd.DataFrame(x_miss), y_miss.reset_index(drop=True)), axis=1)
    print(data_df.head())
    complete_df = data_df.dropna(axis=0, how='any')
    x_train, x_test, y_train, y_test = train_test_split(
        np.array(complete_df.drop(columns='总分')),
        np.array(complete_df['总分']),
        test_size=0.2, random_state=7)

    w, b = SLP_train(x_train, y_train, learning_rate=0.001, epochs=100, batch_size=10)