Example #1
def test_knn_imputation_zero_p2():
    # Test with an imputable matrix and also compare with missing_values="NaN"
    X_zero = np.array([
        [1, 0, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 0],
        [6, 6, 0, 6, 6],
    ])

    X_nan = np.array([
        [1, np.nan, 1,      1,      1.],
        [2, 2,      2,      2,      2],
        [3, 3,      3,      3,      np.nan],
        [6, 6,      np.nan, 6,      6],
    ])
    statistics_mean = np.nanmean(X_nan, axis=0)

    X_imputed = np.array([
        [1, 2.5,    1,   1, 1.],
        [2, 2,      2,   2, 2],
        [3, 3,      3,   3, 1.5],
        [6, 6,      2.5, 6, 6],
    ])

    imputer_zero = KNNImputer(missing_values=0, n_neighbors=2,
                              weights="uniform")

    imputer_nan = KNNImputer(missing_values="NaN",
                             n_neighbors=2,
                             weights="uniform")

    assert_array_equal(imputer_zero.fit_transform(X_zero), X_imputed)
    assert_array_equal(imputer_zero.statistics_, statistics_mean)
    assert_array_equal(imputer_zero.fit_transform(X_zero),
                       imputer_nan.fit_transform(X_nan))
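
A quick hand check of where the 2.5 in X_imputed comes from (assuming missingpy's masked-Euclidean neighbor selection): with missing_values=0, row 0 is missing column 1, and its two nearest rows are [2, 2, 2, 2, 2] and [3, 3, 3, 3, 3].

import numpy as np

donors = np.array([2.0, 3.0])  # column-1 values of the two nearest rows
print(donors.mean())           # 2.5, the "uniform"-weighted estimate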
Example #2
def test_weight_uniform():
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      6],
        [7,      7],
        [9,      8],
        [11,     10]
    ])

    # Test with "uniform" weight (or unweighted)
    X_imputed_uniform = np.array([
        [0,      0],
        [5,      2],
        [4,      3],
        [5,      6],
        [7,      7],
        [9,      8],
        [11,     10]
    ])

    imputer = KNNImputer(weights="uniform")
    assert_array_equal(imputer.fit_transform(X), X_imputed_uniform)

    # Test with "callable" weight
    def no_weight(dist=None):
        return None

    imputer = KNNImputer(weights=no_weight)
    assert_array_equal(imputer.fit_transform(X), X_imputed_uniform)
Example #3
def impute_values(df: pd.DataFrame, method: str = 'mean', **kwargs):
    """
    Impute missing values in DataFrame (np.nan or None).
    ------------------------
    Args:
        * df: pd.DataFrame of (samples x features)
        * method: string for what method of imputation to use
            ** 'mean': mean imputation
            ** 'knn': K-NN imputation (see missingpy.KNNImputer)
            ** 'rf': random forest imputation (see missingpy.MissForest)

    Returns:
        * pd.DataFrame: imputed values (samples x features)
    """
    assert method in ('mean','knn','rf'), '{} not yet implemented.'.format(method)

    if method=='mean':
        return df.fillna(df.mean(0))
    elif method=='knn':
        X = df.values
        imputer = KNNImputer(**kwargs)
        X_impute = imputer.fit_transform(X)
        return pd.DataFrame(X_impute, index=df.index, columns=df.columns)
    elif method=='rf':
        X = df.values
        imputer = MissForest(**kwargs)
        X_impute = imputer.fit_transform(X)
        return pd.DataFrame(X_impute, index=df.index, columns=df.columns)
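
A minimal usage sketch for impute_values (the toy DataFrame and the n_neighbors value are illustrative; extra kwargs pass straight through to the chosen imputer):

import numpy as np
import pandas as pd

toy = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0],
                    "b": [4.0, 5.0, np.nan, 7.0]})
print(impute_values(toy, method='knn', n_neighbors=2))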
Example #4
def test_knn_n_neighbors():

    X = np.array([
        [0,       0],
        [np.nan,  2],
        [4,       3],
        [5,       np.nan],
        [7,       7],
        [np.nan,  8],
        [14,      13]
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Test with 1 neighbor
    X_imputed_1NN = np.array([
        [0,      0],
        [4,      2],
        [4,      3],
        [5,      3],
        [7,      7],
        [7,      8],
        [14,     13]
    ])

    n_neighbors = 1
    imputer = KNNImputer(n_neighbors=n_neighbors)

    assert_array_equal(imputer.fit_transform(X), X_imputed_1NN)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with 6 neighbors
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      np.nan],
        [7,      7],
        [np.nan, 8],
        [14,      13]
    ])

    X_imputed_6NN = np.array([
        [0,      0],
        [6,      2],
        [4,      3],
        [5,      5.5],
        [7,      7],
        [6,      8],
        [14,     13]
    ])

    n_neighbors = 6
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1)

    assert_array_equal(imputer.fit_transform(X), X_imputed_6NN)
    assert_array_equal(imputer.statistics_, statistics_mean)
    assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit(
        X).transform(X))
Example #5
def test_callable_metric():

    # Define callable metric that returns the l1 norm:
    def custom_callable(x, y, missing_values="NaN", squared=False):
        x = np.ma.array(x, mask=np.isnan(x))
        y = np.ma.array(y, mask=np.isnan(y))
        dist = np.nansum(np.abs(x-y))
        return dist

    X = np.array([
        [4, 3, 3, np.nan],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [np.nan, 9, 11, 10.]
    ])

    X_imputed = np.array([
        [4, 3, 3, 9],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [5, 9, 11, 10.]
    ])

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_array_equal(imputer.fit_transform(X), X_imputed)
Example #6
def test_complete_features():

    # Test imputation using complete features (previously the use_complete=True option)
    X = np.array([
        [0,      np.nan,    0,       np.nan],
        [1,      1,         1,       np.nan],
        [2,      2,         np.nan,  2],
        [3,      3,         3,       3],
        [4,      4,         4,       4],
        [5,      5,         5,       5],
        [6,      6,         6,       6],
        [np.nan, 7,         7,       7]
    ])

    r0c1 = np.mean(X[1:6, 1])
    r0c3 = np.mean(X[2:-1, -1])
    r1c3 = np.mean(X[2:-1, -1])
    r2c2 = np.nanmean(X[:6, 2])
    r7c0 = np.mean(X[2:-1, 0])

    X_imputed = np.array([
        [0,     r0c1,   0,    r0c3],
        [1,     1,      1,    r1c3],
        [2,     2,      r2c2, 2],
        [3,     3,      3,    3],
        [4,     4,      4,    4],
        [5,     5,      5,    5],
        [6,     6,      6,    6],
        [r7c0,  7,      7,    7]
    ])

    imputer_comp = KNNImputer()
    assert_array_almost_equal(imputer_comp.fit_transform(X), X_imputed)
Example #7
def impute_times(final,
                 times_open,
                 times_closed,
                 columns,
                 imputation_method="mean"):
    """
    Impute open work items times with different methods
    :param final: Complete preprocessed dataframe
    :param times_open: Dataframe of work items that are not closed
    :param times_closed: Dataframe of work items that are closed
    :param columns: Columns to impute
    :param imputation_method: Choose between 'mean', 'KNN', 'forest'
    :return: Dataframe of open work items with imputed values
    """
    if imputation_method == "mean":
        for col in columns:
            mean = times_closed[col].mean()
            mask = (times_open[col] == 0)
            times_open[col].mask(mask, mean, inplace=True)
    if imputation_method in ["KNN", "forest"]:
        if imputation_method == "KNN":
            imputer = KNNImputer(missing_values=0, col_max_missing=0.9)
        if imputation_method == "forest":
            imputer = MissForest(missing_values=0)
        for col in columns:
            try:
                val = imputer.fit_transform(pd.DataFrame(final[col]))[:, 0]
                other = pd.DataFrame(index=final.index,
                                     data=val,
                                     columns=[col])
                mask = (times_open[col] == 0)
                times_open.loc[mask, col] = other[col]
            except ValueError:
                # loosen the column threshold; note the refit only takes
                # effect for columns later in the loop
                imputer = KNNImputer(missing_values=0, col_max_missing=0.99)
    return times_open
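
A hedged usage sketch for impute_times (the frames below are illustrative stand-ins; zeros mark missing times, matching missing_values=0 above):

import pandas as pd

final = pd.DataFrame({"cycle_time": [5.0, 0.0, 7.0, 6.0]})
times_closed = final[final["cycle_time"] > 0]
times_open = final[final["cycle_time"] == 0].copy()
print(impute_times(final, times_open, times_closed,
                   columns=["cycle_time"], imputation_method="mean"))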
Example #8
def pre_processing(data_route):

    data_frame = pd.read_csv(data_route)
    # Missing value imputation (KNN)
    real_columns = data_frame.columns

    def handle_column_negative(x):
        # flip negative entries to their absolute value
        return x.map(lambda v: -v if v < 0 else v)

    numericData = data_frame.copy()
    # Preparing the numeric data for imputation
    numericData = numericData.drop(["cluster", "date", "country"], axis=1)
    numericData = numericData.apply(handle_column_negative, axis=1)
    numericData = numericData.replace([np.inf, -np.inf], np.nan)

    # applying KNN imputation
    knn_imputer = KNNImputer()
    knn_result = knn_imputer.fit_transform(numericData)
    data_frame_processed = pd.DataFrame(knn_result)

    # adding removed fields
    data_frame_processed.insert(0, column='date', value=data_frame['date'])
    data_frame_processed.insert(0, column='cluster', value=data_frame['cluster'])
    data_frame_processed.insert(0, column='country', value=data_frame['country'])

    data_frame_processed.columns = real_columns
    return data_frame_processed
Example #9
def imputate_using_knn(dataset, k):
    cols = dataset.columns
    knn_impu = KNNImputer(n_neighbors=k, weights="uniform")
    result = knn_impu.fit_transform(dataset)

    result = pd.DataFrame(result)

    result.columns = cols
    return result
Example #10
def knn_impute(data, n_neighbors=3):

    imputer = KNNImputer(n_neighbors=n_neighbors,
                         missing_values=np.nan,
                         weights='distance')

    imputed_df = pd.DataFrame(imputer.fit_transform(data))

    imputed_df.columns = data.columns

    return imputed_df
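
A quick illustrative call to knn_impute (toy data; assumes KNNImputer is imported from missingpy as in the other examples):

import numpy as np
import pandas as pd

data = pd.DataFrame({"x": [1.0, 2.0, np.nan, 4.0],
                     "y": [1.0, np.nan, 3.0, 4.0]})
print(knn_impute(data, n_neighbors=2))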
Example #11
def outlier_treatment(train_data_frame):
    numericData = train_data_frame.loc[:, "expenses":"volume"]
    cleaned_data = numericData.copy()
    cleaned_data[~(np.abs(stats.zscore(cleaned_data)) < 3).all(
        axis=1)] = np.nan
    imputer = KNNImputer()
    result = imputer.fit_transform(cleaned_data)
    cdp = pd.DataFrame(result)
    cdp.insert(0, column='date', value=train_data_frame['date'])
    cdp.insert(0, column='cluster', value=train_data_frame['cluster'])
    cdp.insert(0, column='country', value=train_data_frame['country'])
    cdp.columns = train_data_frame.columns.copy()
    return cdp
Example #12
def test_knn_imputation_shape():
    # Verify the shapes of the imputed matrix for different weights and
    # number of neighbors.
    n_rows = 10
    n_cols = 2
    X = np.random.rand(n_rows, n_cols)
    X[0, 0] = np.nan

    for weights in ['uniform', 'distance']:
        for n_neighbors in range(1, 6):
            imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
            X_imputed = imputer.fit_transform(X)
            assert_equal(X_imputed.shape, (n_rows, n_cols))
Example #13
    def do_impute(self, matrix_to_impute):
        parameter_set = self.get_parameter_set()
        np.savetxt('test_cur_matrix_missing.csv', matrix_to_impute)
        if self.parameters.impute_mode == parameter_set.constants.v_unsupervised_parameters_impute_mode_randomforest:
            imputed_cur_matrix = np.transpose(
                self.rfimpute.miss_forest_imputation(
                    np.transpose(matrix_to_impute)))
        elif self.parameters.impute_mode == parameter_set.constants.v_unsupervised_parameters_impute_mode_knn:
            imputer = KNNImputer(n_neighbors=2,
                                 row_max_missing=1,
                                 col_max_missing=1)
            imputed_cur_matrix = np.transpose(
                imputer.fit_transform(np.transpose(matrix_to_impute)))

        return imputed_cur_matrix
Example #14
def test_complete_features_weighted():

    # Test imputation using complete features (previously the use_complete=True option)
    X = np.array([
        [0,      0,     0,       np.nan],
        [1,      1,     1,       np.nan],
        [2,      2,     np.nan,  2],
        [3,      3,     3,       3],
        [4,      4,     4,       4],
        [5,      5,     5,       5],
        [6,      6,     6,       6],
        [np.nan, 7,     7,       7]
    ])

    dist = pairwise_distances(X,
                              metric="masked_euclidean",
                              squared=False)

    # Calculate weights
    r0c3_w = 1.0 / dist[0, 2:-1]
    r1c3_w = 1.0 / dist[1, 2:-1]
    r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)]
    r7c0_w = 1.0 / dist[7, 2:7]

    # Calculate weighted averages
    r0c3 = np.average(X[2:-1, -1], weights=r0c3_w)
    r1c3 = np.average(X[2:-1, -1], weights=r1c3_w)
    r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w)
    r7c0 = np.average(X[2:7, 0], weights=r7c0_w)

    X_imputed = np.array([
        [0,     0,  0,    r0c3],
        [1,     1,  1,    r1c3],
        [2,     2,  r2c2, 2],
        [3,     3,  3,    3],
        [4,     4,  4,    4],
        [5,     5,  5,    5],
        [6,     6,  6,    6],
        [r7c0,  7,  7,    7]
    ])

    imputer_comp_wt = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer_comp_wt.fit_transform(X), X_imputed)
Example #15
def impute_missing_for_dataframe(dataframe, target='job_performance'):
    """ The imputer function should be used on a dataframe that has already been numerically encoded """
    from missingpy import KNNImputer #, MissForest
    
    X = dataframe.loc[:, dataframe.columns != target].values
    y = dataframe[target].values

    # imputer object
    knn = KNNImputer(n_neighbors=5, 
                    weights="uniform",
                    metric="masked_euclidean",
                    row_max_missing=0.8,
                    col_max_missing=0.8, 
                    copy=True)
    knn_missing_imputation = knn.fit_transform(X)
    imputed_dataframe = pd.DataFrame(knn_missing_imputation, 
                                     columns = dataframe.columns[dataframe.columns != target])
    imputed_dataframe[target] = pd.Series(y)
    return imputed_dataframe
Example #16
def Impute_Data_KNN(X_train, y_train, X_test, y_test, vals_mask, cols, data,
                    var, min_vals, max_vals):

    XY_incomplete_train = np.concatenate((X_train, y_train.reshape(-1, 1)),
                                         axis=1)
    XY_incomplete_test = np.concatenate((X_test, y_test.reshape(-1, 1)),
                                        axis=1)

    imputer = KNNImputer(n_neighbors=5)
    XY_completed_train = imputer.fit_transform(XY_incomplete_train)
    XY_completed_test = imputer.transform(XY_incomplete_test)

    X_train_imp = (XY_completed_train[:, 0:data.shape[1]])
    y_train_imp_orig = np.array(XY_completed_train[:, data.shape[1]],
                                dtype="int16")
    y_train_imp = np.array(XY_completed_train[:, data.shape[1]] >= 5,
                           dtype="int16")
    X_test_imp = (XY_completed_test[:, 0:data.shape[1]])
    y_test_imp = np.array(XY_completed_test[:, data.shape[1]] >= 5,
                          dtype="int16")
    y_test_imp_orig = np.array(XY_completed_test[:, data.shape[1]],
                               dtype="int16")

    for j in range(0, X_train_imp.shape[1]):
        if var.iloc[j]['type'] == 'cat':
            X_train_imp[:, j] = np.clip(np.round(X_train_imp[:, j]),
                                        min_vals[j], max_vals[j])
            X_test_imp[:, j] = np.clip(np.round(X_test_imp[:, j]), min_vals[j],
                                       max_vals[j])
        else:
            X_train_imp[:, j] = np.round(X_train_imp[:, j], decimals=1)
            X_test_imp[:, j] = np.round(X_test_imp[:, j], decimals=1)

    #min_vals_imp=np.nanmin(np.concatenate((X_train_imp,X_test_imp),axis=0),axis=0)
    #max_vals_imp=np.nanmax(np.concatenate((X_train_imp,X_test_imp),axis=0),axis=0)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp, y_train_imp_orig,
            y_test_imp_orig)
Example #17
    def create_2d_velocity_field(self,
                                 radii,
                                 v_rot,
                                 n_interp_r=150,
                                 n_interp_theta=150):
        '''
        uses tilted ring model parameters to calculate velocity field
        using eqn 1-3 of 1709.02049 and v_rot from mass model

        it is easier to loop through polar coordinates and then map the v_los to the
        nearest x,y point

        returns 2d velocity field array
        '''
        v_field = np.empty(shape=(self.image_ydim, self.image_xdim))
        v_field[:] = np.nan
        v_rot_interp = interp1d(radii, v_rot)
        radii_interp = np.linspace(np.min(radii), np.max(radii), n_interp_r)
        for r in radii_interp:
            v = v_rot_interp(r)
            for theta in np.linspace(0, 2. * np.pi, n_interp_theta):
                x, y, v_los = self._calc_v_los_at_r_theta(v, r, theta)
                if (self.image_xdim - 1 > x > 0 and y < self.image_ydim - 1
                        and y > 0):
                    arr_x, arr_y = int(np.round(x, 0)), int(np.round(y, 0))
                    try:
                        v_field[arr_y][arr_x] = v_los
                    except IndexError:
                        # point fell outside the image bounds; log and skip
                        print(arr_x, arr_y, v_los)
        near_neighbors_mask = create_blurred_mask(v_field)
        imputer = KNNImputer(n_neighbors=3, weights="distance")
        v_field = imputer.fit_transform(
            np.where(near_neighbors_mask == 1, v_field, 0.))
        v_field[v_field == 0] = np.nan

        # rotate to match the fits data field
        v_field = np.rot90(v_field, 3)
        return v_field
Example #18
def clean_dragon(save=False):
    source = os.path.join(DATA_DIR, "cids-smiles-dragon.txt")
    df = pd.read_csv(source).set_index("CID")
    df = df.iloc[:, 1:]  # Drop SMILES column

    # Scale to mean 0, variance 1
    ss = StandardScaler()
    good = df.columns[df.isnull().sum() < 500]
    df = df[good]
    scaled = ss.fit_transform(df.astype("float"))
    df = pd.DataFrame(scaled, index=df.index, columns=df.columns)

    # Impute missing values
    knn = KNNImputer(n_neighbors=5)
    imputed = knn.fit_transform(df.values)
    df = pd.DataFrame(imputed, index=df.index, columns=df.columns)

    # Optionally save to disk
    if save:
        dest = os.path.join(DATA_DIR, "cids-smiles-dragon-scaled-imputed.txt")
        df.to_csv(dest)

    return df
Example #19
        Mask[index1, index2] = 0
    Missing = Image.fromarray(rgbArray)
    plt.imshow(Missing)
    plt.show()
    return out


SelectedImage = showImagesRandomImages(
    3)  # select an image randomly from the MNIST dataset
missingPercentage = 0.2  # missing rate percentage
missingImage = generateMissingFig(
    SelectedImage,
    missingPercentage)  # insert missing values into the original image

imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_by_KNN = imputer.fit_transform(missingImage)
KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN) ** 0.5  # RMSE
#plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1)
#plt.show()

imputer = MissForest()
MissForest_imputed = imputer.fit_transform(missingImage)
MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed) ** 0.5  # RMSE
#plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()

imputer = IterativeImputer()
MICE_imputed = imputer.fit_transform(missingImage)
MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed) ** 0.5  # RMSE
#plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1)
#plt.show()
Example #20
)

# extract the target column (the last column) from the dataset
dt = data[data.columns[len(data.columns) - 1]]
target = dt

# the data without target values
data = data[data.columns[:64]]

# standard scaler (z-score), applied before feature selection
scaler = StandardScaler()
scaled_df = scaler.fit_transform(data)

# for knn imputation
imputer = KNNImputer(n_neighbors=5, weights="distance")
imputed_data = imputer.fit_transform(scaled_df)

# used stratified k fold cross validation
skf = StratifiedKFold(n_splits=k_fold)

# SVM
print("SVM")

svm_acc = 0
svm_spe = 0
svm_sen = 0

# k fold cross validation loop
for train, test in skf.split(imputed_data, target):

    # divide the dataset into training and testing sets
Example #21
    def predict_age(self, exprdata, genelength=None, chronage=None):
        """Calculate RNA age.

        This function calculates RNA age based on pre-trained predictors.

        :param exprdata: a pandas DataFrame which contains gene expression data
        with each row represents a gene and each column represents a sample.
        Use the argument "exprtype" to specify raw count or FPKM. The index of
        "exprdata" should be gene ids and columns names of "exprdata" should
        be sample ids.

        :param genelength: a pandas Series, DataFrame, numpy array, or list
        which contains gene length in bp. The size of genelength should be
        equal to the number of rows in exprdata. This argument is optional.
        If using exprtype="FPKM", genelength argument is ignored. If using
        exprtype="count", the raw count will be converted to FPKM. If
        genelength is provided, the function will convert raw count to FPKM
        according to the user-supplied gene length. Otherwise, gene length
        is obtained from the internal database.

        :param chronage: a pandas DataFrame which contains the chronological
        age of each sample. This argument is optional. If provided, it should
        be a DataFrame with 1st column sample id and 2nd column chronological
        age. The sample order in chronage doesn't have to be in the same order
        as in exprdata. However, the samples in chronage and exprdata should
        be the same. If some samples' chronological age are not available,
        users are expected to set the chronological age in chronage to NaN.
        If chronage contains more than 2 columns, only the first 2 columns
        will be considered. If this argument is not provided, the age
        acceleration residual will not be calculated. See package tutorial
        for the definition of age acceleration residual.

        :return: a pandas DataFrame contains RNA age.

        """

        # check input:
        assert isinstance(exprdata, pd.DataFrame), \
            "exprdata should be a pandas DataFrame."
        assert exprdata.applymap(np.isreal).all().all(),\
            "Only numeric values are allowed in the exprdata DataFrame."
        assert list(exprdata.index) != list(range(exprdata.shape[0])), \
            "The index of exprdata should be gene ids."
        assert list(exprdata.columns) != list(range(exprdata.shape[1])), \
            "The column names of exprdata should be sample ids."
        assert ~np.any(exprdata.index.duplicated()), \
            "Duplicated gene names found in exprdata."
        assert (exprdata >= 0).all().all(), \
            "Gene expression data cannot contain negative value(s)."

        if chronage is not None:
            assert isinstance(chronage, pd.DataFrame), \
                "chronage should be a pandas DataFrame."
            if (chronage.shape[1] > 2):
                print("More than 2 columns are provided in chronage. "
                      "Only the first 2 columns will be used.")
            # assert ~chronage.applymap(np.isreal).all()[0], \
            #    "The 1st column in chronage should be sample ids."
            assert chronage.applymap(np.isreal).all()[1], \
                "The 2nd column in chronage should be chronological age."
            assert not any(chronage.iloc[:, 0].duplicated()), \
                "chronage contains duplicated sample ids."
            assert set(chronage.iloc[:, 0].astype(str)) == \
                set(exprdata.columns), \
                "Samples in chronage and exprdata should be the same."
            assert ~np.any(chronage.iloc[:, 1] < 0), \
                "Chronological age contains negative value(s)."

        if self._exprtype == "count":
            exprdata = self._count2FPKM(exprdata, genelength)

        if self.idtype != "symbol":
            mg = mygene.MyGeneInfo()
            genes = list(exprdata.index)
            temp = mg.querymany(genes,
                                scopes=self.idtype,
                                fields='symbol',
                                species='human',
                                returnall=True,
                                as_dataframe=True)["out"]
            temp = temp.loc[~temp["symbol"].isna(), "symbol"]
            temp = temp[~temp.index.duplicated(keep="first")]
            temp = temp.drop_duplicates(keep=False)
            genesymbol = temp[exprdata.index]
            genesymbol[genesymbol.isna()] = "unknown"
            exprdata.index = genesymbol

        location = os.path.dirname(os.path.realpath(__file__))
        if self.stype == "all":
            tempPath = os.path.join(
                location, "internal_data", "all",
                "coef_{}_{}.csv".format(self._tissue, self._signature))
        else:
            tempPath = os.path.join(
                location, "internal_data", "Caucasian",
                "coef_{}_{}.csv".format(self._tissue, self._signature))

        sig_internal = pd.read_csv(tempPath, index_col=0)
        genes_required = sig_internal.index[1:]
        sig_in_expr = genes_required.isin(exprdata.index)
        # full NA row
        if np.sum(~sig_in_expr) != 0:
            print("{:.2f}% genes in the gene signature are not included in "
                  "the supplied gene expression.".format(
                      np.sum(~sig_in_expr) / len(genes_required) * 100))

            # impute the gene expression in the log scale
            tempmat = pd.DataFrame(columns=exprdata.columns,
                                   index=genes_required[~sig_in_expr])

            exprdata_withNA = pd.concat([exprdata, tempmat], axis=0)
            exprdata_log = np.log2(exprdata_withNA.apply(pd.to_numeric) + 1)
            ind1 = exprdata_log.isna().all(axis=1)
            ind2 = ~exprdata_log.isna().any(axis=1)
            exprdata_log.loc[(ind1 | ind2), :] = \
                exprdata_log.loc[(ind1 | ind2), :].fillna(exprdata_log.mean())
        else:
            exprdata_log = np.log2(exprdata.apply(pd.to_numeric) + 1)

        # check partial NA
        if ~exprdata_log.notnull().all().all():
            # impute the gene expression in the log scale
            imputer = KNNImputer(n_neighbors=min(10, exprdata_log.shape[1]),
                                 row_max_missing=1,
                                 col_max_missing=1)
            X_imputed = imputer.fit_transform(exprdata_log.transpose())
            exprdata_log_impute = pd.DataFrame(X_imputed).transpose()
            exprdata_log_impute.index = exprdata_log.index
            exprdata_sub = exprdata_log_impute.loc[genes_required, :]
        else:
            exprdata_sub = exprdata_log.loc[genes_required, :]

        RNAAge = exprdata_sub.apply(lambda x: np.sum(
            x.multiply(sig_internal.iloc[1:, 0])) + sig_internal.iloc[0, 0])
        res = pd.DataFrame(index=exprdata.columns)
        res["RNAAge"] = list(RNAAge)

        if chronage is not None:
            chronage.index = chronage.iloc[:, 0]
            res["ChronAge"] = chronage.loc[res.index].iloc[:, 1]
            # if sample size is too small, age acceleration residual
            # cannot be calculated
            if res.dropna().shape[0] > 30:
                Y = res["RNAAge"]
                X = res["ChronAge"]
                X = sm.add_constant(X)
                model = sm.OLS(Y, X).fit()
                res["AgeAccelResid"] = model.resid

        return res
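
A hypothetical call to predict_age (the class that owns this method is not shown in this excerpt; predictor stands in for a configured instance, and the gene ids and FPKM values are made up):

import pandas as pd

exprdata = pd.DataFrame({"sample1": [10.0, 3.2], "sample2": [8.5, 4.1]},
                        index=["GAPDH", "TP53"])
res = predictor.predict_age(exprdata)  # returns a DataFrame with an "RNAAge" column
print(res)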
Example #22

# In[6]:
# Dummy variables via one-hot encoding - Sex and Embarked
import category_encoders as ce
ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)

train = ohe.fit_transform(train)
train = train.drop(['Sex_female'], axis=1) 

# In[7]:
# Impute with KNN - Age
from missingpy import KNNImputer
imputer = KNNImputer()
X_imputed = imputer.fit_transform(train)

train_knn = pd.DataFrame(X_imputed)
train_knn.columns = train.columns


# In[8]:


age_floor = list(map(lambda x: math.floor(x), train_knn.Age))
train_knn['Age'] = age_floor


# ## model 1: Random Forest using train_knn (imputation of Age with knn)

# In[9]:
Example #23
def test_knn_imputation_default():
    # Test imputation with default parameter values

    # Test with an imputable matrix
    X = np.array([
        [1,      0,      0,      1],
        [2,      1,      2,      np.nan],
        [3,      2,      3,      np.nan],
        [np.nan, 4,      5,      5],
        [6,      np.nan, 6,      7],
        [8,      8,      8,      8],
        [16,     15,     18,    19],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    X_imputed = np.array([
        [1,      0,      0,      1],
        [2,      1,      2,      8],
        [3,      2,      3,      8],
        [4,      4,      5,      5],
        [6,      3,      6,      7],
        [8,      8,      8,      8],
        [16,     15,     18,    19],
    ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit_transform(X), X_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with % missing in row > row_max_missing
    X = np.array([
        [1,      0,      0,      1],
        [2,      1,      2,      np.nan],
        [3,      2,      3,      np.nan],
        [np.nan, 4,      5,      5],
        [6,      np.nan, 6,      7],
        [8,      8,      8,      8],
        [19,     19,     19,     19],
        [np.nan, np.nan, np.nan, 19],
    ])
    statistics_mean = np.nanmean(X, axis=0)
    r7c0, r7c1, r7c2, _ = statistics_mean

    X_imputed = np.array([
        [1,      0,      0,      1],
        [2,      1,      2,      8],
        [3,      2,      3,      8],
        [4,      4,      5,      5],
        [6,      3,      6,      7],
        [8,      8,      8,      8],
        [19,     19,     19,     19],
        [r7c0,   r7c1,   r7c2,   19],
    ])

    imputer = KNNImputer()
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6)
    assert_array_almost_equal(imputer.statistics_, statistics_mean, decimal=6)

    # Test with all neighboring donors also having missing feature values
    X = np.array([
        [1, 0, 0, np.nan],
        [2, 1, 2, np.nan],
        [3, 2, 3, np.nan],
        [4, 4, 5, np.nan],
        [6, 7, 6, np.nan],
        [8, 8, 8, np.nan],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])
    statistics_mean = np.nanmean(X, axis=0)

    X_imputed = np.array([
        [1, 0, 0, 21],
        [2, 1, 2, 21],
        [3, 2, 3, 21],
        [4, 4, 5, 21],
        [6, 7, 6, 21],
        [8, 8, 8, 21],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit_transform(X), X_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test when data in fit() and transform() are different
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      6],
        [7,      7],
        [9,      8],
        [11,     16]
    ])
    statistics_mean = np.nanmean(X, axis=0)

    Y = np.array([
        [1,      0],
        [3,      2],
        [4,      np.nan]
        ])

    Y_imputed = np.array([
        [1,      0],
        [3,      2],
        [4,      4.8]
        ])

    imputer = KNNImputer()
    assert_array_equal(imputer.fit(X).transform(Y), Y_imputed)
    assert_array_equal(imputer.statistics_, statistics_mean)
Example #24
    abs(reDataBool.loc[idx[0], idx[1]] ^ dataBool.loc[idx[0], idx[1]])
    for idx in randIdxBool
])

#------------------------------------------------------------------------------
#-------------------------Nearest Neighbor Imputation--------------------------
#------------------------------------------------------------------------------

n_neighbors = 5
nan = np.nan

#numerical data
imputerNum = KNNImputer(missing_values=nan,
                        n_neighbors=n_neighbors,
                        weights="distance")
impDataNum = imputerNum.fit_transform(dataNumVal)
impDataNum = pd.DataFrame(impDataNum, columns=dataNumVal.columns)

# residual sum of squares (RSS) for the imputed missing values:
rssImpNum = sum([
    (impDataNum.loc[idx[0], idx[1]] - dataNumNorm.loc[idx[0], idx[1]])**2
    for idx in randIdxNum
])

# scale back to normal
impDataMax = impDataNum.max(axis=0)
impDataMin = impDataNum.min(axis=0)
impScDataNum = (impDataNum - impDataMin) * (dataNumMax - dataNumMin) / (
    impDataMax - impDataMin) + dataNumMin

# insert imputed data in missing values
Example #25
def test_weight_distance():
    X = np.array([
        [0,      0],
        [np.nan, 2],
        [4,      3],
        [5,      6],
        [7,      7],
        [9,      8],
        [11,    10]
    ])

    # Test with "distance" weight

    # Get distance of "n_neighbors" neighbors of row 1
    dist_matrix = pairwise_distances(X, metric="masked_euclidean")

    index = np.argsort(dist_matrix)[1, 1:6]
    dist = dist_matrix[1, index]
    weights = 1 / dist
    values = X[index, 0]
    imputed = np.dot(values, weights) / np.sum(weights)

    # Manual calculation
    X_imputed_distance1 = np.array([
        [0,                 0],
        [3.850394,          2],
        [4,                 3],
        [5,                 6],
        [7,                 7],
        [9,                 8],
        [11,                10]
    ])

    # NearestNeighbor calculation
    X_imputed_distance2 = np.array([
        [0,                 0],
        [imputed,           2],
        [4,                 3],
        [5,                 6],
        [7,                 7],
        [9,                 8],
        [11,                10]
    ])

    imputer = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance1,
                              decimal=6)
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2,
                              decimal=6)

    # Test with weights = "distance" and n_neighbors=2
    X = np.array([
        [np.nan, 0,      0],
        [2,      1,      2],
        [3,      2,      3],
        [4,      5,      5],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    X_imputed = np.array([
        [2.3828, 0,     0],
        [2,      1,     2],
        [3,      2,     3],
        [4,      5,     5],
    ])

    imputer = KNNImputer(n_neighbors=2, weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed,
                              decimal=4)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with varying missingness patterns
    X = np.array([
        [1,         0,          0,  1],
        [0,         np.nan,     1,  np.nan],
        [1,         1,          1,  np.nan],
        [0,         1,          0,  0],
        [0,         0,          0,  0],
        [1,         0,          1,  1],
        [10,        10,         10, 10],
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Get weights of donor neighbors
    dist = masked_euclidean_distances(X)
    r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]]
    r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]]
    r1c1_nbor_wt = 1 / r1c1_nbor_dists
    r1c3_nbor_wt = 1 / r1c3_nbor_dists

    r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]]
    r2c3_nbor_wt = 1 / r2c3_nbor_dists

    # Collect donor values
    col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy()

    # Final imputed values
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    print(r1c1_imp, r1c3_imp, r2c3_imp)
    X_imputed = np.array([
        [1,         0,          0,  1],
        [0,         r1c1_imp,   1,  r1c3_imp],
        [1,         1,          1,  r2c3_imp],
        [0,         1,          0,  0],
        [0,         0,          0,  0],
        [1,         0,          1,  1],
        [10,        10,         10, 10],
    ])

    imputer = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6)
    assert_array_equal(imputer.statistics_, statistics_mean)
Example #26
from missingpy import KNNImputer

import pandas as pd

df = pd.read_csv("원본_nan.csv")
# print(df)
# imputer = KNNImputer(n_neighbors=2,weights="uniform") ##최고성능
imputer = KNNImputer(n_neighbors=2,
                     weights="uniform",
                     col_max_missing=0.9,
                     row_max_missing=0.9)
X_imputed = imputer.fit_transform(df)
print(X_imputed)

# df2=pd.DataFrame(X_imputed, columns=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome'])
# print(df2['Insulin'])
# df2.to_csv('test_imputed.csv', index=False, encoding='cp949', mode='w')

#https://github.com/epsilon-machine/missingpy
Example #27
#%%
print(df['Embarked'].value_counts())
print(df['Sex'].value_counts())
#%%
# dropping nulls from Embarked alone
df.dropna(subset=['Embarked'], axis=0, inplace=True)

#%%
print(df.shape)
#%%
from missingpy import KNNImputer

imputer = KNNImputer(n_neighbors=2, weights="uniform")

X_imputed = imputer.fit_transform(df)
df['Age_imputed'] = X_imputed[:, 3].round()
print(df[df['Age'].isnull()])
#%%
df.drop(['Age'], axis=1, inplace=True)
sns.boxplot(df['Age_imputed'])
plt.show()
#%%
print(df['Age_imputed'].describe())


#%%
#age_imputed to bins
def func3(x):
    if x < 21:
        return 0
Example #28
def imputeKNN(data, **kwargs):
    imputer = KNNImputer(**kwargs)
    imputedData = imputer.fit_transform(data)
    imputedData = pd.DataFrame(imputedData, index=data.index, columns=data.columns)
    return imputedData
Example #29
def imputeMatrix(dataM):
	imputer = KNNImputer(n_neighbors=10)
	dataT = imputer.fit_transform(dataM)
	return dataT
Example #30
df[basement_details] = df[basement_details].fillna('NoBsmt')

print(
    'As the number of missing data points for the following variables is low, we will just drop the observations that have missing data for these variables.'
)
df = df.dropna(how='any', subset=['MasVnrType', 'MasVnrArea', 'Electrical'])

print(
    'For the variable LotFrontage, we will be using K-Nearest Neighbours to impute the missing data.'
)
from missingpy import KNNImputer
imputer = KNNImputer(n_neighbors=5,
                     weights='distance',
                     metric='masked_euclidean')

df.LotFrontage = imputer.fit_transform(
    np.array(df.drop('FireplaceQu', axis=1).LotFrontage).reshape(-1, 1))

df.FireplaceQu = df.FireplaceQu.fillna('NoFireplc')

print('Checking for any more columns with missing data ...')
missing_pct(df)

### Checking Data Types ###
print('Changing numeric data to categorical ...')
df = df.replace({
    'MSSubClass': {
        20: "SC20",
        30: "SC30",
        40: "SC40",
        45: "SC45",
        50: "SC50",