Example #1
def test_iterative_imputer_truncated_normal_posterior():
    #  test that the values that are imputed using `sample_posterior=True`
    #  with boundaries (`min_value` and `max_value` are not None) are drawn
    #  from a distribution that looks Gaussian via the Kolmogorov-Smirnov test.
    #  note that starting from the wrong random seed will make this test fail
    #  because random sampling doesn't occur at all when the imputation
    #  is outside of the (min_value, max_value) range
    pytest.importorskip("scipy", minversion="0.17.0")
    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)

    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    # guard against a zero standard deviation before standardizing
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject the null hypothesis
    # null hypothesis: distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does not appear to be normal"
Example #2
def test_iterative_imputer_early_stopping():
    rng = np.random.RandomState(0)
    n = 50
    d = 5
    A = rng.rand(n, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=100,
                               tol=1e-3,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
Example #3
def test_iterative_imputer_imputation_order(imputation_order):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    max_iter = 2
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by IterativeImputer

    imputer = IterativeImputer(missing_values=0,
                               max_iter=max_iter,
                               n_nearest_features=5,
                               sample_posterior=False,
                               min_value=0,
                               max_value=1,
                               verbose=1,
                               imputation_order=imputation_order,
                               random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]

    assert (len(ordered_idx) // imputer.n_iter_ ==
            imputer.n_features_with_missing_)

    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d-1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d-1]
        ordered_idx_round_2 = ordered_idx[d-1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == max_iter * (d - 1)
Example #4
def test_iterative_imputer_all_missing():
    n = 100
    d = 3
    X = np.zeros((n, d))
    imputer = IterativeImputer(missing_values=0, max_iter=1)
    X_imputed = imputer.fit_transform(X)
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))
Example #5
def test_iterative_imputer_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan
    m1 = IterativeImputer(max_iter=10, random_state=rng)
    m2 = IterativeImputer(max_iter=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # should exclude the first column entirely
    assert_allclose(X[:, 1:], pred1)
    # fit(...).transform(...) and fit_transform(...) should give identical results
    assert_allclose(pred1, pred2)
Example #6
def test_iterative_imputer_estimators(estimator):
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               estimator=estimator,
                               random_state=rng)
    imputer.fit_transform(X)

    # check that types are correct for estimators
    hashes = []
    for triplet in imputer.imputation_sequence_:
        expected_type = (type(estimator) if estimator is not None
                         else type(BayesianRidge()))
        assert isinstance(triplet.estimator, expected_type)
        hashes.append(id(triplet.estimator))

    # check that each estimator is unique
    assert len(set(hashes)) == len(hashes)
Example #7
def test_iterative_imputer_rank_one():
    rng = np.random.RandomState(0)
    d = 100
    A = rng.rand(d, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(d, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=5,
                               verbose=1,
                               random_state=rng)
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.01)
Example #8
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', "constant"]:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        iterative_imputer = IterativeImputer(initial_strategy=strategy)
        X_imputed = iterative_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
Example #9
def test_iterative_imputer_clip():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = _sparse_random_matrix(n, d, density=0.10,
                              random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=0.1,
                               max_value=0.2,
                               random_state=rng)

    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Example #10
def test_iterative_imputer_clip():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10,
                             random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=0.1,
                               max_value=0.2,
                               random_state=rng)

    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Example #11
def exp_mi(xmiss, w, y, regularize, m=10, nuisance=False):

    res_tau_dr = []
    res_tau_ols = []
    res_tau_ols_ps = []
    res_tau_resid = []
    res_ps = np.empty([len(w), 1])
    res_y0 = np.empty([len(y), 1])
    res_y1 = np.empty([len(y), 1])
    for i in range(m):
        imp = IterativeImputer(sample_posterior=True, random_state=i)
        x_imp_mice = imp.fit_transform(xmiss)
        if nuisance:
            tau_tmp, nu_tmp = compute_estimates(x_imp_mice, w, y, regularize,
                                                nuisance)
            res_ps = np.concatenate(
                (res_ps, nu_tmp['ps_hat'].reshape([len(w), 1])), axis=1)
            res_y0 = np.concatenate(
                (res_y0, nu_tmp['y0_hat'].reshape([len(y), 1])), axis=1)
            res_y1 = np.concatenate(
                (res_y1, nu_tmp['y1_hat'].reshape([len(y), 1])), axis=1)
        else:
            tau_tmp = compute_estimates(x_imp_mice, w, y, regularize)
        res_tau_dr.append(tau_tmp['tau_dr'])
        res_tau_ols.append(tau_tmp['tau_ols'])
        res_tau_ols_ps.append(tau_tmp['tau_ols_ps'])
        res_tau_resid.append(tau_tmp['tau_resid'])

    if nuisance:
        return {
            'tau_dr': np.mean(res_tau_dr),
            'tau_ols': np.mean(res_tau_ols),
            'tau_ols_ps': np.mean(res_tau_ols_ps),
            'tau_resid': np.mean(res_tau_resid),
        }, {
            'ps_hat': np.mean(res_ps[:, 1:], axis=1),
            'y0_hat': np.mean(res_y0[:, 1:], axis=1),
            'y1_hat': np.mean(res_y1[:, 1:], axis=1),
        }
    return {
        'tau_dr': np.mean(res_tau_dr),
        'tau_ols': np.mean(res_tau_ols),
        'tau_ols_ps': np.mean(res_tau_ols_ps),
        'tau_resid': np.mean(res_tau_resid),
    }
Example #12
def RF_imputation(df, fast=True):
    '''Returns the dataframe where missing values are imputed using Random Forest Imputation (sklearn)
    When fast=True, ExtraTreesRegressor is used for increased speed.

    Parameters:
    -----------
    df: pd.DataFrame
    fast: boolean, if set to True, ExtraTreesRegressor is used in preference to RandomForestRegressor

    Returns:
    --------
    df_result: pd.DataFrame where the missing values are imputed using Random Forest (MissForest)
    '''

    df_new = df.copy()

    df_new = make_missing_np_nan(df_new)

    missing, unique = imputation_heuristic_column(df, 0.99)
    df_new = delete_cols(df_new, missing)
    df_new = delete_cols(df_new, unique)

    #categorical and datetime columns cannot be imputed, so are removed from the imputation dataframe
    cat_cols, date_cols, num_cols = type_cols(df_new)
    df_new = df_new[num_cols]

    columns = df_new.columns

    if fast:
        imputer = IterativeImputer(random_state=0,
                                   estimator=ExtraTreesRegressor(
                                       n_estimators=10, random_state=0))
    else:
        imputer = IterativeImputer(random_state=0,
                                   estimator=RandomForestRegressor(
                                       n_estimators=10, random_state=0))

    imputed = imputer.fit_transform(df_new)
    df_imputed = pd.DataFrame(imputed, columns=columns)

    #categorical and datetime columns are added back
    not_imputed_cols = cat_cols + date_cols
    df_result = pd.concat([df_imputed, df[not_imputed_cols]], axis=1)

    return df_result
Example #13
def embedding(file_path):
    kbs_mini = pd.read_csv(file_path, encoding='utf-8')
    del kbs_mini['end_date']
    del kbs_mini['pd']
    del kbs_mini['writer']
    del kbs_mini['actor1']
    del kbs_mini['actor2']
    del kbs_mini['actor3']
    del kbs_mini['actor4']
    del kbs_mini['actor5']
    del kbs_mini['avg_rate']
    del kbs_mini['rate_25']
    del kbs_mini['prev']

    start_timestamp = pd.to_datetime(kbs_mini['start_date'],
                                     format='%Y-%m-%d').astype(int) / 10**11
    kbs_mini['start_date'] = start_timestamp

    time_to_datetime = pd.to_datetime(kbs_mini['time'], format='%H:%M:%S')
    kbs_mini['time'] = time_to_datetime.dt.hour + (time_to_datetime.dt.minute /
                                                   60)

    day_one_enc = pd.get_dummies(kbs_mini, columns=['day'])
    kbs_mini = day_one_enc

    # # print(kbs_mini['kbs'])
    kbs_mini = pd.get_dummies(kbs_mini, columns=['kbs'])

    del kbs_mini['title']

    # column_names = kbs_mini.columns.values.tolist()
    imp_mean = IterativeImputer(missing_values=np.nan,
                                skip_complete=True,
                                random_state=0)
    imputed_prev_25 = imp_mean.fit_transform(kbs_mini.to_numpy())[:, 4]
    kbs_mini['prev_25_imputed'] = imputed_prev_25
    # kbs_mini = pd.DataFrame(imp_mean.fit_transform(kbs_mini.to_numpy()), columns=column_names)
    del kbs_mini['prev_25']

    # print(kbs_mini.loc[:,['prev_25','prev_25_imputed']])
    return kbs_mini
    # print(kbs_mini)


# embedding('../kbs_mini.csv')
Example #14
def get_results_single_imputation(X_train, X_test, y_train, y_test):
    # Apply imputation
    imputer = IterativeImputer(max_iter=100, sample_posterior=True,
                               random_state=0)
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Standardize data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    # Perform estimation and prediction
    estimator = LinearRegression()
    estimator.fit(X_train_scaled, y_train)
    y_predict = estimator.predict(X_test_scaled)
    mse_single = mse(y_test, y_predict)

    return mse_single
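An illustrative call with toy data (the arrays below are hypothetical; it assumes numpy plus the sklearn imports and the mse alias used by the function above are in scope):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(80, 4)
y = X @ np.array([1.0, -2.0, 0.5, 3.0]) + rng.normal(scale=0.1, size=80)
X[rng.rand(80, 4) < 0.1] = np.nan  # sprinkle missing values into the features
X_train, X_test = X[:60], X[60:]
y_train, y_test = y[:60], y[60:]
print(get_results_single_imputation(X_train, X_test, y_train, y_test))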
Example #15
def missingValues(data, opt='check', **kwargs):
    n, m = data.shape
    # Check for missing values
    data = data.replace([np.inf, -np.inf], np.nan)
    if opt == 'check':
        missing = data.isna().sum().sum()
        print("Missing values: ", round(missing * 100 / (n * m), 2), "%  (",
              missing, "/", n * m, ")")
    else:
        # Choose method
        if opt == 'mean':
            out = data.fillna(data.mean())
        elif opt == 'median':
            out = data.fillna(data.median())
        else:
            # Iterative imputers
            from sklearn.experimental import enable_iterative_imputer
            from sklearn.impute import IterativeImputer
            if opt == 'bayesian':
                from sklearn.linear_model import BayesianRidge
                estim = BayesianRidge(n_iter=100, **kwargs)
            elif opt == 'extra':
                from sklearn.ensemble import ExtraTreesRegressor
                estim = ExtraTreesRegressor(n_estimators=50,
                                            max_features=0.5,
                                            min_impurity_decrease=1e-3,
                                            min_samples_split=5,
                                            min_samples_leaf=2,
                                            n_jobs=-1,
                                            **kwargs)
            elif opt == 'knn':
                from sklearn.neighbors import KNeighborsRegressor
                estim = KNeighborsRegressor(n_jobs=-1, **kwargs)
            imp = IterativeImputer(estimator=estim,
                                   max_iter=5,
                                   n_nearest_features=100,
                                   verbose=2,
                                   random_state=0)
            out = pd.DataFrame(imp.fit_transform(data),
                               columns=data.columns,
                               index=data.index)
        print("Missing values imputed using", opt, "method!")
        return out
Example #16
def test_iterative_imputer_clip_truncnorm():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1

    imputer = IterativeImputer(missing_values=0,
                               max_iter=2,
                               n_nearest_features=5,
                               sample_posterior=True,
                               min_value=0.1,
                               max_value=0.2,
                               verbose=1,
                               imputation_order='random',
                               random_state=rng)
    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Example #17
def test_iterative_imputer_clip_truncnorm():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1

    imputer = IterativeImputer(missing_values=0,
                               max_iter=2,
                               n_nearest_features=5,
                               sample_posterior=True,
                               min_value=0.1,
                               max_value=0.2,
                               verbose=1,
                               imputation_order='random',
                               random_state=rng)
    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Example #18
def iterative_imputer(pd_data, random_state=None):
    """
    Impute missing values using the multivariate imputer
    that estimates each feature from all the others.

    Inputs:
        pd_data: (DataFrame) Data containing missing values.
        random_state: (int, optional) Seed of the pseudo
            random number generator to use.

    Returns:
        pd_imputed: (DataFrame) Data with missing values imputed.
    """
    imputer = IterativeImputer(random_state=random_state)

    pd_imputed = pd.DataFrame(imputer.fit_transform(pd_data),
                              index=pd_data.index,
                              columns=pd_data.columns)

    return pd_imputed
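A minimal usage sketch for the wrapper above (the toy DataFrame is hypothetical; it assumes pandas, numpy and the IterativeImputer import used by the function are available):

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, np.nan, 4.0],
                   'b': [10.0, 20.0, 30.0, 40.0]})
df_imputed = iterative_imputer(df, random_state=0)
print(df_imputed)  # the NaN in column 'a' is replaced by a model-based estimate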
Example #19
def impute_data(data, weekly=False):
    dat = data.copy()
    if weekly:
        dat = dat.groupby(['fips', pd.Grouper(key='date',
                                              freq='W')]).aggregate('mean')
    dat = dat.loc[:, ~(
        dat.columns.str.startswith('smoothed_mean')
        | dat.columns.str.startswith('mean_') | dat.columns.
        isin(['Unnamed: 0', 'n', 'pct_avoid_contact_all_or_most_time']))]

    keep_columns = (
        dat[pandas_select.StartsWith('pct_') |
            pandas_select.StartsWith('Total households!!') |
            pandas_select.StartsWith('RELATIONSHIP!!') |
            pandas_select.StartsWith('SCHOOL ENROLLMENT')].columns.to_list() +
        dat.columns[dat.columns.get_loc("Civilian_labor_force_2018"):
                    dat.columns.get_loc("Median_Household_Income_2018") + 1].to_list() +
        dat.columns[dat.columns.get_loc("Total_Male"):
                    dat.columns.get_loc("Total households") + 1].to_list() +
        ["prop_cum_deaths", "prop_cum_cases"])

    dat_selected = dat[keep_columns]

    min_values = np.concatenate([
        np.repeat(0, 88),
        np.repeat(-np.inf, dat_selected.shape[1] - 90),
        np.repeat(0, 2)
    ])
    max_values = np.concatenate([
        np.repeat(100, 88),
        np.repeat(np.inf, dat_selected.shape[1] - 90),
        np.repeat(100, 2)
    ])

    imp = IterativeImputer(max_iter=10,
                           random_state=100620,
                           sample_posterior=True,
                           min_value=min_values,
                           max_value=max_values)

    imputed_features = imp.fit_transform(dat_selected)

    imputed_df = dat_selected.copy()
    imputed_df.loc[:, keep_columns] = imputed_features
    return imputed_df
Example #20
def imputing_nan(df):
    """Perform multiple imputations

        Args:
            df (DataFrame): Source DataFrame

        Returns:
            dict_impute: Dictionary with multiple imputation techniques performed over DataFrame
        """

    # Store DataFrame structure
    df_columns = df.columns
    df_index = df.index

    # Identify string data columns
    str_cols = df.select_dtypes(include=['object']).columns
    rest_cols = [i for i in df_columns if i not in str_cols]

    # Soft-Impute
    df_soft = SoftImpute().fit_transform(df[rest_cols])

    # Restore DataFrame structure
    df_soft = pd.DataFrame(df_soft, columns=rest_cols, index=df_index)
    df_soft = pd.concat([df[str_cols], df_soft], axis=1)

    # Building Regressor for IterativeImputer
    rgr = KNeighborsRegressor(n_neighbors=5)

    # Create Imputer
    imp = IterativeImputer(estimator=rgr, random_state=1234)

    # Apply imputation through KNN to DataFrame
    df_knn = imp.fit_transform(df[rest_cols])

    # Restore DataFrame structure
    df_knn = pd.DataFrame(df_knn, columns=rest_cols, index=df_index)
    df_knn = pd.concat([df[str_cols], df_knn], axis=1)

    dict_impute = {'soft': df_soft, 'knn': df_knn}

    return dict_impute
Example #21
def exp_mi(xmiss, w, y, regularize, m=10):

    res_tau_dr = []
    res_tau_ols = []
    res_tau_ols_ps = []
    res_tau_resid = []
    for i in range(m):
        imp = IterativeImputer(sample_posterior=True, random_state=i)
        x_imp_mice = imp.fit_transform(xmiss)
        tau_tmp = compute_estimates(x_imp_mice, w, y, regularize)
        res_tau_dr.append(tau_tmp['tau_dr'])
        res_tau_ols.append(tau_tmp['tau_ols'])
        res_tau_ols_ps.append(tau_tmp['tau_ols_ps'])
        res_tau_resid.append(tau_tmp['tau_resid'])

    return {
        'tau_dr': np.mean(res_tau_dr),
        'tau_ols': np.mean(res_tau_ols),
        'tau_ols_ps': np.mean(res_tau_ols_ps),
        'tau_resid': np.mean(res_tau_resid),
    }
Example #22
def impute_BayesRegression(dataframe, df_missing, rnd_numbers_row,
                           rnd_numbers_column, error_i, m):
    imputed_value_temp = pd.DataFrame()
    imputed_value_list = []
    for i in range(m):
        imp_BR = IterativeImputer(tol=0.01,
                                  max_iter=10,
                                  sample_posterior=True,
                                  estimator=BayesianRidge(normalize=True,
                                                          alpha_1=0,
                                                          lambda_1=0.005))
        df_imputed = pd.DataFrame(imp_BR.fit_transform(df_missing))
        for k, row in enumerate(rnd_numbers_row):
            if k in error_i:
                pass
            else:
                imputed_value_list.append(
                    df_imputed.iloc[row, rnd_numbers_column[k]])
        imputed_value_temp[i] = imputed_value_list
        imputed_value_list = []
    df_imputed.columns = df_missing.columns.tolist()
    return df_imputed, imputed_value_temp
Example #23
def test_iterative_imputer_zero_iters():
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    missing_flag = X == 0
    X[missing_flag] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    assert not np.all(imputer.transform(X) == imputer.initial_imputer_.transform(X))

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X), imputer.initial_imputer_.transform(X))
Example #24
def __completef(fitinfo, compmethod=None):
    """Completes missing values in f using IterativeImpute from scikit-learn."""
    f = fitinfo['f']
    if compmethod == 'KNN':
        estimator = KNeighborsRegressor()
    elif compmethod == 'BayesianRidge':
        estimator = BayesianRidge()
    elif compmethod == 'RandomForest':
        estimator = RandomForestRegressor()
    else:
        raise ValueError('Specify completion method.')

    transformer = IterativeImputer(estimator=estimator)
    fhat = transformer.fit_transform(f)

    if not np.isfinite(fhat).all():
        raise ValueError(
            'Completion method (sklearn.impute.IterativeImputer) failed.')

    fitinfo['f'] = fhat
    fitinfo['completionmethod'] = 'IterativeImputer:{:s}'.format(compmethod)
    return
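A hedged usage sketch for the helper above (the fitinfo dict and toy matrix are illustrative; it assumes numpy and the estimator imports used by the surrounding module):

import numpy as np

f = np.random.RandomState(1).rand(20, 4)
f[0, 2] = np.nan  # introduce a missing entry
fitinfo = {'f': f}
__completef(fitinfo, compmethod='BayesianRidge')
print(np.isnan(fitinfo['f']).any())  # False: the matrix has been completed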
Example #25
def process_repeated_measures(df):
    """
    Function to process repeated measures data for each participant
    ---------------------------------------------------------------

    input:      pd.DataFrame containing repeated measures for a single
                participant
    returns:    Reshaped, imputed and scaled data for participant

    Notes:
    -----
    Some participants have missing data in their repeated measures. We
    need to decide what to do with this. Options are (1) fill with zeros; (2)
    mean impute; (3) multivariate impute). I'm using (3) for now, using the
    experimental sklearn.impute.IterativeImputer. See[1].

    [1]: https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html
    """
    # Select columns
    df = df.drop('subjectid', axis=1)
    df = df.transpose()
    df.columns = ['value']
    df['variable'] = df.index
    df['week'] = df['variable'].str.extract(r'(\d+)$')
    df['measure'] = df['variable'].str.replace(r'\d+$', '', regex=True)
    df.drop(['variable'], axis=1, inplace=True)
    # Reshape from LONG to WIDE
    df = df.pivot(index='week', values='value', columns='measure')
    # If missing all repeated measures, replace with zeros to allow loop to
    # continue
    if df.isnull().all().all():
        df = df.fillna(0)
    # Impute missing values
    mvi = IterativeImputer(sample_posterior=True)
    dfi = mvi.fit_transform(df)
    # Scale
    sc = StandardScaler()
    scaled = pd.DataFrame(sc.fit_transform(dfi))
    return scaled
Example #26
def test_iterative_imputer_catch_warning():
    # check that we catch a RuntimeWarning due to a division by zero when a
    # feature is constant in the dataset
    X, y = load_boston(return_X_y=True)
    n_samples, n_features = X.shape

    # simulate that a feature only contain one category during fit
    X[:, 3] = 1

    # add some missing values
    rng = np.random.RandomState(0)
    missing_rate = 0.15
    for feat in range(n_features):
        sample_idx = rng.choice(np.arange(n_samples),
                                size=int(n_samples * missing_rate),
                                replace=False)
        X[sample_idx, feat] = np.nan

    imputer = IterativeImputer(n_nearest_features=5, sample_posterior=True)
    with pytest.warns(None) as record:
        X_fill = imputer.fit_transform(X, y)
    assert not record.list
    assert not np.any(np.isnan(X_fill))
Example #27
def impute_xform(df):

    imp = IterativeImputer(missing_values=np.nan,
                           random_state=5,
                           max_iter=20,
                           add_indicator=True)
    imputed_arr = imp.fit_transform(df)
    nans = df.isna().sum()
    nan_labels = nans[nans > 0].index
    nan_labels = [col + '_nan' for col in nan_labels]

    encoded = list(df.columns)
    encoded.extend(nan_labels)
    features_imputed = pd.DataFrame(imputed_arr, columns=encoded)

    skewed = [
        'ScreenPorch', 'PoolArea', 'LotFrontage', '3SsnPorch', 'LowQualFinSF'
    ]
    features_log_xformed = pd.DataFrame(data=features_imputed)
    features_log_xformed[skewed] = features_imputed[skewed].apply(
        lambda x: np.log(x + 1))

    return features_log_xformed
Example #28
def clean_df():
    print('Loading tract table')
    df = pd.read_csv(os.path.join(PRODUCT_GEO_PATH, 'tract_table.csv'))
    df['GEOID'] = df['GEOID'].astype(str).apply(lambda x: x.zfill(11))
    df = df.set_index('GEOID')
    geoid_to_city = df.city.to_dict()

    df = df[[c for c in df.columns if '-M-' not in c]]

    print('Reformatting tract table')
    year_dfs = []
    for year in ACS_TIME_COVERAGE:
        year_df = df[[c for c in df.columns if c.split('-')[0] == str(year)]]
        year_df = year_df.rename(
            columns={c: '-'.join(c.split('-')[1:])
                     for c in year_df.columns})
        year_df['year'] = year
        year_df = year_df.set_index('year', append=True)
        year_dfs.append(year_df)

    gydf = pd.concat(year_dfs)
    gydf['city'] = gydf.apply(lambda x: geoid_to_city[x.name[0]], axis=1)
    gydf = gydf.set_index('city', append=True)

    gydf = gydf.drop(columns=drop_columns)

    print('Imputing missing values')
    imp_mean = IterativeImputer(random_state=0,
                                min_value=0,
                                skip_complete=True,
                                imputation_order='random')
    X = imp_mean.fit_transform(gydf.values)

    cdf = copy.deepcopy(gydf)
    cdf.iloc[:, :] = X

    return cdf, geoid_to_city
Example #29
    def fit(self, n=0, seed=None):
        """
        Parameters
        ----------
        n : int
            Number of block bootstrap replicates
        """
        if seed is not None:
            # add a large constant, so incrementing the seed by 1 will work
            seed = seed + 1234567

        imp = IterativeImputer(**self._imputer_kwargs,
                               random_state=seed)
        col = 1
        rows = self.data.shape[0]
        temp = imp.fit_transform(self.data)
        self.result = temp[:, col]

        if n != 0:
            self.boot_result = np.zeros([rows, n])

        for i in range(n):
            random_state = seed + i

            imp = IterativeImputer(**self._imputer_kwargs,
                                   random_state=random_state)

            # get block bootstrap sample
            boot_sample = bbs_replicate(seed=random_state)

            # impute the bootstrap sample
            imputed_sample = imp.fit_transform(boot_sample)

            # store the imputed target column for this replicate
            self.boot_result[:, i] = imputed_sample[:, self.target_col]
Example #30
def test_iterative_imputer_zero_iters():
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    missing_flag = X == 0
    X[missing_flag] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    assert not np.all(imputer.transform(X) ==
                      imputer.initial_imputer_.transform(X))

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
if __name__ == "__main__":
    train_dir = '/opt/ml/processing/input/train'
    test_dir = '/opt/ml/processing/input/test'
    seed = 0
    train_df = pd.read_csv(os.path.join(train_dir, 'train.csv'),
                           index_col='ID')
    test_df = pd.read_csv(os.path.join(test_dir, 'test.csv'), index_col='ID')

    print('Scaling Data')
    std_scale = preprocessing.StandardScaler().fit(train_df.iloc[:, 1:])
    train_df_scaled = std_scale.transform(train_df.iloc[:, 1:])
    test_df_scaled = std_scale.transform(test_df)

    print('Training Data Imputation Model')
    imputer = IterativeImputer(random_state=seed, missing_values=0)
    train_imputed = imputer.fit_transform(train_df_scaled)

    # Transforming test data
    test_imputed = imputer.transform(test_df_scaled)

    train_imputed_output_path = os.path.join('/opt/ml/processing/train',
                                             'train_imputed.csv')
    test_imputed_output_path = os.path.join('/opt/ml/processing/test',
                                            'test_imputed.csv')

    pd.concat([
        train_df['target'],
        pd.DataFrame(
            train_imputed, columns=train_df.columns[1:], index=train_df.index)
    ],
              axis=1).to_csv(train_imputed_output_path)
Example #32
def training_data_pipeline():
	"""
		This function fetches and transforms the data needed for training the models. 
	"""
	# Fetch data
	print(f'[{datetime.now(TZ_MKD).strftime("%Y-%m-%d %H:%M:%S")}]\tFetching historical data for training')
	fetch_historical_data(date_start=TZ_MKD.localize(datetime(2011, 1, 1, 0, 0, 0)), date_end=datetime.now(TZ_MKD), 
						  pipeline_type=PIPELINE_TRAINING)

	# Transform data
	for station, pollutants in SENSORS.items(): 
		print(f'[{datetime.now(TZ_MKD).strftime("%Y-%m-%d %H:%M:%S")}]\tProcessing training data for {station}')
		df = pd.read_csv(f'./data/training/first-order/{station}', index_col=0)

		# add feature indicating missingness for each pollutant
		for p in pollutants:
			df[f'{p}_missing'] = df[f'{p}'].isna().astype('int32')

		# log-transform pollutants
		for p in pollutants:
			df[p] = np.log(df[p] + 1)

		# train-val split
		train_size = int(df.shape[0] * 0.85)
		df_train = df.iloc[:train_size]
		df_valid = df.iloc[train_size:]

		# fit and save scalers
		features_to_normalize = ['cloud_cover', 'precip', 'uv_index', 'visibility']
		features_to_standardize =  pollutants + ['temperature', 'humidity', 'dew_point',
												'pressure', 'wind_speed']
		scalers = {}
		for f in features_to_normalize:
			scaler = MinMaxScaler()
			scaler.fit(df_train[f].values.reshape(-1,1))
			scalers[f] = scaler

		for f in features_to_standardize:
			scaler = StandardScaler()
			scaler.fit(df_train[f].values.reshape(-1,1))
			scalers[f] = scaler

		if not os.path.exists(f'./pickles/scalers/{station}'):
			os.makedirs(f'./pickles/scalers/{station}')

		for feature, scaler in scalers.items():
			with open(f'./pickles/scalers/{station}/{feature}', 'wb') as f:
				pickle.dump(scaler, f)

		df_train_scaled = scale_data(df_train, station)
		df_valid_scaled = scale_data(df_valid, station)
		train_values = df_train_scaled.values.copy()
		valid_values = df_valid_scaled.copy()

		# impute missing values
		imputer = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=12, random_state=0), 
								   random_state=0, skip_complete=True, max_iter=5)

		imputed_train_values = imputer.fit_transform(train_values)
		imputed_valid_values = imputer.transform(valid_values)

		if not os.path.exists(f'./pickles/imputers'):
			os.makedirs(f'./pickles/imputers')

		with open(f'./pickles/imputers/{station}', 'wb') as f:
			pickle.dump(imputer, f)

		df_train_imputed = pd.DataFrame(data=imputed_train_values, 
								index=df_train_scaled.index,
								columns=df_train_scaled.columns)
		
		df_valid_imputed = pd.DataFrame(data=imputed_valid_values, 
										index=df_valid_scaled.index,
										columns=df_valid_scaled.columns)

		if not os.path.exists(f'./data/training/second-order/{station}'):
			os.makedirs(f'./data/training/second-order/{station}')

		df_train_imputed.to_csv(f'./data/training/second-order/{station}/train', index=True)
		df_valid_imputed.to_csv(f'./data/training/second-order/{station}/valid', index=True)

		# build seq2seq (third-order) datasets
		train_encoder_input_data, train_decoder_input_data, train_decoder_target_data = \
			build_seq2seq_datasets(df_train_imputed, pollutants)

		valid_encoder_input_data, valid_decoder_input_data, valid_decoder_target_data = \
			build_seq2seq_datasets(df_valid_imputed, pollutants)

		if not os.path.exists(f'./data/training/third-order/{station}'):
			os.makedirs(f'./data/training/third-order/{station}')

		np.save(f'./data/training/third-order/{station}/train_encoder_input_data.npy', train_encoder_input_data)
		np.save(f'./data/training/third-order/{station}/train_decoder_input_data.npy', train_decoder_input_data)
		np.save(f'./data/training/third-order/{station}/train_decoder_target_data.npy', train_decoder_target_data)

		np.save(f'./data/training/third-order/{station}/valid_encoder_input_data.npy', valid_encoder_input_data)
		np.save(f'./data/training/third-order/{station}/valid_decoder_input_data.npy', valid_decoder_input_data)
		np.save(f'./data/training/third-order/{station}/valid_decoder_target_data.npy', valid_decoder_target_data)
Example #33
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    X = np.zeros((100, 2))
    imputer = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        imputer.fit_transform(X)
Example #34
def data_preprocessing(dat: pd.DataFrame,
                       art='C',
                       y=None,
                       logger=None,
                       remove=True):
    """
    Encoding + remove columns with more than 1/2 na if remove==True + remove columns with all na + imputation
    if art == 'C', will do LabelEncoding first for the target column
    ================
    Parameter:
    ================
    dat - type of DataFrame
    art - type of string
        either 'C' for classification or 'R' for regression; indicates the type of problem
    y - type of string
        the name of the target column; if None, set the last column of the data set as target
        considering only one column for label
    logger - type of Logger
    remove - type of boolean
        whether remove the columns with na value more than half length or not
    =================
    Output
    =================
    dat - type of Dataframe 
        the dataframe after preprocessing
    cols - type of list of string
        the name of the numerical columns
    """
    if logger is None:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        logger = logging.getLogger(__name__)

    logger.info('Start data preprocessing')
    # replace original indices with default ones
    dat = dat.reset_index(drop=True)

    if art == 'C':
        logger.info('Start to label target feature y for classification task')
        dat.iloc[:, -1] = LabelEncoder().fit_transform(dat.iloc[:, -1])
        logger.info('End with label encoding the target feature')
    if remove:
        # remove columns with more than 1/2 na
        removed_cols = dat.columns[dat.isna().sum() / len(dat) >= .5].to_list()
        dat = dat.loc[:, dat.isna().sum() / len(dat) < .5]
        logger.info(
            'Following features are removed from the dataframe because half of their values are NA: %s'
            % removed_cols)
    # Encoding
    oe = OneHotEncoder(drop='first')
    # get categorical columns
    if y:
        dat_y = dat[[y]]
        cols = dat.columns.to_list()
        cols.remove(y)
        dat_x = dat[cols]
    else:
        dat_y = dat[[dat.columns[-1]]]
        dat_x = dat[dat.columns[:-1]]
    dat_categ = dat_x.select_dtypes(include=['object'])
    # get kterm of categ features
    for i in dat_categ.columns:
        # save output to dat
        tmp = dat_x[i].value_counts()
        dat_x[i + '_kterm'] = dat_x[i].map(lambda x: tmp[x]
                                           if x in tmp.index else 0)
    # float columns including the k term cols
    dat_numeric = dat_x.select_dtypes(
        include=['float32', 'float64', 'int32', 'int64'])
    # onehot encoding and label encoding
    dat_categ_onehot = dat_categ.iloc[:,
                                      dat_categ.apply(lambda x: len(x.unique())
                                                      ).values < 8]
    dat_categ_label = dat_categ.iloc[:,
                                     dat_categ.apply(lambda x: len(x.unique())
                                                     ).values >= 8]
    flag_onehot = False
    flag_label = False
    # oe
    if dat_categ_onehot.shape[1] > 0:
        logger.info(
            'Start to do onehot to the following categoric features: %s' %
            (str(dat_categ_onehot.columns.to_list())))
        dat_onehot = pd.DataFrame(
            oe.fit_transform(dat_categ_onehot.astype(str)).toarray(),
            columns=oe.get_feature_names(dat_categ_onehot.columns))
        logger.info('End with onehot')
        flag_onehot = True
    else:
        dat_onehot = None
    # le
    if dat_categ_label.shape[1] > 0:
        logger.info(
            'Start to do label encoding to the following categoric features: %s'
            % (str(dat_categ_label.columns.to_list())))
        dat_categ_label = dat_categ_label.fillna('NULL')
        dat_label = pd.DataFrame(columns=dat_categ_label.columns)
        for i in dat_categ_label.columns:
            dat_label[i] = LabelEncoder().fit_transform(
                dat_categ_label[i].astype(str))
        flag_label = True
        logger.info('End with label encoding')
    else:
        dat_label = None
    # scaling
    # combine
    dat_new = pd.DataFrame()
    if flag_onehot and flag_label:
        dat_new = pd.concat([dat_numeric, dat_onehot, dat_label], axis=1)
    elif flag_onehot:
        dat_new = pd.concat([dat_numeric, dat_onehot], axis=1)
    elif flag_label:
        dat_new = pd.concat([dat_numeric, dat_label], axis=1)
    else:
        dat_new = dat_numeric
    dat_new = pd.concat([dat_new, dat_y], axis=1)
    # imputation
    dat_new = dat_new.dropna(axis=1, how='all')
    if dat_new.isna().sum().sum() > 0:
        logger.info(
            'Nan value exist, start to fill na with iterative imputer: ' +
            str(dat_new.isna().sum().sum()))
        # include na value, impute with iterative Imputer or simple imputer
        columns = dat_new.columns
        imp = IterativeImputer(max_iter=10, random_state=0)
        # imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        dat_new = imp.fit_transform(dat_new)
        dat_new = pd.DataFrame(dat_new, columns=columns)
    dat_numeric = dat_new.iloc[:, :-1].select_dtypes(
        include=['float32', 'float64', 'int32', 'int64'])
    logger.info('End with filling nan')
    return dat_new, dat_numeric.columns
Example #35
    print('Iteration', i + 1)

    # ### Split Data

    X_train, X_test, y_train, y_test = train_test_split(
        df.values,
        labels.values.ravel(),
        train_size=train_size,
        shuffle=True,
        stratify=labels.values.ravel())

    # ### Impute Data
    if data_impute:
        imp = IterativeImputer(max_iter=25, random_state=1337)

        X_train = imp.fit_transform(X_train)
        X_test = imp.transform(X_test)

    # ### Augment Data
    if smote_ratio > 0:
        smote = SMOTE(sampling_strategy='all',
                      random_state=1337,
                      k_neighbors=5,
                      n_jobs=1)

        X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
Example #36
                                                   smooth, hpass)]['topology'])
                    vect_all.append(np.concatenate(vects, axis=1))
                    del vects
                X_top = np.swapaxes(np.hstack(vect_all), 0, 1)

                Y = np.array(id_list)
                try:
                    df_summary.at[i, 'grid'] = (atlas, est, clust, _k, smooth,
                                                hpass)
                    bad_ixs = [i[1] for i in np.argwhere(np.isnan(X_top))]
                    for m in set(bad_ixs):
                        if (X_top.shape[0] -
                                bad_ixs.count(m)) / X_top.shape[0] < 0.50:
                            X_top = np.delete(X_top, m, axis=1)
                    imp = IterativeImputer(max_iter=50, random_state=42)
                    X_top = imp.fit_transform(X_top)
                    scaler = StandardScaler()
                    X_top = scaler.fit_transform(X_top)
                    discr_stat_val, rdf = discr_stat(X_top, Y)
                    df_summary.at[i, 'discriminability'] = discr_stat_val
                    print(discr_stat_val)
                    #print(rdf)
                    del discr_stat_val
                    i += 1
                except:
                    i += 1
                    continue
    elif modality == 'dwi':
        gen_hyperparams = ['est', 'clust', '_k']
        for col in cols:
            build_hp_dict(col,
Example #37
def por_facies_imputer(dataframe):
    """
    Imputes missing porosity and facie labels using KNN 

    Args:
        df ([DataFrame]): The dataframe should includes the 
        following columns: ['X', 'Y', 'depth', 'por', 'rho','facies']
    Returns:
        df ([DataFrame])
    """
    df_original = dataframe.copy(deep=False)

    df = df_original.loc[:, ['X', 'Y', 'depth', 'por', 'rho', 'facies']]
    categorical = ['facies']
    numerical = ['X', 'Y', 'depth', 'por', 'rho']

    df['Imputed'] = (df.isnull().sum(axis=1)) > 0

    df[categorical] = df[categorical].apply(lambda series: pd.Series(
        LabelEncoder().fit_transform(series[series.notnull()]),
        index=series[series.notnull()].index))

    # Instantiate imputers
    imp_num = IterativeImputer(estimator=RandomForestRegressor(),
                               initial_strategy='mean',
                               max_iter=20,
                               random_state=0)

    imp_cat = IterativeImputer(estimator=RandomForestClassifier(),
                               initial_strategy='most_frequent',
                               max_iter=20,
                               random_state=0)

    # Fit
    df[numerical] = imp_num.fit_transform(df[numerical])
    df[categorical] = imp_cat.fit_transform(df[categorical])

    #Perform corrections to facies information with density and porosity values
    df['facies'] = np.where((df.por < 0.1) & (df.rho > 2.40), 1, df.facies)
    df['facies'] = np.where((df.por < 0.08) & (df.rho < 2.25), 2, df.facies)
    df['facies'] = np.where(
        (df.por < 0.13) & (df.por > 0.08) & (df.rho < 2.40), 3, df.facies)

    #Update por, rho and facies with the predicted values for missing data
    df_original["por"] = df["por"]
    df_original["rho"] = df["rho"]
    df_original["facies"] = df["facies"]

    facies_map = {0: 'SS', 1: 'SS-Sh', 2: 'Sh', 3: 'Sh-SS'}

    df_original["facies"] = df_original["facies"].map(facies_map)

    print('---------------------------------')
    print('Porosity initial missing values = ' +
          str(dataframe['por'].isna().sum()))
    print('Porosity final missing values = ' +
          str(df_original['por'].isna().sum()))
    print('Facies initial missing values = ' +
          str(dataframe['facies'].isna().sum()))
    print('Facies final missing values = ' +
          str(df_original['facies'].isna().sum()))
    print('---------------------------------')

    return df_original
Example #38
################################################################################
# The remaining missings will be imputed via Iterative Imputer:
# Models each feature with missing values as a function of other features, and
# uses that estimate for imputation

# If we don't put the categorical values back later, maybe we can use some encoding?
X_train = train.drop(columns=Categorical, axis=1)
X_train.drop(columns='TARGET', axis=1, inplace=True)
X_test = test.drop(columns=Categorical, axis=1)

# Impute
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

filler = IterativeImputer()
X_train_filled = filler.fit_transform(X_train)
X_test_filled = filler.transform(X_test)

X_train_filled = pd.DataFrame(X_train_filled, columns=list(X_train))
X_test_filled = pd.DataFrame(X_test_filled, columns=list(X_test))

train = pd.concat([train[Categorical], X_train_filled, train['TARGET']], axis=1)
test = pd.concat([test[Categorical], X_test_filled], axis=1)

# Final check:
miss(train,1)
miss(test,1)

# # If we need to standardize data:
# from sklearn import preprocessing
# X_scaled = preprocessing.StandardScaler().fit_transform(X)
Example #39
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    X = np.zeros((100, 2))
    imputer = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        imputer.fit_transform(X)
Example #40
loans_imp_meanDF = pd.DataFrame(loans_imp_mean, columns=numeric_cols.columns)
print("\n\nDataframe info after imputation of numeric columns with mean")
# Check the DataFrame's info
print(loans_imp_meanDF.info())

##Impute with IterativeImputer
#https://scikit-learn.org/stable/modules/impute.html#iterative-imputer
#at each step, a feature column is designated as output y and the other feature
#columns are treated as inputs X. A regressor is fit on (X, y) for known y.
#Then, the regressor is used to predict the missing values of y. This is done
#for each feature in an iterative fashion, and then is repeated for max_iter imputation rounds.
# Iteratively impute
imp_iter = IterativeImputer(max_iter=5,
                            sample_posterior=True,
                            random_state=123)
loans_imp_iter = imp_iter.fit_transform(numeric_cols)
# Convert returned array to DataFrame
loans_imp_iterDF = pd.DataFrame(loans_imp_iter, columns=numeric_cols.columns)
# Check the DataFrame's info
print("\n\nDataframe info after iterative imputation of numeric columns")
print(loans_imp_iterDF.info())

########## Replace outliers - Winsorization
# Print: before dropping
print("\n\nDetect and replace outliers")
df = df_filled
print(df)
numeric_cols = df.select_dtypes(include=[np.number])
print(numeric_cols.mean())
print(numeric_cols.median())
print(numeric_cols.max())
Example #41
# %%
medicamentos.columns

dum_reg = pd.get_dummies(medicamentos.REGIONAL_EPS_DESC)
dum_medicamento = dum_sign(medicamentos.NOMBRE_MEDICAMENTO, 0.01)
dum_diag = dum_sign(medicamentos.DIAGNOSTICO_EPS_DESC, 0.01)

aver = pd.concat([
    dum_reg, dum_medicamento, dum_diag,
    medicamentos.FECHA_EMISION.apply(lambda x: x.timestamp()),
    medicamentos.NUMERO_CANTIDAD_PRESTACIONES
],
                 axis=1)

medicamentos_imputed = IterativeImputer(random_state=141854)
sal = medicamentos_imputed.fit_transform(aver)

# %%
medicamentos["NumeroCantidadPrestacionesImputado"] = sal[:, -1]

# %%
medicamentos["NumeroCantidadPrestacionesImputadoInd"] = pd.isna(
    medicamentos.NUMERO_CANTIDAD_PRESTACIONES).astype(int)

# %%
medicamentos[medicamentos.NumeroCantidadPrestacionesImputadoInd == 1]

# %%
## The goal of these scripts is to create the adherence observation cases.
## The methodology consists of checking whether there is another medication delivery
## close to the final day of each delivery (a delta of 5 days is allowed).
Example #42
def load_both_data(project, metric):
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis=1, how='all')
    cols_list = understand_df.columns.values.tolist()
    for item in ['Kind', 'Name', 'commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0, item)
    understand_df = understand_df[cols_list]
    cols = understand_df.columns.tolist()
    understand_df = understand_df.drop_duplicates(cols[4:len(cols)])
    understand_df['Name'] = understand_df.Name.str.rsplit('.', 1).str[1]

    commit_guru_file_level_path = 'data/commit_guru_file/' + project + '.csv'
    commit_guru_file_level_df = pd.read_csv(commit_guru_file_level_path)
    commit_guru_file_level_df[
        'commit_hash'] = commit_guru_file_level_df.commit_hash.str.strip('"')
    commit_guru_file_level_df = commit_guru_file_level_df[
        commit_guru_file_level_df['file_name'].str.contains('.java')]
    commit_guru_file_level_df[
        'Name'] = commit_guru_file_level_df.file_name.str.rsplit(
            '/', 1).str[1].str.split('.').str[0].str.replace('/', '.')
    commit_guru_file_level_df = commit_guru_file_level_df.drop('file_name',
                                                               axis=1)

    df = understand_df.merge(commit_guru_file_level_df,
                             how='left',
                             on=['commit_hash', 'Name'])

    cols = df.columns.tolist()
    cols.remove('Bugs')
    cols.append('Bugs')
    df = df[cols]

    for item in ['Kind', 'Name', 'commit_hash']:
        if item in cols:
            df = df.drop(labels=[item], axis=1)


#     df.dropna(inplace=True)
    df = df.drop_duplicates()
    df.reset_index(drop=True, inplace=True)

    y = df.Bugs
    X = df.drop('Bugs', axis=1)
    cols = X.columns
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)
    imp_mean = IterativeImputer(random_state=0)
    X = imp_mean.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)

    if metric == 'process':
        X = X[[
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ]]
    elif metric == 'product':
        X = X.drop([
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ],
                   axis=1)
    else:
        X = X
    return X, y