Example #1
def test_iterative_imputer_verbose():
    rng = np.random.RandomState(0)

    n = 100
    d = 3
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1)
    imputer.fit(X)
    imputer.transform(X)
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2)
    imputer.fit(X)
    imputer.transform(X)
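
These test snippets assume IterativeImputer has already been imported. A minimal, self-contained sketch of the basic fit/transform usage, assuming scikit-learn 0.21 or later (where IterativeImputer is still experimental and needs the enable_iterative_imputer import), looks like this:

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

X = np.array([[1.0, 2.0], [3.0, np.nan], [np.nan, 6.0]])
imputer = IterativeImputer(max_iter=10, random_state=0, verbose=1)
X_filled = imputer.fit_transform(X)  # NaN entries are replaced by model predictions
print(X_filled)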
Example #2
def test_iterative_imputer_truncated_normal_posterior():
    #  test that the values that are imputed using `sample_posterior=True`
    #  with boundaries (`min_value` and `max_value` are not None) are drawn
    #  from a distribution that looks Gaussian via the Kolmogorov-Smirnov test.
    #  note that starting from the wrong random seed will make this test fail
    #  because random sampling doesn't occur at all when the imputation
    #  is outside of the (min_value, max_value) range
    pytest.importorskip("scipy", minversion="0.17.0")
    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)

    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    # guard against a zero standard deviation before normalizing
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject the null hypothesis
    # null hypothesis: the distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does not appear to be normal"
Example #3
def test_iterative_imputer_additive_matrix():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            X_filled[:, (i+j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01)
Example #4
def MultiIterTrees(dataset):
    from sklearn.impute import IterativeImputer

    Dim = dataset['d']
    trainX = dataset['train_x']
    testX = dataset['test_x']
    trainM = dataset['train_m']
    testM = dataset['test_m']
    # Train_No = dataset['train_no']
    # Test_No = dataset['test_no']

    test_X = testX.copy()
    train_X = trainX.copy()

    train_X[trainM == 0] = np.nan
    test_X[testM == 0] = np.nan

    # Extra-Trees imputation
    etr_estimator = ExtraTreesRegressor(n_estimators=10, random_state=0)

    etr_imp = IterativeImputer(random_state=0, estimator=etr_estimator)
    etr_imp.fit(train_X)

    imputed_X = etr_imp.transform(test_X)

    print('>>>ExtraTreesRegressor IterativeImputer result: \n')
    print(imputed_X)

    _all_rmse = compute_rmse(testX, imputed_X, testM)

    print('>>>all_rmse', _all_rmse)

    return _all_rmse
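
Example #4 relies on ExtraTreesRegressor and a project-specific compute_rmse helper that is not shown. A self-contained sketch of the same estimator-swapping pattern, using a stand-in rmse_on_missing helper (an assumption, not the original helper), might look like:

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

def rmse_on_missing(X_true, X_imputed, mask):
    # RMSE computed only over the entries flagged as missing (mask == 0)
    missing = mask == 0
    return np.sqrt(np.mean((X_true[missing] - X_imputed[missing]) ** 2))

rng = np.random.RandomState(0)
X_true = rng.rand(200, 5)
mask = (rng.rand(200, 5) > 0.2).astype(int)  # 1 = observed, 0 = missing
X_obs = X_true.copy()
X_obs[mask == 0] = np.nan

imp = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, random_state=0),
                       random_state=0)
X_imputed = imp.fit_transform(X_obs)
print(rmse_on_missing(X_true, X_imputed, mask))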
Example #7
def MultiIterBayesian(dataset):

    Dim = dataset['d']
    trainX = dataset['train_x']
    testX = dataset['test_x']
    trainM = dataset['train_m']
    testM = dataset['test_m']
    # Train_No = dataset['train_no']
    # Test_No = dataset['test_no']

    test_X = testX.copy()
    train_X = trainX.copy()

    train_X[trainM == 0] = np.nan
    test_X[testM == 0] = np.nan

    # Bayesian imputation
    br_estimator = BayesianRidge()

    by_imp = IterativeImputer(random_state=0, estimator=br_estimator)
    by_imp.fit(train_X)

    imputed_X = by_imp.transform(test_X)

    print('>>>BayesianRidge IterativeImputer result: \n')
    print(imputed_X)

    _all_rmse = compute_rmse(testX, imputed_X, testM)

    print('>>>all_rmse', _all_rmse)

    return _all_rmse
Example #8
def test_iterative_imputer_transform_stochasticity():
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10,
                             random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               sample_posterior=True,
                               random_state=rng1)
    imputer.fit(X)

    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))

    # when sample_posterior=False, n_nearest_features=None,
    # and imputation_order is not random,
    # the two transforms should be identical even if the rngs are different
    imputer1 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng1)

    imputer2 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng2)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)
Example #10
def main():
    configs = json.load(
        open('MachineLearning/Models/LSTM/Configuration.json', 'r'))
    if not os.path.exists(configs['model']['save_dir']):
        os.makedirs(configs['model']['save_dir'])

    time_series = pd.read_csv(clustered_timeseries_path +
                              "TimeSeriesAggregatedClusteredDeltaTwoDays.csv")
    print(time_series.shape)
    # configs['data']['train_test_split'],  #the split
    #configs['data']['columns_dynamic'] # the columns

    #Impute and Scale Data

    dynamic_features = configs['data']['dynamic_columns']
    grouping = configs['data']['grouping']
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(time_series[dynamic_features])
    time_series[dynamic_features] = imp.transform(
        time_series[dynamic_features])
    time_series = scale(time_series, dynamic_features)

    X = time_series[dynamic_features]
    groups = np.array(time_series[grouping])

    for outcome in configs['data']['classification_outcome']:
        y = time_series[outcome]
        y = y.astype(int)

        model = Model(configs['model']['name'] + outcome)

        print(grouping)
        print(len(set(time_series[grouping])))

        model.build_model(configs)

        i = 0
        for ffold_ind, (training_ind, testing_ind) in enumerate(
                stratified_group_k_fold(X, y, groups,
                                        k=10)):  # CROSS-VALIDATION
            training_groups, testing_groups = groups[training_ind], groups[
                testing_ind]
            this_y_train, this_y_val = y[training_ind], y[testing_ind]
            this_X_train, this_X_val = X.iloc[training_ind], X.iloc[
                testing_ind]

            assert len(set(training_groups) & set(testing_groups)) == 0

            print(" X SHAPE: ", this_X_train.shape)
            print(" Y shape: ", this_y_train.shape)

            input_timesteps = 24
            input_dim = 2

            if i == 0:
                #(NumberOfExamples, TimeSteps, FeaturesPerStep).
                model.train((this_X_train.values).reshape(-1, 24, 35),
                            (this_y_train.values).reshape(-1, 24, 1))
                i = i + 1
Example #11
def problem2_3_3(data):
    data[3].loc[data[3] == 0] = np.nan
    imp = IterativeImputer(missing_values=np.nan)
    imp.fit(data)
    newdata = np.round(imp.transform(data))
    area = newdata[:, 2].tolist()
    print("Use Multivariate:", problem2_3_1(area))
    return "as shown in the plots"
Example #12
 def iterative_inputer_integer(self, df):
     df_copy = df.copy()
     imp = IterativeImputer(max_iter=10, random_state=0)
     imp.fit(df_copy)
     df_new = pd.DataFrame(np.round(imp.transform(df_copy)),
                           columns=df_copy.columns)
     df_new = df_new.astype('int32')
     return df_new
Example #13
def impute_integrated_dataset(integrated):
    imputer = IterativeImputer(random_state=0)
    data = integrated.select_dtypes(exclude="object")
    imputer = imputer.fit(data)
    t_data = imputer.transform(data)
    integrated[data.columns] = t_data
    integrated = integrated.drop(columns=["Name", "Location"])
    integrated["Year"] = integrated["Year"].astype(int)
    return integrated
Example #14
def main():
    df = get_raw_data()
    data_dict = pd.read_csv("data/WiDS Datathon 2020 Dictionary.csv")

    identifier_features = data_dict[data_dict["Category"] == "identifier"][
        "Variable Name"].tolist() + ["icu_id"]
    type__features = [
        "hospital_admit_source",
        "icu_admit_source",
        "icu_stay_type",
        "icu_type",
    ]
    redundant_features = ['readmission_status', 'apache_2_bodysystem']
    features_to_drop = identifier_features + type__features + redundant_features

    # keep features that have less than 70% of nulls
    cut_off_percentage = 0.3
    n_of_nulls = int(cut_off_percentage * df.shape[0])
    df = df.dropna(axis=1, thresh=n_of_nulls)

    numeric_features = data_dict[
        data_dict["Data Type"] == "numeric"]["Variable Name"].tolist() + [
            "bmi", "apache_2_diagnosis", "apache_3j_diagnosis"
        ]

    skewed_numeric_features = df.columns[df.columns.isin(numeric_features)]
    numeric_df = df[skewed_numeric_features]

    imp = IterativeImputer(max_iter=3, verbose=0)
    imp.fit(numeric_df)
    imputed_df = imp.transform(numeric_df)
    imputed_df = pd.DataFrame(imputed_df, columns=numeric_df.columns)

    categorical_features = data_dict[
        data_dict["Data Type"] != "numeric"]["Variable Name"].tolist()

    # remove ['bmi','apache_2_diagnosis','apache_3j_diagnosis'] non_categorical features
    categorical_features = [
        feature for feature in categorical_features
        if feature not in ["bmi", "apache_2_diagnosis", "apache_3j_diagnosis"]
    ]

    skewed_categorical_features = df.columns[df.columns.isin(
        categorical_features)]

    categorical_df = df[skewed_categorical_features]

    # fill the null with the most occurred values

    # df.series.mode() returns a series. so [0] exact value of the series
    for feature in skewed_categorical_features:
        categorical_df[feature].fillna(categorical_df[feature].mode()[0],
                                       inplace=True)

    complete_df = pd.concat([imputed_df, categorical_df], axis=1)

    return complete_df
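
Example #14 imputes the numeric columns with IterativeImputer and fills the categorical columns with their mode in two separate passes. The same split can also be expressed with scikit-learn's ColumnTransformer; a minimal sketch with made-up column names (illustrative only, not the WiDS columns) could be:

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer

df = pd.DataFrame({
    "age": [34.0, np.nan, 51.0],
    "bmi": [22.1, 30.4, np.nan],
    "icu_type": ["MICU", np.nan, "SICU"],
})
numeric_cols = ["age", "bmi"]
categorical_cols = ["icu_type"]

preprocess = ColumnTransformer([
    # model-based imputation for the numeric features
    ("num", IterativeImputer(max_iter=3, random_state=0), numeric_cols),
    # most-frequent (mode) imputation for the categorical features
    ("cat", SimpleImputer(strategy="most_frequent"), categorical_cols),
])
filled = preprocess.fit_transform(df)  # numeric columns first, then categorical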
Example #15
def split_data(X, y):
    in_test = X["year"] >= START_TEST_YEAR

    y_train = y[~in_test]
    y_test = y[in_test]

    # There are a whole bunch of missing label data for some countries such that we
    # can't even forward- or backfill them. Making up your own labels is bad, but
    # we need some form of data and don't have time to find sources for what's missing
    imputer = IterativeImputer(max_iter=100)
    imputer.fit(y_train)

    return (
        X[~in_test],
        X[in_test],
        imputer.transform(y_train),
        imputer.transform(y_test),
    )
Example #16
def imput_data_with_sklearn_imputer(df_daily):

    df_daily_interp = df_daily.copy()
    df_daily_interp["MES"] = df_daily_interp.index.month
    imputer = IterativeImputer(estimator=BayesianRidge(), random_state=1)
    imputer.fit(df_daily_interp.values)
    imputted_vals = imputer.transform(df_daily_interp.values)
    df_daily_interp.loc[:, :] = imputted_vals
    return df_daily_interp
Example #17
def experiment_LinearRegression(df, df_full, score):
    start_time = time.time()
    imp = IterativeImputer(estimator=LinearRegression(),
                           random_state=0,
                           max_iter=10)
    imp.fit(df)
    df_filled = pd.DataFrame(imp.transform(df))
    score.loc['Linear Regression', 'r2_score'] = r2_score(df_full, df_filled)
    score.loc['Linear Regression', 'time'] = time.time() - start_time
Example #18
def fill_chunk(fit_df, transform_df):
    estimator = RandomForestRegressor(n_estimators=10, n_jobs=8)
    imp = IterativeImputer(estimator=estimator, max_iter=5, random_state=0)
    imp.fit(fit_df)
    transformed = imp.transform(transform_df)
    imputed_df = pd.DataFrame(data=transformed,
                              index=transform_df.index,
                              columns=transform_df.columns)
    return imputed_df
Example #19
def use_imputation(df_list, train_x_columns):
    imputer = IterativeImputer(random_state=0, max_iter=30, verbose=2)
    imputer.fit(df_list[0][train_x_columns])

    for i in range(len(df_list)):
        df_list[i][train_x_columns] = imputer.transform(
            df_list[i][train_x_columns])

    return df_list
Example #20
def smooth_pert_corr():

    res = pd.read_csv('/work/GLEAM/perturbation_correction_v2/result.csv', index_col=0)

    gpis_valid = get_valid_gpis(latmin=24., latmax=51., lonmin=-128., lonmax=-64.)
    ind_valid = np.unravel_index(gpis_valid, (720, 1440))

    imp = IterativeImputer(max_iter=10, random_state=0)
    ind = np.unravel_index(res.index.values, (720, 1440))
    for tag in ['a1', 'b1', 'c1','a2', 'b2', 'c2']:

        img = np.full((720, 1440), np.nan)
        img[ind] = res[tag]

        # find all non-zero values
        idx = np.where(~np.isnan(img))
        vmin, vmax = np.percentile(img[idx], [2.5, 97.5])
        img[img < vmin] = vmin
        img[img > vmax] = vmax

        # calculate fitting parameters
        imp.set_params(min_value=vmin, max_value=vmax)
        imp.fit(img)

        # Define an anchor pixel to infer fitted image dimensions
        tmp_img = img.copy()
        tmp_img[idx[0][100], idx[1][100]] = 1000000

        # transform image with and without anchor pixel
        tmp_img_fitted = imp.transform(tmp_img)
        img_fitted = imp.transform(img)

        # Get indices of fitted image
        idx_anchor = np.where(tmp_img_fitted == 1000000)[1][0]
        start = idx[1][100] - idx_anchor
        end = start + img_fitted.shape[1]

        # write output
        img[:, start:end] = img_fitted
        img = gaussian_filter(img, sigma=0.6, truncate=1)

        res.loc[:, tag + '_s'] = img[ind_valid]

    res.to_csv('/work/GLEAM/perturbation_correction_v2/result_smoothed.csv', float_format='%.8f')
Example #21
    def fit(self, X, y=None):
        """Perform co-clustering.

        Parameters
        ----------
        X : numpy array or scipy sparse matrix, shape=(n_samples, n_features)
            Matrix to be analyzed
        """
        random_state = check_random_state(self.random_state)

        check_array(X, accept_sparse=True, dtype="numeric", order=None,
                    copy=False, force_all_finite=True, ensure_2d=True,
                    allow_nd=False, ensure_min_samples=self.n_row_clusters,
                    ensure_min_features=self.n_col_clusters,
                    warn_on_dtype=False, estimator=None)
        
    global indices
    indices = np.argwhere(np.isnan(X))
    if len(indices):
        imp = IterativeImputer(missing_values=np.nan, sample_posterior=False,
                               max_iter=10, tol=0.001,
                               n_nearest_features=4, initial_strategy='most_frequent')
        imp.fit(X)
        X = imp.transform(X)
        check_positive(X)

        X = X.astype(float)

        criterion = self.criterion
        criterions = self.criterions
        row_labels_ = self.row_labels_
        column_labels_ = self.column_labels_
        delta_kl_ = self.delta_kl_

        seeds = random_state.randint(np.iinfo(np.int32).max, size=self.n_init)
        for seed in seeds:
            self._fit_single(X, seed, y)
            if np.isnan(self.criterion):
                raise ValueError("matrix may contain negative or "
                                 "unexpected NaN values")
            # remember attributes corresponding to the best criterion
            if (self.criterion > criterion):
                criterion = self.criterion
                criterions = self.criterions
                row_labels_ = self.row_labels_
                column_labels_ = self.column_labels_
                delta_kl_ = self.delta_kl_

        # update attributes
        self.criterion = criterion
        self.criterions = criterions
        self.row_labels_ = row_labels_
        self.column_labels_ = column_labels_
        self.delta_kl_ = delta_kl_

        return self
Example #22
def train_model_iterative_fill(filename):
    pd.options.mode.chained_assignment = None

    df = pd.read_csv(filename, encoding = 'utf-16', sep = '\t')
    groups = list(set(df[PAGAL_KA_SUGRUPUOTI_SPEJIMUS].astype(int)))

    estimators = [ExtraTreesRegressor(), BayesianRidge(), KNeighborsRegressor(), DecisionTreeRegressor(), RandomForestRegressor()]  # the decision tree regressor works best
    # pick the algorithm to use
    estimator = estimators[0]

    # whether small values should be dropped
    new_filename = filename
    atmesti_mazas_reiksmes = True
    if atmesti_mazas_reiksmes:
        df = atmesti_mazas_tui(df)
        new_filename = filename.split('.')
        new_filename = new_filename[0] + '_be_mazu_tui.' + new_filename[1]
        df.to_csv(new_filename, sep = '\t', encoding = 'utf-16', index = False)

    for group in groups:

        print('Filling indicator %s' % group)
        maindf = pd.read_csv(new_filename, encoding = 'utf-16', sep = '\t')

        # keep only the rows with the given PAGAL_KA_SUGRUPUOTI_SPEJIMUS value
        df = maindf.loc[maindf[PAGAL_KA_SUGRUPUOTI_SPEJIMUS] == group]
        X = shuffle(df)

        # drop values that cannot be converted to numbers
        for name in ATMESTI:
            X = X.drop(name, axis = 1)

        # get rid of empty columns (a correct prediction is impossible without any examples)
        for col in X:
            if X[col].isnull().all():
                X = X.drop(col, axis = 1)
        
        # if there is at least one row that can be filled
        if len(X) > 0:
            index = list(X.index)
            columns = list(X.columns.values)
            # create and train the imputation algorithm
            imp = IterativeImputer(estimator = estimator, missing_values = np.nan)
            imp.fit(X)
            # fill the missing values of X
            X = imp.transform(X)  # X is returned as an np.array here, so convert it back to a pandas.DataFrame
            X = pd.DataFrame(data = X, index = index, columns = columns)
            maindf.update(X)

            new_filename = new_filename.split('.')[0] + '_updated.' + new_filename.split('.')[1]
            # save the predictions
            maindf.to_csv(new_filename, sep = '\t', encoding = 'utf-16', index = False)

    # tidy up the file
    tidy_up_file(new_filename)
    return 0
Example #23
def imputeAll(df, write=''):
    ''' impute all of the columns in the DF apart from URN '''
    import numpy as np
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    URNcol = df['URN'][:]
    originalCols = list(df.columns)
    originalCols.remove('URN')
    print('len(originalCols) after removing URN', len(originalCols))
    dfToFit = df.drop(['URN'], axis=1)
    print('dfToFit.shape', dfToFit.shape)
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(dfToFit)
    print('imp.transform(dfToFit).shape', imp.transform(dfToFit).shape)
    fixed_df = pd.DataFrame(imp.transform(dfToFit), columns=originalCols)
    fixed_df['URN'] = URNcol
    if len(write) > 0:
        fixed_df.to_csv(write)
    return fixed_df
Example #24
 def internal(self, col_list):
     col_list1 = col_list.get('internal')
     data = self.data[self.data.columns.intersection(col_list1)]
     imp_mean = IterativeImputer(random_state=0)
     imp_mean.fit(data)
     data_iterative = pd.DataFrame(imp_mean.transform(data),
                                   columns=data.columns,
                                   index=data.index)
     data_iterative.to_csv('internal.csv', index=False)
     return data_iterative
Example #25
def imputation(df):
    ## Imputation is used to replace the NaNs with an estimated value

    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    imputer = IterativeImputer(max_iter=10, random_state=0)
    imputer.fit(df)
    df_num_imp = imputer.transform(df)
    df = pd.DataFrame(df_num_imp, columns=df.columns)
    return df
Example #26
def impute_columns(life_expectancy_df):
    imputed_values = iterative_impute(
        life_expectancy_df,
        ["Population", "Measles", "Thinness 1-19 Years", "Under-five Deaths"],
        "Population")
    life_expectancy_df['Population'] = imputed_values[:, 0]
    imputed_values = iterative_impute(
        life_expectancy_df,
        ["Hepatitis B", "Diphtheria", "Polio", "Life Expectancy"],
        "Hepatitis B")
    life_expectancy_df['Hepatitis B'] = imputed_values[:, 0]
    imputed_values = iterative_impute(life_expectancy_df, [
        "GDP", "Percentage Expenditure", "Life Expectancy",
        "Income Composition Of Resources", "Schooling", "Alcohol", "BMI"
    ], "GDP")
    life_expectancy_df['GDP'] = imputed_values[:, 0]
    imputed_values = iterative_impute(
        life_expectancy_df,
        ["Total Expenditure", "Alcohol", "Schooling", "BMI"],
        "Total Expenditure")
    life_expectancy_df['Total Expenditure'] = imputed_values[:, 0]
    imputed_values = iterative_impute(life_expectancy_df, [
        "Alcohol", "Schooling", "Income Composition Of Resources",
        "Life Expectancy", "GDP", "Percentage Expenditure", "BMI",
        "Total Expenditure"
    ], "Alcohol")
    life_expectancy_df['Alcohol'] = imputed_values[:, 0]
    imputed_values = iterative_impute(life_expectancy_df, [
        "Schooling", "Alcohol", "Income Composition Of Resources",
        "Life Expectancy", "GDP", "Percentage Expenditure", "BMI",
        "Total Expenditure"
    ], "Schooling")
    life_expectancy_df['Schooling'] = imputed_values[:, 0]
    imputed_values = iterative_impute(life_expectancy_df, [
        "Income Composition Of Resources", "Life Expectancy", "BMI", "GDP",
        "Alcohol", "Diphtheria", "Percentage Expenditure", "Polio"
    ], "Income Composition Of Resources")

    life_expectancy_df['Income Composition Of Resources'] = imputed_values[:,
                                                                           0]
    imputer = IterativeImputer(random_state=0)
    columns = [
        'Thinness 1-19 Years', 'BMI', 'Polio', 'Diphtheria', 'Life Expectancy',
        'Adult Mortality'
    ]
    data = life_expectancy_df[columns]
    imputer = imputer.fit(data)
    imputed_values = imputer.transform(data)
    life_expectancy_df[columns] = imputed_values
    return life_expectancy_df
Example #27
 def iterativemethod(self):
     import numpy as np
     from sklearn.experimental import enable_iterative_imputer
     from sklearn.impute import IterativeImputer
     imp_mean = IterativeImputer(random_state=0)
     # fit on all columns so that each missing feature is modelled from the others
     imputed = imp_mean.fit_transform(self.data)
     # write the imputed values back only for the columns that had missing entries
     for featurem in self.missing_columns:
         self.data[featurem] = imputed[:, self.data.columns.get_loc(featurem)]
     return self.data
Example #28
def MICE(df):
    columns = df.columns
    imp = IterativeImputer(max_iter=100,
                           missing_values=0,
                           random_state=random.randint(0, 1000),
                           sample_posterior=True,
                           verbose=True)
    imp.fit(df)
    res = imp.transform(df)
    df = pd.DataFrame(res, columns=columns)
    return df
Example #29
def main():
    time_series_clustered_demographics = pd.read_csv(clustered_timeseries_path +"TimeSeriesAggregatedClustered.csv")
    time_series_clustered_demographics_not_old = pd.read_csv(clustered_timeseries_path+"TimeSeriesAggregatedClusteredNotOld.csv")
    time_series_clustered_baseline = pd.read_csv(clustered_timeseries_path+"TimeSeriesAggregatedClusteredBaseline.csv")
    time_series_clustered_twodays = pd.read_csv(clustered_timeseries_path+"TimeSeriesAggregatedClusteredDeltaTwoDays.csv")
    time_series = pd.read_csv(data_path+"TimeSeriesAggregated.csv")


    dynamic_features = ['Hour','ALT', 'Albumin', 'Anticoagulant clinic INR', 'Bicarbonate',
           'Biochemistry (Glucose)', 'Blood Lactate', 'C-Reactive-Protein',
           'CSF Glucose', 'Creatinine', 'Creatinine Clearance.', 'D-Dimer',
           'DiasBP', 'Estimated-GFR', 'Fasting Glucose.', 'Ferritin', 'FiO2',
           'Fluid Albumin.', 'Fluid Glucose.', 'GCSEye', 'GCSMotor', 'GCSVerbal',
           'HBA1c-DCCT', 'HBA1c-IFCC', 'Hb', 'HbA1c', 'HeartRate', 'INR',
           'Lactate', 'Lactate (CSF)', 'Lactate (plasma)', 'Lactate-Dehydrogenase',
           'Lymphocytes', 'Lymphocytes (LYMP)', 'NEWS2', 'NT-pro-BNP',
           'Neutrophils', 'OxygenDelivery', 'OxygenLitres', 'OxygenSaturation',
           'PCO2', 'PCV', 'PH', 'PLT', 'PO2', 'PO2/FIO2', 'PainScore',
           'Protein/Creatinine Ratio', 'Random Glucose:', 'Random Urine pH',
           'Random-Urine-Creatinine', 'RespirationRate', 'Reticulocyte HB Content',
           'SupplementalOxygen', 'SysBP', 'Temperature', 'Troponin-I',
           'Troponin-T', 'U-albumin/creat. ratio', 'Urea', 'Urine Albumin conc.',
           'Urine Glucose', 'Urine Urea', 'Venous Bicarbonate', 'Venous PCO2',
           'Venous PO2', 'Venous pH', 'WBC', 'WBC count (CSF)',
           'WBC count (Fluid)', 'cHCO3']


    rfm=RandomForestClassifier(n_estimators=100,
                               max_depth=4)
    lrm=LogisticRegression(solver='lbfgs')


    #ExperimentI(time_series_clustered_demographics)
    #ExperimentII(time_series_clustered_demographics_not_old)
    #ExperimentIII(time_series_clustered_baseline)
    #ExperimentIV(time_series_clustered_twodays)

    dynamic_features = ['Hour', 'ALT', 'Albumin', 'Blood Lactate', 'C-Reactive-Protein',
                        'Creatinine', 'D-Dimer',
                        'DiasBP', 'Estimated-GFR', 'Ferritin', 'FiO2', 'GCSMotor', 'GCSVerbal',
                        'Hb', 'HeartRate', 'INR',
                        'Lymphocytes', 'NEWS2',
                        'Neutrophils', 'OxygenLitres', 'OxygenSaturation',
                        'PCO2', 'PCV', 'PH', 'PLT', 'PO2', 'PO2/FIO2', 'PainScore',
                        'SupplementalOxygen', 'SysBP', 'Temperature', 'Troponin-T',
                        'Urea', 'WBC', 'cHCO3']

    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(time_series[dynamic_features])
    time_series[dynamic_features] = imp.transform(time_series[dynamic_features])

    time_series = scale(time_series, dynamic_features)
    ExperimentV(time_series)
Example #30
 def impute_model_approach(self, dataset, col):
     imp_mean = IterativeImputer(random_state=0, verbose=1, max_iter=100)
     for i in dataset.columns:
         if i == col:
             continue
         dataset = self.impute_interpolate(dataset, i)
     print(dataset.shape)
     imp_mean.fit(dataset)
     print(dataset.columns)
     X = imp_mean.transform(dataset)
     X = pd.DataFrame(X, index=dataset.index, columns=dataset.columns)
     return X
Example #31
def TrainBDT(featureNames, trainingData, classificationArray):
    clf = ensemble.HistGradientBoostingClassifier()
    trainingData = trainingData[featureNames]  # Remove all irrelevant columns
    if cfg.balanceClasses:
        imp = IterativeImputer()
        imp.fit(trainingData)
        trainingData = imp.transform(trainingData)
        sm = smt(sampling_strategy=1)
        trainingData, classificationArray = sm.fit_sample(
            trainingData, classificationArray)
    clfView = clf.fit(trainingData, classificationArray)
    return clfView
Example #32
 def impute_data(self, selected_attributes):
     """
     X: which features to use to interpolate missing values
     y: which features to replace missing values
     """
     imp = IterativeImputer(max_iter=100, random_state=0)
     X = self.altered_dataframe[self.feature_list]
     y = self.altered_dataframe[selected_attributes]
     imp.fit(X, y)
     # transform the same feature matrix that the imputer was fitted on
     self.altered_dataframe = pd.DataFrame(data=imp.transform(X),
                                           columns=self.feature_list)
Example #33
def imputeCVData(class_label,instance_label,categorical_variables,data_train,data_test,random_state,header):
    # Begin by imputing categorical variables with simple 'mode' imputation
    mode_dict = {}
    for c in data_train.columns:
        if c in categorical_variables:
            train_mode = data_train[c].mode().iloc[0]
            data_train[c].fillna(train_mode, inplace=True)
            mode_dict[c] = train_mode
    for c in data_test.columns:
        if c in categorical_variables:
            data_test[c].fillna(mode_dict[c], inplace=True)

    # Now impute remaining ordinal variables
    if instance_label is None or instance_label == 'None':
        x_train = data_train.drop([class_label], axis=1).values
        x_test = data_test.drop([class_label], axis=1).values
    else:
        x_train = data_train.drop([class_label, instance_label], axis=1).values
        x_test = data_test.drop([class_label, instance_label], axis=1).values

        inst_train = data_train[instance_label].values  # pull out instance labels in case they include text
        inst_test = data_test[instance_label].values

    y_train = data_train[class_label].values
    y_test = data_test[class_label].values

    # Impute features (x)
    imputer = IterativeImputer(random_state=random_state,max_iter=30).fit(x_train)
    x_new_train = imputer.transform(x_train)
    x_new_test = imputer.transform(x_test)

    # Recombine x and y
    if instance_label is None or instance_label == 'None':
        data_train = pd.concat([pd.DataFrame(y_train, columns=[class_label]), pd.DataFrame(x_new_train, columns=header)],axis=1, sort=False)
        data_test = pd.concat([pd.DataFrame(y_test, columns=[class_label]), pd.DataFrame(x_new_test, columns=header)], axis=1, sort=False)
    else:
        data_train = pd.concat([pd.DataFrame(y_train, columns=[class_label]), pd.DataFrame(inst_train, columns=[instance_label]),pd.DataFrame(x_new_train, columns=header)], axis=1, sort=False)
        data_test = pd.concat([pd.DataFrame(y_test, columns=[class_label]), pd.DataFrame(inst_test, columns=[instance_label]), pd.DataFrame(x_new_test, columns=header)], axis=1, sort=False)

    return data_train,data_test,imputer,mode_dict
Example #34
def iter_inputer(est,
                 X_train,
                 X_test,
                 est_label,
                 y_train=None,
                 y_test=None,
                 max_iter=10):
    """
    iterative imputer with est
    e.g.    est=RandomForestClassifier(n_estimators=n_estimators, n_jobs=1, max_depth=4)
    
    """

    imp = IterativeImputer(estimator=est, max_iter=max_iter)

    if y_train is not None:
        X_train['Y'] = y_train
        X_test['Y'] = y_test

    imp.fit(X_train)

    IDENT = '%s_Miter%d' % (est_label, max_iter)

    joblib.dump(imp, 'iter_imp_%s.joblib' % IDENT)

    X_train_it_imp = pd.DataFrame(imp.transform(X_train),
                                  columns=X_train.columns,
                                  index=X_train.index)
    X_test_it_imp = pd.DataFrame(imp.transform(X_test),
                                 columns=X_test.columns,
                                 index=X_test.index)

    if y_train is not None:
        X_train_it_imp = X_train_it_imp.drop(columns=['Y'])
        X_test_it_imp = X_test_it_imp.drop(columns=['Y'])

    X_train_it_imp.to_pickle('X_train_ii_%s.pkl' % IDENT)
    X_test_it_imp.to_pickle('X_test_ii_%s.pkl' % IDENT)

    return (imp, X_train_it_imp, X_test_it_imp)
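
A hypothetical usage sketch for the iter_inputer function above, on synthetic frames (all names and data here are illustrative; the function assumes pandas, joblib and the IterativeImputer imports are available at module level, and it writes a joblib dump plus two pickle files to the working directory):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X_train = pd.DataFrame(rng.rand(100, 4), columns=list("abcd"))
X_test = pd.DataFrame(rng.rand(50, 4), columns=list("abcd"))
X_train[X_train < 0.05] = np.nan  # knock out roughly 5% of the entries
X_test[X_test < 0.05] = np.nan

est = RandomForestRegressor(n_estimators=10, n_jobs=1, max_depth=4)
imp, X_train_imp, X_test_imp = iter_inputer(est, X_train, X_test, est_label='rf')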
Example #35
def test_iterative_imputer_zero_iters():
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    missing_flag = X == 0
    X[missing_flag] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    assert not np.all(imputer.transform(X) ==
                      imputer.initial_imputer_.transform(X))

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
Example #36
def test_iterative_imputer_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(imputer.transform(X_test)[:, 0],
                    initial_imputer.transform(X_test)[:, 0])
Example #37
def test_iterative_imputer_transform_recovery(rank):
    rng = np.random.RandomState(0)
    n = 100
    d = 100
    A = rng.rand(n, rank)
    B = rng.rand(rank, d)
    X_filled = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data in half
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.1)