Example #1
def test_iterative_imputer_imputation_order(imputation_order):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    max_iter = 2
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by IterativeImputer

    imputer = IterativeImputer(missing_values=0,
                               max_iter=max_iter,
                               n_nearest_features=5,
                               sample_posterior=False,
                               min_value=0,
                               max_value=1,
                               verbose=1,
                               imputation_order=imputation_order,
                               random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]

    assert (len(ordered_idx) // imputer.n_iter_ ==
            imputer.n_features_with_missing_)

    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d-1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d-1]
        ordered_idx_round_2 = ordered_idx[d-1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == max_iter * (d - 1)
Example #2
def test_iterative_imputer_all_missing():
    n = 100
    d = 3
    X = np.zeros((n, d))
    imputer = IterativeImputer(missing_values=0, max_iter=1)
    X_imputed = imputer.fit_transform(X)
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))
Example #3
def test_iterative_imputer_truncated_normal_posterior():
    #  test that the values that are imputed using `sample_posterior=True`
    #  with boundaries (`min_value` and `max_value` are not None) are drawn
    #  from a distribution that looks Gaussian via the Kolmogorov-Smirnov test.
    #  note that starting from the wrong random seed will make this test fail
    #  because random sampling doesn't occur at all when the imputation
    #  is outside of the (min_value, max_value) range
    pytest.importorskip("scipy", minversion="0.17.0")
    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)

    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    # guard against a zero standard deviation before standardizing
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject null hypothesis
    # null hypothesis: distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does not appear to be normal"
Example #4
def test_iterative_imputer_additive_matrix():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            X_filled[:, (i+j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01)
Example #5
def test_iterative_imputer_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan
    m1 = IterativeImputer(max_iter=10, random_state=rng)
    m2 = IterativeImputer(max_iter=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # should exclude the first column entirely
    assert_allclose(X[:, 1:], pred1)
    # fit and fit_transform should both be identical
    assert_allclose(pred1, pred2)
Example #6
def test_iterative_imputer_verbose():
    rng = np.random.RandomState(0)

    n = 100
    d = 3
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1)
    imputer.fit(X)
    imputer.transform(X)
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2)
    imputer.fit(X)
    imputer.transform(X)
Example #7
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', "constant"]:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        iterative_imputer = IterativeImputer(initial_strategy=strategy)
        X_imputed = iterative_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
Example #8
def test_iterative_imputer_rank_one():
    rng = np.random.RandomState(0)
    d = 100
    A = rng.rand(d, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(d, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=5,
                               verbose=1,
                               random_state=rng)
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.01)
Example #9
def test_iterative_imputer_clip():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10,
                             random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=0.1,
                               max_value=0.2,
                               random_state=rng)

    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Example #10
def test_iterative_imputer_early_stopping():
    rng = np.random.RandomState(0)
    n = 50
    d = 5
    A = rng.rand(n, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=100,
                               tol=1e-3,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
Example #11
def test_iterative_imputer_clip_truncnorm():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1

    imputer = IterativeImputer(missing_values=0,
                               max_iter=2,
                               n_nearest_features=5,
                               sample_posterior=True,
                               min_value=0.1,
                               max_value=0.2,
                               verbose=1,
                               imputation_order='random',
                               random_state=rng)
    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
Example #12
def test_iterative_imputer_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(imputer.transform(X_test)[:, 0],
                    initial_imputer.transform(X_test)[:, 0])
Example #13
def test_iterative_imputer_transform_recovery(rank):
    rng = np.random.RandomState(0)
    n = 100
    d = 100
    A = rng.rand(n, rank)
    B = rng.rand(rank, d)
    X_filled = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data in half
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.1)
Example #14
def test_iterative_imputer_estimators(estimator):
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               estimator=estimator,
                               random_state=rng)
    imputer.fit_transform(X)

    # check that types are correct for estimators
    hashes = []
    for triplet in imputer.imputation_sequence_:
        expected_type = (type(estimator) if estimator is not None
                         else type(BayesianRidge()))
        assert isinstance(triplet.estimator, expected_type)
        hashes.append(id(triplet.estimator))

    # check that each estimator is unique
    assert len(set(hashes)) == len(hashes)
Example #15
def test_iterative_imputer_zero_iters():
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    missing_flag = X == 0
    X[missing_flag] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    assert not np.all(imputer.transform(X) ==
                      imputer.initial_imputer_.transform(X))

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
Example #16
def test_iterative_imputer_verbose():
    rng = np.random.RandomState(0)

    n = 100
    d = 3
    X = _sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1)
    imputer.fit(X)
    imputer.transform(X)
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2)
    imputer.fit(X)
    imputer.transform(X)
def model_pipeline(myestimator,mydata,myfolds,feature_selection_done = False,myfeatures = None,checknoise = False):
    """
    If feature _selection has not been performed:
        Function performs Cross Validation (with scaling within folds) on the data passed through. 
        Scales the data with RobustScaler() and Imputes the data with IterativeImputer(). Additionally adds clusters for the cities latitude and longitude
    Else:
        Performs Cross-Validation given the estimator on a subset of the features of mydata which were passed through to myfeatures
        
    Arguments
        @myestimator: sklearn estimator
        @mydata: training data with  missing values and is not scaled)
        @myfolds: number of folds for cross validation 
        @feature_selection_done: Boolean flag indicating if feature_selection has been done to the data in `mydata`
        @myfeatures: list of informative features from features
        @checknoise: Whether scoring for Cross-Validation should be Explained Variance 
    
    """
    # part 1: create location features using OPTICS clustering
    optics_df = mydata[['Latitude','Longitude']].copy()
    clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    clust.fit(optics_df)
    #
    optics_df['clust_label'] = clust.labels_
    #
    location_max = np.max(optics_df.clust_label.unique())
    # OPTICS labels noisy samples as -1; replace them so one-hot encoding succeeds
    optics_df['clust_label'].replace([-1],location_max+1,inplace=True)
    # one-hot encode the cluster labels and join them onto mydata
    enc = OneHotEncoder(categories='auto')
    
    optics_df_1hot = enc.fit_transform(optics_df[['clust_label']])

    location_labels = ['cluster' + str(l) for l in optics_df.clust_label.unique()]
    
    optics_df_1hot = pd.DataFrame(optics_df_1hot.todense(),index = optics_df.index,columns= location_labels )
    # part 1 done: cluster columns added

    mydata = pd.concat([mydata, optics_df_1hot], axis=1)

    # part 2: drop unnecessary columns
    
    mydata_labels = mydata['med_rental_rate'].copy()
    mydata = mydata.drop('med_rental_rate',axis =1)
    
    if feature_selection_done:
        mydata = mydata.loc[:,myfeatures].copy()
    else: 
        mydata = mydata.drop(['city','Latitude','Longitude','change_hunits','studio_1000_1499', 'studio_1500_more',
       'studio_750_999', 'onebed_1000_1499', 'onebed_1500_more',
       'onebed_750_999', 'twobed_1000_1499', 'twobed_1500_more',
       'twobed_750_999', 'threebed_1000_1499', 'threebed_1500_more',
       'threebed_750_999'],axis=1)
   
    # part 2 done

    # part 3: perform cross-validation, scaling and imputing within each fold
    
    skfolds = KFold(n_splits = myfolds,random_state=22,shuffle=True)
    results = []

    mydata = (np.array(mydata))
    mydata_labels = (np.array(mydata_labels))
    for train_index, test_index in skfolds.split(mydata,mydata_labels):
        clone_est = clone(myestimator)
        X_train_folds = mydata[train_index]
        y_train_folds = mydata_labels[train_index]
        X_test_fold = mydata[test_index]
        y_test_fold = mydata_labels[test_index]
        
        #impute
        imputer = IterativeImputer(max_iter=10, random_state=22, min_value=0)
        X_train_folds = imputer.fit_transform(X_train_folds)
        # scale only the numerical attributes, i.e. everything except the cluster columns appended earlier
        num_attrbs = mydata.shape[1]-len(location_labels)
        ct_columns = list(range(num_attrbs))
        
        
        
        ct = ColumnTransformer(
            [('scale1',RobustScaler(),ct_columns)],
            remainder = 'passthrough')
        
        X_train_folds = ct.fit_transform(X_train_folds)
        
        
        clone_est.fit(X_train_folds,y_train_folds)
        # transform (do not fit) X_test_fold before predicting
        X_test_fold = imputer.transform(X_test_fold)
        X_test_fold = ct.transform(X_test_fold)
        y_pred = clone_est.predict(X_test_fold)
        if checknoise:
            fold_expvar = explained_variance_score(y_test_fold,y_pred)
            results.append(fold_expvar)
        else:
            fold_mse = mean_squared_error(y_test_fold,y_pred)
            results.append(fold_mse)
    if checknoise:
        scores = (np.array([results]))
    else:
        scores = np.sqrt(np.array([results]))
    
    print('Scores',scores)
    print('Mean',scores.mean())
    print('Standard Deviation',scores.std())
    
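A minimal usage sketch for model_pipeline (not part of the original source). Assumptions: a pandas DataFrame named rentals_df containing 'Latitude', 'Longitude', 'med_rental_rate' and the other columns handled above, the sklearn imports used inside the function already in scope, and purely illustrative feature names in `selected`.

from sklearn.ensemble import RandomForestRegressor

# rentals_df = pd.read_csv('rental_data.csv')  # hypothetical input file

# full-feature run: the function prints the per-fold RMSE, its mean and std
model_pipeline(RandomForestRegressor(random_state=22), rentals_df, myfolds=5)

# run on a previously selected feature subset, scored by explained variance
selected = ['population', 'med_income', 'cluster0']  # hypothetical feature names
model_pipeline(RandomForestRegressor(random_state=22), rentals_df, myfolds=5,
               feature_selection_done=True, myfeatures=selected,
               checknoise=True)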

    
Example #18
import pytest

import numpy as np
from scipy import sparse

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal

from sklearn.experimental import enable_iterative_imputer  # noqa

from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

IMPUTERS = [IterativeImputer(), KNNImputer(), SimpleImputer()]
SPARSE_IMPUTERS = [SimpleImputer()]


# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", IMPUTERS)
def test_imputation_missing_value_in_test_array(imputer):
    # [Non Regression Test for issue #13968] Missing value in test set should
    # not throw an error and return a finite dataset
    train = [[1], [2]]
    test = [[3], [np.nan]]
    imputer.set_params(add_indicator=True)
    imputer.fit(train).transform(test)

Example #19
Y = df['Style'].values
lb = LabelEncoder()
Y = lb.fit_transform(Y)
dataFile = df.drop(['Style'], axis=1)
X = dataFile.values
xTrain, xTest, yTrain, yTest = train_test_split(X,
                                                Y,
                                                test_size=0.2,
                                                random_state=42)

scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)

imp = IterativeImputer(max_iter=10, random_state=0)
# fit the imputer on the scaled training data only, so that the scale matches
# the data being transformed and no information leaks from the test set
imp.fit(xTrain)

xTrain = pd.DataFrame(imp.transform(xTrain))
xTest = pd.DataFrame(imp.transform(xTest))

randTree = RandomForestClassifier(n_estimators=100,
                                  max_depth=7,
                                  random_state=42,
                                  warm_start=True)
randTree.fit(xTrain, yTrain)
yPred = randTree.predict(xTest)
accuracy = accuracy_score(yPred, yTest)
print(accuracy)

testFile = pd.read_csv('beers_test_nostyle.csv')
Example #20
    def get_imputer(self):
        """Create the imputer for missing data."""
        imp = IterativeImputer(random_state=self.random_state)
        return imp
Example #21
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

le = preprocessing.LabelEncoder()

finaldata = pd.read_csv('train.csv')

finaldata = finaldata.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'])
finaldata['Sex'] = le.fit_transform(finaldata['Sex'])

imp = IterativeImputer()

# Separate the target from the other variables
Y_completed = finaldata['Survived']

# Fit the imputer that will fill in the missing data
imp.fit(finaldata.drop(columns='Survived'))

# Fill in the missing data
X_completed = imp.transform(finaldata.drop(columns='Survived'))

# Normalize the data
X_completed = Normalizer().fit_transform(X_completed)

# Split the sample into training and test sets
X_trainCompleted, X_testCompleted, Y_trainCompleted, Y_testCompleted = train_test_split(
        missing_values_per_row.append(x)

    missing_values_per_row = np.array(missing_values_per_row)
    missing_values_in_rows = missing_values_per_row.sum()
    percentage_of_missing_values = (missing_values_in_rows/len(dataframe))*100

    # output results of missing values amount before imputation
    print('missing values:',missing_values_total)
    print('missing values per row:',missing_values_in_rows)
    print('missing values per row percentage', percentage_of_missing_values, '%')

    
# imputation of the missing values with median, iterative, and KNN imputers.

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imp_iterative = IterativeImputer(max_iter=10, random_state=42)
knn_1 = KNNImputer(n_neighbors=1)
knn_3 = KNNImputer(n_neighbors=3)
knn_5 = KNNImputer(n_neighbors=5)

df_scaled_imp_median = imp_median.fit_transform(df_scaled)
df_scaled_imp_iterative = imp_iterative.fit_transform(df_scaled)
df_scaled_knn1 = knn_1.fit_transform(df_scaled)
df_scaled_knn3 = knn_3.fit_transform(df_scaled)
df_scaled_knn5 = knn_5.fit_transform(df_scaled)

df_scaled_imp_median = pd.DataFrame(data=df_scaled_imp_median, columns=df.columns.tolist())
df_scaled_imp_iterative = pd.DataFrame(data=df_scaled_imp_iterative, columns=df.columns.tolist())
df_scaled_knn1 = pd.DataFrame(data=df_scaled_knn1, columns=df.columns.tolist())
df_scaled_knn3 = pd.DataFrame(data=df_scaled_knn3, columns=df.columns.tolist())
df_scaled_knn5 = pd.DataFrame(data=df_scaled_knn5, columns=df.columns.tolist())
Example #23
    def _get_imputer(self):
        return IterativeImputer(max_iter=10)
def load_both_data(project, metric):
    understand_path = 'data/package_level/understand_files_all/' + project + '_understand.csv'
    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis=1, how='all')
    cols_list = understand_df.columns.values.tolist()
    for item in ['Kind', 'Name', 'commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0, item)
    understand_df = understand_df[cols_list]
    cols = understand_df.columns.tolist()
    understand_df = understand_df.drop_duplicates(cols[4:len(cols)])

    commit_guru_file_level_path = 'data/package_level/commit_guru_file/' + project + '.csv'
    commit_guru_file_level_df = pd.read_csv(commit_guru_file_level_path)
    commit_guru_file_level_df[
        'commit_hash'] = commit_guru_file_level_df.commit_hash.str.strip('"')

    df = understand_df.merge(commit_guru_file_level_df,
                             how='left',
                             on=['commit_hash', 'Name'])

    cols = df.columns.tolist()
    cols.remove('Bugs')
    cols.append('Bugs')
    df = df[cols]

    for item in ['Kind', 'Name', 'commit_hash']:
        if item in cols:
            df = df.drop(labels=[item], axis=1)
    df = df.drop_duplicates()
    df.reset_index(drop=True, inplace=True)

    y = df.Bugs
    X = df.drop('Bugs', axis=1)
    cols = X.columns
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)
    imp_mean = IterativeImputer(random_state=0)
    X = imp_mean.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)

    if metric == 'process':
        X = X[[
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ]]
    elif metric == 'product':
        X = X.drop([
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ],
                   axis=1)
    else:
        X = X
    return X, y
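A short usage sketch for load_both_data (an illustration, not from the original repository): 'camel' is a hypothetical project name for which the expected CSVs exist under data/package_level/.

X_process, y_process = load_both_data('camel', 'process')  # process metrics only
X_product, y_product = load_both_data('camel', 'product')  # product metrics only
X_all, y_all = load_both_data('camel', 'all')              # any other value keeps every column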
Example #25
ascvd_est = pd.read_csv('../Data/cohort/' + datafile)
#%%
train_est2, test_est2 = split_cohort(ascvd_est, to_exclude, test_ind_col, drop = 'all')
test_set_data = pd.get_dummies(test_est2, columns = [c for c in test_est2.columns if test_est2[c].dtype=='O'])
train_set_data = pd.get_dummies(train_est2, columns = [c for c in train_est2.columns if train_est2[c].dtype=='O'])
train_set_features = train_set_data[[f for f in train_set_data.columns if f != label]]
test_set_features = test_set_data[[f for f in test_set_data.columns if f != label]]
train_set_labels = train_est2[label]
test_set_labels = test_est2[label]
train_est2 = test_est2 = ascvd_est = None
imp = IterativeImputer(add_indicator=False,
                       estimator=None,
                       imputation_order='ascending',
                       initial_strategy='mean',
                       max_iter=50, max_value=None,
                       min_value=None,
                       missing_values=np.nan,
                       n_nearest_features=10,
                       random_state=None,
                       sample_posterior=False,
                       tol=0.001, verbose=0)
imp.fit(train_set_features)
train_set_imp_features = imp.transform(train_set_features)
train_set_imp_features = pd.DataFrame(train_set_imp_features, columns = train_set_features.columns)
test_set_imp_features = imp.transform(test_set_features)
test_set_imp_features = pd.DataFrame(test_set_imp_features, columns = test_set_features.columns)
train_set_features = test_set_features = None
#%%
#fl2 = [[fl[0]] for fl in feat_list if 'race' not in fl[0]]
#
#fl2.append(['race'])
Example #26
def FillNA(df, method: str = 'ffill', window: int = 10):
    """Fill NA values using different methods.

    Args:
        method (str):
            'ffill' - fill most recent non-na value forward until another non-na value is reached
            'zero' - fill with zero. Useful for sales and other data where NA does usually mean $0.
            'mean' - fill all missing values with the series' overall average value
            'median' - fill all missing values with the series' overall median value
            'rolling mean' - fill with the rolling mean of the last n (window) values
            'ffill mean biased' - simple average of ffill and mean
            'fake date' - shifts data forward over NaNs, so values will have incorrect timestamps
            also accepts most `method` values of pd.DataFrame.interpolate()
        window (int): length of the rolling window for filling NaNs, for rolling methods
    """
    method = str(method).replace(" ", "_")

    if method == 'zero':
        return fill_zero(df)

    elif method == 'ffill':
        return fill_forward(df)

    elif method == 'mean':
        return fill_mean(df)

    elif method == 'median':
        return fill_median(df)

    elif method == 'rolling_mean':
        return rolling_mean(df, window=window)

    elif method == 'rolling_mean_24':
        return rolling_mean(df, window=24)

    elif method == 'ffill_mean_biased':
        return biased_ffill(df)

    elif method == 'fake_date':
        return fake_date_fill(df, back_method='slice')

    elif method in df_interpolate_full:
        df = df.interpolate(method=method, order=5).fillna(method='bfill')
        if df.isnull().values.any():
            df = fill_forward(df)
        return df

    elif method == 'IterativeImputer':
        cols = df.columns
        indx = df.index
        try:
            from sklearn.experimental import enable_iterative_imputer  # noqa
        except Exception:
            pass
        from sklearn.impute import IterativeImputer

        df = IterativeImputer(random_state=0, max_iter=100).fit_transform(df)
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df)
            df.index = indx
            df.columns = cols
        return df

    elif method == 'IterativeImputerExtraTrees':
        cols = df.columns
        indx = df.index
        try:
            from sklearn.experimental import enable_iterative_imputer  # noqa
        except Exception:
            pass
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.impute import IterativeImputer

        df = IterativeImputer(
            ExtraTreesRegressor(n_estimators=10, random_state=0),
            random_state=0,
            max_iter=100,
        ).fit_transform(df)
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df)
            df.index = indx
            df.columns = cols
        return df

    elif method == 'KNNImputer':
        cols = df.columns
        indx = df.index
        from sklearn.impute import KNNImputer

        df = KNNImputer(n_neighbors=5).fit_transform(df)
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df)
            df.index = indx
            df.columns = cols
        return df

    elif method is None or method == 'None':
        return df

    else:
        print(f"FillNA method `{str(method)}` not known, returning original")
        return df
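A hedged usage sketch for FillNA (not from the original module). It assumes a small pandas DataFrame with NaNs and that the helper fillers referenced above (fill_forward, fill_mean, ...) are importable alongside FillNA.

import numpy as np
import pandas as pd

df_demand = pd.DataFrame(
    {"store_a": [10.0, np.nan, 12.0, np.nan, 15.0],
     "store_b": [1.0, 2.0, np.nan, 4.0, 5.0]},
    index=pd.date_range("2021-01-01", periods=5, freq="D"),
)

filled_ffill = FillNA(df_demand, method="ffill")            # forward fill
filled_mice = FillNA(df_demand, method="IterativeImputer")  # multivariate imputation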
Example #27
def iter(Dataset):
    it = IterativeImputer(random_state=0, initial_strategy='median')
    it = it.fit_transform(Dataset)
    return it
    {'random_state': 10})

print("Performance for best hyperparameters:")

y_train_best = rf_mean_imputed.predict_proba(X_train_mean_imputed)[:, 1]
print(f"- Train C-Index: {cindex(y_train, y_train_best):.4f}")

y_val_best = rf_mean_imputed.predict_proba(X_val_mean_imputed)[:, 1]
print(f"- Val C-Index: {cindex(y_val, y_val_best):.4f}")

y_test_imp = rf_mean_imputed.predict_proba(X_test)[:, 1]
print(f"- Test C-Index: {cindex(y_test, y_test_imp):.4f}")

# Impute using regression on other covariates
imputer = IterativeImputer(random_state=0,
                           sample_posterior=False,
                           max_iter=1,
                           min_value=0)  # another imputation strategy
# train the imputer to predict missing values from the observed values of all other features
imputer.fit(X_train)
X_train_imputed = pd.DataFrame(imputer.transform(X_train),
                               columns=X_train.columns)
X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

# Perform a hyperparameter grid search to find the best-performing random forest model, and report results on the test set.
# Define ranges for the random forest hyperparameter grid search
hyperparams = {

    # how many trees should be in the forest (int)
    'n_estimators': [100],
        y_missing,
        scoring='neg_mean_squared_error',
        cv=N_SPLITS)

# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]
score_iterative_imputer = pd.DataFrame()
for estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=estimator), br_estimator)
    score_iterative_imputer[estimator.__class__.__name__] = \
        cross_val_score(
            estimator, X_missing, y_missing, scoring='neg_mean_squared_error',
            cv=N_SPLITS
        )

scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=['Original', 'SimpleImputer', 'IterativeImputer'],
    axis=1)

# plot Boston housing results
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()
errors = scores.std()
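A plausible completion of the plotting code (the example is truncated here, so this is an assumption): draw the mean MSE per strategy as horizontal bars with the standard deviations as error bars.

means.plot.barh(xerr=errors, ax=ax)
ax.set_title('Cross-validated MSE with different imputation methods')
ax.set_xlabel('MSE (smaller is better)')
plt.tight_layout(pad=1)
plt.show()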
Example #30
def main():
    eval_mode = True
    n_train = 1000
    n_test = 1000
    RFnan = False
    pca = True

    start = time()
    # LOAD DATA
    data = shuffle(pd.read_csv('data.csv'),
                   random_state=seed)[:n_train + n_test]
    y = data['Label']
    y = np.where(y == 's', 1, 0)
    x = data.drop(columns=['Label', "KaggleSet", "KaggleWeight", "EventId"])
    weights = data['Weight'].values
    x = x.drop(columns=['Weight'])
    x = x.replace(-999, np.nan)

    # SPLIT
    X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
        x, y, weights, random_state=seed, test_size=n_test)

    # PREPROCESS
    transformers = []
    cols_log = [
        "DER_mass_MMC", "DER_mass_transverse_met_lep", "DER_mass_vis",
        "DER_pt_h", "DER_pt_ratio_lep_tau", "DER_pt_tot", "DER_sum_pt",
        "PRI_jet_all_pt", "PRI_lep_pt", "PRI_met", "PRI_met_sumet",
        "PRI_tau_pt"
    ]
    transformers.append(
        make_column_transformer((Shift_log(), cols_log),
                                remainder="passthrough"))
    if RFnan:
        transformers.append(StandardScaler())
        transformers.append(
            SimpleImputer(missing_values=np.nan, fill_value=-999999.0))
    else:
        transformers.append(IterativeImputer(max_iter=int(1e2)))
        transformers.append(StandardScaler())
        if pca:
            print("Using PCA")
            transformers.append(PCA(20))

    for trans in transformers:
        X_train = trans.fit_transform(X_train)
        X_test = trans.transform(X_test)

    if not eval_mode:
        if RFnan:
            results = grid_search_rf((X_train, y_train), weights_train)
            print("RF nan : \n\t{}".format(results))
            print("\tBest results : \n\t{}".format(
                results.ix[results['average'].idxmax()]))
        else:
            results_bagging, results_boosting = grid_search((X_train, y_train),
                                                            weights_train)
            results_rf = grid_search_rf((X_train, y_train), weights_train)
            print("Bagging : \n\t{}".format(
                results_bagging.ix[results_bagging['average'].idxmax()]))
            print("Boosting : \n\t{}".format(
                results_boosting.ix[results_boosting['average'].idxmax()]))
            print("RF : \n\t{}".format(
                results_rf.ix[results_rf['average'].idxmax()]))
    else:
        if RFnan:
            rf_nan = RandomForestClassifier(n_estimators=2000, max_depth=None)
            average, std = eval_best((X_test, y_test), weights_test, rf_nan)
            print("RFNan : %.4f +/- %.4f" % (average, std))
        else:
            clfs = [
                RandomForestClassifier(n_estimators=2000, max_depth=50),
                BaggingClassifier(Perceptron(max_iter=1000),
                                  max_samples=0.5,
                                  max_features=0.5,
                                  n_estimators=1000),
                AdaBoostClassifier(n_estimators=50),
            ]

            for clf in clfs:
                average, std = eval_best((X_test, y_test), weights_test, clf)
                print(clf)
                print("%.4f +/- %.4f" % (average, std))
    print("Total time : {}".format(time() - start))
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

#Count the number of 1 and 0 class labels in the train and test sets
from collections import Counter
z, y = train_labels, test_labels
Counter(y)
Counter(z)
# =============================================================================
# #Imputation 
# =============================================================================
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
imp = IterativeImputer(random_state=0, max_iter = 100, imputation_order='random')
imp.fit(train_features)
train_features = imp.transform(train_features)
test_features = imp.transform(test_features)

# You can check the outcomes of imputation by executing the lines below
# Please change the path accordingly
#train_features.to_csv(r'T:\tbase\short\train_feature_imputation.csv')
#test_features.to_csv(r'T:\tbase\short\test_feature_imputation.csv')

# =============================================================================
# #Keep TransplantationID in test data for error analysis
# =============================================================================
test_features = pd.DataFrame(test_features, columns=feature_list)
train_features = pd.DataFrame(train_features, columns=feature_list)
#!/usr/bin/env python
# coding: utf-8
# Copyright 2019 Yuhang Lin

import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from base import impute_df

data_folder = './train_data'
estimators = [
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=30, random_state=0),
    KNeighborsRegressor(n_neighbors=5)
]
max_iter = 10

names = ['decisiontree', 'extratrees', 'knn']
for i in range(len(estimators)):
    name = names[i]
    estimator = estimators[i]
    output_folder = "./output/iterative_imputer_{}_iter{}".format(
        name, max_iter)
    imputer = IterativeImputer(max_iter=max_iter,
                               random_state=0,
                               estimator=estimator)
    impute_df(imputer, output_folder, data_folder)
Example #33
def test_iterative_imputer_transform_stochasticity():
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n = 100
    d = 10
    X = _sparse_random_matrix(n, d, density=0.10, random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               sample_posterior=True,
                               random_state=rng1)
    imputer.fit(X)

    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    imputer1 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng1)

    imputer2 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng2)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)
Example #34
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    X = np.zeros((100, 2))
    imputer = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        imputer.fit_transform(X)
dat_org = pd.concat([dat_org, label], axis=1)
dat_org.rename(columns={True: 'label'}, inplace=True)

df1 = dat_org.copy()

# separate independent and dependent variables
X = df1.drop('label', axis=1)
y = df1['label']
# split the dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    shuffle=True)

# fill in the missing values using the iterative imputer
imp = IterativeImputer(n_nearest_features=15, max_iter=10, random_state=0)
X_train = imp.fit_transform(X_train)
# transform the test set with the imputer fitted on the training set to avoid leakage
X_test = imp.transform(X_test)

# convert back to dataframe
X_train = pd.DataFrame(X_train, columns=df1.columns[:-1])
X_test = pd.DataFrame(X_test, columns=df1.columns[:-1])

# set these columns aside and add them back after scaling the rest of the dataframe
misc_xtrain = X_train[[
    'tx_revision_len', 'id_infra_nod', 'dt_submit_date', 'State_Rollback',
    'State_Skipped', 'State_Unknown', 'tm_submit_time'
]]
misc_xtest = X_test[[
    'tx_revision_len', 'id_infra_nod', 'dt_submit_date', 'State_Rollback',
    'State_Skipped', 'State_Unknown', 'tm_submit_time'
Example #36
        'Tot_population']
    checker1 = pd.merge(checker1, pd.DataFrame(data=turnouts_r))
    checker_coeff1 = np.mean(
        (checker1['turnout_pred'] - checker1['turnout'])**2)
    return checker_coeff1


turnouts_r = {
    'year': [2000.0, 2004.0, 2008.0, 2012.0, 2016.0],
    'turnout': [51.2, 56.7, 58.2, 54.9, 55.7]
}

coeffs = []

for estim in estimators:
    Imputer = IterativeImputer(estimator=estim)

    dane_ssd_imp1 = pd.DataFrame(
        data=Imputer.fit_transform(dane_ssd.drop(columns_out, axis=1)),
        columns=dane_ssd.drop(columns_out, axis=1).columns.tolist())

    checker_coeff = checker(dane_ssd_imp1)

    coeffs.append(checker_coeff)

coeffs_comp = pd.DataFrame({
    'Estimators':
    ['Bayesian Ridge', 'Decision Tree', 'Extra Trees', 'KNNeighbors'],
    'Turnout Coeff.':
    coeffs
# 1.4) Missing values -> Exotic techniques
################################################################################
# The remaining missing values will be imputed via IterativeImputer, which
# models each feature with missing values as a function of the other features
# and uses that estimate for imputation.

X_train = train.drop(columns=Categorical, axis=1)
X_train.drop(columns='TARGET', axis=1, inplace=True)
X_test = test.drop(columns=Categorical, axis=1)

# Impute
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

filler = IterativeImputer()
X_train_filled = filler.fit_transform(X_train)
X_test_filled = filler.transform(X_test)

X_train_filled = pd.DataFrame(X_train_filled, columns=list(X_train))
X_test_filled = pd.DataFrame(X_test_filled, columns=list(X_test))

train = pd.concat([train[Categorical], X_train_filled, train['TARGET']],
                  axis=1)
test = pd.concat([test[Categorical], X_test_filled], axis=1)

# Final check:
miss(train, 1)
miss(test, 1)

# # If we need to standardize data:
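# A hedged sketch of that optional standardization step (an assumption, not part
# of the original notebook): fit a StandardScaler on the imputed training
# columns only, then apply the same transform to the test set.
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# num_cols = list(X_train_filled.columns)
# train[num_cols] = scaler.fit_transform(train[num_cols])
# test[num_cols] = scaler.transform(test[num_cols])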
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 23 12:40:14 2019
@author: [email protected]

This script explores multivariate imputation for incomplete
machine-learning input data.
"""

import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# incomplete input data
input_data = [[1, 2, 3], [3, 4, np.nan], [5, 6, 7], [8, 9, np.nan]]

# create imputer object
imp = IterativeImputer(max_iter=100, random_state=0)
# fit imputer to the input data
imp.fit(input_data)

#X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]

full_input_data = imp.transform(input_data)
print(full_input_data)
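The fitted imputer can also fill new incomplete rows, provided they have the same three columns as the training data (a made-up example, not from the original script):

X_new = [[2, np.nan, 4], [7, 8, np.nan]]
print(imp.transform(X_new))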
Example #39
    def fit(self, X, y=None):
        """
        Fit the imputers.

        Parameters
        ----------
        X : :class:`pandas.DataFrame`
            Data to use to fit the imputations.
        y : :class:`pandas.Series`
            Target class; optionally specified, and used similarly to `groupby`.
        """
        assert isinstance(X, pd.DataFrame)
        # start = X
        y_present = y is not None
        groupby_present = self.groupby is not None
        self.imputers = []
        if y_present or groupby_present:
            # here works for one or the other, but could technically split for this
            assert not (groupby_present and y_present)
            if y_present:
                classes = np.unique(y)
                gen_mask = lambda c: np.array(y == c)
            if groupby_present:
                classes = X[self.groupby].unique()
                gen_mask = lambda c: np.array(X[self.groupby] == c
                                              )  # pd.Series values
            self.imputers = {
                c: {
                    "impute": [
                        IterativeImputer(max_iter=self.max_iter,
                                         sample_posterior=True,
                                         random_state=ix,
                                         **self.kwargs)
                        for ix in range(self.multiple)
                    ],
                    "mask":
                    gen_mask(c),
                }
                for c in classes
            }

            msg = """Imputation transformer: {} imputers x {} classes""".format(
                self.multiple, len(classes))
            logger.info(msg)

            for cls, content in self.imputers.items():
                for imp in content["impute"]:
                    imp.fit(X.loc[content["mask"], :])

        else:
            for ix in range(self.multiple):
                self.imputers.append(
                    IterativeImputer(max_iter=self.max_iter,
                                     sample_posterior=True,
                                     random_state=ix,
                                     **self.kwargs))
            msg = """Imputation transformer: {} imputers""".format(
                self.multiple)
            logger.info(msg)
            for ix in range(self.multiple):
                self.imputers[ix].fit(X)

        return self
Example #40
def test_iterative_imputer_transform_stochasticity():
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10,
                             random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               sample_posterior=True,
                               random_state=rng1)
    imputer.fit(X)

    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    imputer1 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng1)

    imputer2 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng2)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)
Example #41
                            subject_dict[ID][ses][(atlas, model, clust, _k,
                                                   smooth, hpass)]["topology"])
                    vect_all.append(np.concatenate(vects, axis=1))
                    del vects
                X_top = np.swapaxes(np.hstack(vect_all), 0, 1)

                Y = np.array(id_list)
                try:
                    df_summary.at[i, "grid"] = (atlas, model, clust, _k,
                                                smooth, hpass)
                    bad_ixs = [i[1] for i in np.argwhere(np.isnan(X_top))]
                    for m in set(bad_ixs):
                        if (X_top.shape[0] -
                                bad_ixs.count(m)) / X_top.shape[0] < 0.50:
                            X_top = np.delete(X_top, m, axis=1)
                    imp = IterativeImputer(max_iter=50, random_state=42)
                    X_top = imp.fit_transform(X_top)
                    scaler = StandardScaler()
                    X_top = scaler.fit_transform(X_top)
                    discr_stat_val, rdf = discr_stat(X_top, Y)
                    df_summary.at[i, "discriminability"] = discr_stat_val
                    print(discr_stat_val)
                    # print(rdf)
                    del discr_stat_val
                    i += 1
                except BaseException:
                    i += 1
                    continue
    elif modality == "dwi":
        gen_hyperparams = ["model", "clust", "_k"]
        for col in cols:
red2 = red2.drop(['ID'], axis=1)
#merging redwine data
redall = pd.concat([red1, red2], axis=1, sort=False)
#dropping the last three rows because they were empty
redall = redall.drop([1598, 1599, 1600])

#dropping the ID column from whitewine.csv2
white2 = white2.drop(['ID'], axis=1)
#merging whitewine data
whiteall = pd.concat([white1, white2], axis=1, sort=False)

#merging redwine and whitewine data
wineall = pd.concat([redall, whiteall], sort=False)

#Initialize Deterministic Regression Imputation
imp = IterativeImputer(max_iter=10, sample_posterior=False)
#create new np.array without missing values
wine = np.round(imp.fit_transform(wineall, 1), 2)

#Initialize MaxAbsScaler()
scaler = preprocessing.MaxAbsScaler()
#fit wine np.array to MaxAbsScaler
scaler.fit(wine)
#Transform wine np.array to scaled data
wine = scaler.transform(wine)

x = wine[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]]
y = wine[:, 15]
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.4,
Example #43
# full['Age'] = full['Age'].map(lambda x: 4 if 41 <= x <= 60 else x)
# full['Age'] = full['Age'].map(lambda x: 5 if 61 <= x <= 80 else x)
# print('Age correlation distribution', full.corr()['Age'])  # Pclass has the highest correlation
# full['Age'] = full['Age'].fillna(full['Age'].mean())
# if full['Pclass'] is 1:
#     full['Age'] = full['Age'].fillna(39)
# elif full['Pclass'] is 2:
#     full['Age'] = full['Age'].fillna(29)
# elif full['Pclass'] is 3:
#     full['Age'] = full['Age'].fillna(24)
# Based on the survival distribution among the missing values, a fixed age of 37 is the closest fit
# full['Age'] = full['Age'].fillna(37)
# Missing-value handling method: IterativeImputer
# IterativeImputer models each feature with missing values as a function of the other features
input_age = full.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch']]
imp = IterativeImputer(RandomForestRegressor(), max_iter=10, random_state=0)
input_age = pd.DataFrame(imp.fit_transform(input_age),
                         columns=input_age.columns)
full.drop('Age', axis=1, inplace=True)
full = pd.concat([full, input_age['Age']], axis=1)

full_age = pd.DataFrame()
full_age['Age'] = full['Age']
# Age binning: method one
full_age['Child'] = full_age['Age'].map(lambda x: 1 if 0 <= x <= 12 else 0)
full_age['Teenager'] = full_age['Age'].map(lambda x: 1 if 12 <= x <= 20 else 0)
full_age['Youth'] = full_age['Age'].map(lambda x: 1 if 21 <= x <= 41 else 0)
full_age['Middle_Age'] = full_age['Age'].map(lambda x: 1
                                             if 42 <= x <= 60 else 0)
full_age['Older'] = full_age['Age'].map(lambda x: 1 if 61 <= x <= 80 else 0)
# Age binning: method two
Example #44
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    X = np.zeros((100, 2))
    imputer = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        imputer.fit_transform(X)
# =============================================================================
# #Normalisation
# =============================================================================
from sklearn import preprocessing
temp_features = temp_features.iloc[:, :].values  #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
temp_features = min_max_scaler.fit_transform(temp_features)

temp_features = pd.DataFrame(temp_features, columns=feature_list)

# =============================================================================
# #Imputation
# =============================================================================
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(random_state=0, max_iter=50, imputation_order='random')
imp.fit(temp_features)
features_imp = imp.transform(temp_features)
imp = None
import gc
gc.collect()
features_imp = pd.DataFrame(features_imp, columns=feature_list)
features = features_imp.copy()

features = features.join(
    pd.DataFrame(temp_features_label,
                 columns=(['Longterm_TransplantOutcome'])))
features = features.join(
    pd.DataFrame(temp_features_tenure, columns=(['tenure'])))
features = features.join(
    pd.DataFrame(temp_features_transplantationIDs,
                                                    random_state=0)
    logger.info(
        f'{len(train_clips_df)} training sounds, {len(val_clips_df)} validation sounds'
    )

    if args.inputs == 'descriptors':
        # The way extract.py / drum_descriptors.py is set up, all descriptor features will start with an underscore
        train_np = train_clips_df.filter(regex='^_', axis=1).to_numpy()
        test_np = val_clips_df.filter(regex='^_', axis=1).to_numpy()

        # There are occasionally random gaps in descriptors, so use imputation to fill in all values
        try:
            imp = pickle.load(open(IMPUTATER_PATH, 'rb'))
        except FileNotFoundError:
            logger.info('No cached imputer found, training a new one')
            imp = IterativeImputer(max_iter=25, random_state=0)
            imp.fit(train_np)
            pickle.dump(imp, open(IMPUTATER_PATH, 'wb'))
        train_np = imp.transform(train_np)
        test_np = imp.transform(test_np)
    elif args.inputs == 'cnn_embeddings':
        train_np = np.stack(train_clips_df.cnn_embedding.values)
        test_np = np.stack(val_clips_df.cnn_embedding.values)

    scaler = preprocessing.StandardScaler().fit(train_np)
    train_np = scaler.transform(train_np)
    test_np = scaler.transform(test_np)
    pickle.dump(scaler, open(SCALER_PATH, 'wb'))

    train(args.model, train_np, train_clips_df.drum_type_labels, test_np,
          val_clips_df.drum_type_labels, list(unique_labels.values))
ct_2 = ColumnTransformer(remainder='drop',
                         transformers=[('numerical', num_pipe, num_feat)])
model_2 = Pipeline([('ct', ct_2), ('classifier', DecisionTreeClassifier())])

model_2.fit(X_train, y_train)

model_2_score = model_2.score(X_train, y_train)

import numpy as np
# Let's try to include both numerical and categorical features
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
categorical_feat = X_train.select_dtypes(include='object').columns.to_list()
num_pipe_3 = Pipeline([('imputer',
                        IterativeImputer(missing_values=np.nan,
                                         max_iter=15,
                                         random_state=0)),
                       ('scaler', StandardScaler())])
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore'))])
ct_3 = ColumnTransformer(remainder='drop',
                         transformers=[('numerical', num_pipe_3, num_feat),
                                       ('categorical', cat_pipe,
                                        categorical_feat)])
kt = [0.000008]
pt = []
for i in kt:
    model_3 = Pipeline([('ct', ct_3),
                        ('classifier',
                         RandomForestClassifier(n_jobs=-1,
                                                n_estimators=200,