@pytest.mark.parametrize(
    "imputation_order",
    ['random', 'roman', 'ascending', 'descending', 'arabic']
)
def test_iterative_imputer_imputation_order(imputation_order):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    max_iter = 2
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1  # this column should not be discarded by IterativeImputer

    imputer = IterativeImputer(missing_values=0,
                               max_iter=max_iter,
                               n_nearest_features=5,
                               sample_posterior=False,
                               min_value=0,
                               max_value=1,
                               verbose=1,
                               imputation_order=imputation_order,
                               random_state=rng)
    imputer.fit_transform(X)
    ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]

    assert (len(ordered_idx) // imputer.n_iter_ ==
            imputer.n_features_with_missing_)

    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d-1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d-1] == np.arange(d-1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d-1]
        ordered_idx_round_2 = ordered_idx[d-1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == max_iter * (d - 1)
def test_iterative_imputer_all_missing():
    n = 100
    d = 3
    X = np.zeros((n, d))
    imputer = IterativeImputer(missing_values=0, max_iter=1)
    X_imputed = imputer.fit_transform(X)
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))
def test_iterative_imputer_truncated_normal_posterior():
    # test that the values that are imputed using `sample_posterior=True`
    # with boundaries (`min_value` and `max_value` are not None) are drawn
    # from a distribution that looks gaussian via the Kolmogorov-Smirnov test.
    # note that starting from the wrong random seed will make this test fail
    # because random sampling doesn't occur at all when the imputation
    # is outside of the (min_value, max_value) range
    pytest.importorskip("scipy", minversion="0.17.0")
    rng = np.random.RandomState(42)

    X = rng.normal(size=(5, 5))
    X[0][0] = np.nan

    imputer = IterativeImputer(min_value=0,
                               max_value=0.5,
                               sample_posterior=True,
                               random_state=rng)
    imputer.fit_transform(X)
    # generate multiple imputations for the single missing value
    imputations = np.array([imputer.transform(X)[0][0] for _ in range(100)])

    assert all(imputations >= 0)
    assert all(imputations <= 0.5)

    mu, sigma = imputations.mean(), imputations.std()
    # guard against a zero standard deviation before standardizing
    if sigma == 0:
        sigma += 1e-12
    ks_statistic, p_value = kstest((imputations - mu) / sigma, 'norm')
    # we want to fail to reject the null hypothesis
    # null hypothesis: distributions are the same
    assert ks_statistic < 0.2 or p_value > 0.1, \
        "The posterior does not appear to be normal"
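# A toy illustration (synthetic data, not part of the test suite) of the KS
# check used above: standardized draws from any normal distribution should
# not be rejected by kstest against the standard normal.
import numpy as np
from scipy.stats import kstest

rng = np.random.RandomState(0)
draws = rng.normal(loc=0.3, scale=0.1, size=100)
mu, sigma = draws.mean(), draws.std()
ks_statistic, p_value = kstest((draws - mu) / sigma, 'norm')
assert ks_statistic < 0.2 or p_value > 0.1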
def test_iterative_imputer_additive_matrix():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    A = rng.randn(n, d)
    B = rng.randn(n, d)
    X_filled = np.zeros(A.shape)
    for i in range(d):
        for j in range(d):
            X_filled[:, (i+j) % d] += (A[:, i] + B[:, j]) / 2
    # a quarter is randomly missing
    nan_mask = rng.rand(n, d) < 0.25
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, rtol=1e-3, atol=0.01)
def test_iterative_imputer_no_missing():
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan
    m1 = IterativeImputer(max_iter=10, random_state=rng)
    m2 = IterativeImputer(max_iter=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # should exclude the first column entirely; compare matching shapes,
    # X[:, 1:] against pred1[:, 1:]
    assert_allclose(X[:, 1:], pred1[:, 1:])
    # fit and fit_transform should both be identical
    assert_allclose(pred1, pred2)
def test_iterative_imputer_verbose():
    rng = np.random.RandomState(0)
    n = 100
    d = 3
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=1)
    imputer.fit(X)
    imputer.transform(X)
    imputer = IterativeImputer(missing_values=0, max_iter=1, verbose=2)
    imputer.fit(X)
    imputer.transform(X)
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent', "constant"]:
        imputer = SimpleImputer(strategy=strategy)
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert X_imputed.shape == (10, 2)
        X_imputed = imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)

        iterative_imputer = IterativeImputer(initial_strategy=strategy)
        X_imputed = iterative_imputer.fit_transform(X)
        assert X_imputed.shape == (10, 2)
def test_iterative_imputer_rank_one():
    rng = np.random.RandomState(0)
    d = 100
    A = rng.rand(d, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(d, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=5,
                               verbose=1,
                               random_state=rng)
    X_filled = imputer.fit_transform(X_missing)
    assert_allclose(X_filled, X, atol=0.01)
def test_iterative_imputer_clip():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               min_value=0.1,
                               max_value=0.2,
                               random_state=rng)

    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
def test_iterative_imputer_early_stopping():
    rng = np.random.RandomState(0)
    n = 50
    d = 5
    A = rng.rand(n, 1)
    B = rng.rand(1, d)
    X = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X.copy()
    X_missing[nan_mask] = np.nan

    imputer = IterativeImputer(max_iter=100,
                               tol=1e-3,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    imputer = IterativeImputer(max_iter=100,
                               tol=0,
                               sample_posterior=False,
                               verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
def test_iterative_imputer_clip_truncnorm():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    X[:, 0] = 1

    imputer = IterativeImputer(missing_values=0,
                               max_iter=2,
                               n_nearest_features=5,
                               sample_posterior=True,
                               min_value=0.1,
                               max_value=0.2,
                               verbose=1,
                               imputation_order='random',
                               random_state=rng)
    Xt = imputer.fit_transform(X)
    assert_allclose(np.min(Xt[X == 0]), 0.1)
    assert_allclose(np.max(Xt[X == 0]), 0.2)
    assert_allclose(Xt[X != 0], X[X != 0])
@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"])
def test_iterative_imputer_missing_at_transform(strategy):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X_train = rng.randint(low=0, high=3, size=(n, d))
    X_test = rng.randint(low=0, high=3, size=(n, d))

    X_train[:, 0] = 1  # definitely no missing values in 0th column
    X_test[0, 0] = 0  # definitely missing value in 0th column

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               initial_strategy=strategy,
                               random_state=rng).fit(X_train)
    initial_imputer = SimpleImputer(missing_values=0,
                                    strategy=strategy).fit(X_train)

    # if there were no missing values at time of fit, then imputer will
    # only use the initial imputer for that feature at transform
    assert_allclose(imputer.transform(X_test)[:, 0],
                    initial_imputer.transform(X_test)[:, 0])
@pytest.mark.parametrize("rank", [3, 5])
def test_iterative_imputer_transform_recovery(rank):
    rng = np.random.RandomState(0)
    n = 100
    d = 100
    A = rng.rand(n, rank)
    B = rng.rand(rank, d)
    X_filled = np.dot(A, B)
    nan_mask = rng.rand(n, d) < 0.5
    X_missing = X_filled.copy()
    X_missing[nan_mask] = np.nan

    # split up data in half
    n = n // 2
    X_train = X_missing[:n]
    X_test_filled = X_filled[n:]
    X_test = X_missing[n:]

    imputer = IterativeImputer(max_iter=10,
                               verbose=1,
                               random_state=rng).fit(X_train)
    X_test_est = imputer.transform(X_test)
    assert_allclose(X_test_filled, X_test_est, atol=0.1)
@pytest.mark.parametrize(
    "estimator",
    [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()]
)
def test_iterative_imputer_estimators(estimator):
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               estimator=estimator,
                               random_state=rng)
    imputer.fit_transform(X)

    # check that types are correct for estimators
    hashes = []
    for triplet in imputer.imputation_sequence_:
        expected_type = (type(estimator) if estimator is not None
                         else type(BayesianRidge()))
        assert isinstance(triplet.estimator, expected_type)
        hashes.append(id(triplet.estimator))

    # check that each estimator is unique
    assert len(set(hashes)) == len(hashes)
def test_iterative_imputer_zero_iters():
    rng = np.random.RandomState(0)
    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()
    missing_flag = X == 0
    X[missing_flag] = np.nan

    imputer = IterativeImputer(max_iter=0)
    X_imputed = imputer.fit_transform(X)
    # with max_iter=0, only initial imputation is performed
    assert_allclose(X_imputed, imputer.initial_imputer_.transform(X))

    # repeat but force n_iter_ to 0
    imputer = IterativeImputer(max_iter=5).fit(X)
    # transformed should not be equal to initial imputation
    assert not np.all(imputer.transform(X) ==
                      imputer.initial_imputer_.transform(X))

    imputer.n_iter_ = 0
    # now they should be equal as only initial imputation is done
    assert_allclose(imputer.transform(X),
                    imputer.initial_imputer_.transform(X))
def model_pipeline(myestimator, mydata, myfolds,
                   feature_selection_done=False, myfeatures=None,
                   checknoise=False):
    """
    If feature selection has not been performed:
        Performs cross-validation (with scaling within folds) on the data
        passed through. Scales the data with RobustScaler() and imputes it
        with IterativeImputer(). Additionally adds cluster features for the
        cities' latitude and longitude.
    Else:
        Performs cross-validation with the given estimator on the subset of
        features of mydata passed through to myfeatures.

    Arguments
    @myestimator: sklearn estimator
    @mydata: training data with missing values (not scaled)
    @myfolds: number of folds for cross-validation
    @feature_selection_done: Boolean flag indicating whether feature
        selection has already been applied to the data in `mydata`
    @myfeatures: list of informative features to keep
    @checknoise: whether the cross-validation score should be explained
        variance instead of RMSE
    """
    # part 1: create location feature for data using OPTICS clustering
    optics_df = mydata[['Latitude', 'Longitude']].copy()
    clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    clust.fit(optics_df)
    optics_df['clust_label'] = clust.labels_
    location_max = np.max(optics_df.clust_label.unique())
    # OPTICS labels noisy samples as -1; replace for successful one-hot encoding
    optics_df['clust_label'].replace([-1], location_max + 1, inplace=True)

    # one-hot encoding and combining with mydata
    enc = OneHotEncoder(categories='auto')
    optics_df_1hot = enc.fit_transform(optics_df[['clust_label']])
    location_labels = ['cluster' + str(l)
                       for l in optics_df.clust_label.unique()]
    optics_df_1hot = pd.DataFrame(optics_df_1hot.todense(),
                                  index=optics_df.index,
                                  columns=location_labels)
    # part 1 done: cluster columns added
    mydata = pd.concat([mydata, optics_df_1hot], axis=1)

    # part 2: drop unnecessary columns (in our case)
    mydata_labels = mydata['med_rental_rate'].copy()
    mydata = mydata.drop('med_rental_rate', axis=1)
    if feature_selection_done:
        mydata = mydata.loc[:, myfeatures].copy()
    else:
        mydata = mydata.drop(['city', 'Latitude', 'Longitude', 'change_hunits',
                              'studio_1000_1499', 'studio_1500_more',
                              'studio_750_999', 'onebed_1000_1499',
                              'onebed_1500_more', 'onebed_750_999',
                              'twobed_1000_1499', 'twobed_1500_more',
                              'twobed_750_999', 'threebed_1000_1499',
                              'threebed_1500_more', 'threebed_750_999'],
                             axis=1)
    # part 2 done

    # part 3: perform cross-validation while scaling and imputing on the folds
    skfolds = KFold(n_splits=myfolds, random_state=22, shuffle=True)
    results = []
    mydata = np.array(mydata)
    mydata_labels = np.array(mydata_labels)
    for train_index, test_index in skfolds.split(mydata, mydata_labels):
        clone_est = clone(myestimator)
        X_train_folds = mydata[train_index]
        y_train_folds = mydata_labels[train_index]
        X_test_fold = mydata[test_index]
        y_test_fold = mydata_labels[test_index]
        # impute
        imputer = IterativeImputer(max_iter=10, random_state=22, min_value=0)
        X_train_folds = imputer.fit_transform(X_train_folds)
        # scale only numerical attributes, which are everything except the
        # cluster columns that were appended earlier
        num_attrbs = mydata.shape[1] - len(location_labels)
        ct_columns = list(range(num_attrbs))
        ct = ColumnTransformer([('scale1', RobustScaler(), ct_columns)],
                               remainder='passthrough')
        X_train_folds = ct.fit_transform(X_train_folds)
        clone_est.fit(X_train_folds, y_train_folds)
        # transform (do not fit) X_test_fold in order to predict
        X_test_fold = imputer.transform(X_test_fold)
        X_test_fold = ct.transform(X_test_fold)
        y_pred = clone_est.predict(X_test_fold)
        if checknoise:
            fold_expvar = explained_variance_score(y_test_fold, y_pred)
            results.append(fold_expvar)
        else:
            fold_mse = mean_squared_error(y_test_fold, y_pred)
            results.append(fold_mse)
    if checknoise:
        scores = np.array([results])
    else:
        scores = np.sqrt(np.array([results]))
    print('Scores', scores)
    print('Mean', scores.mean())
    print('Standard Deviation', scores.std())
import pytest

import numpy as np
from scipy import sparse

from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.utils._testing import assert_array_equal

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

IMPUTERS = [IterativeImputer(), KNNImputer(), SimpleImputer()]
SPARSE_IMPUTERS = [SimpleImputer()]


# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", IMPUTERS)
def test_imputation_missing_value_in_test_array(imputer):
    # [Non Regression Test for issue #13968] Missing value in test set should
    # not throw an error and return a finite dataset
    train = [[1], [2]]
    test = [[3], [np.nan]]
    imputer.set_params(add_indicator=True)
    imputer.fit(train).transform(test)
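# A minimal sketch (made-up data, not from the test above) of what
# `add_indicator=True` does: transform appends one binary missing-value
# indicator column for each feature that had missing values during fit.
import numpy as np
from sklearn.impute import SimpleImputer

X_fit = np.array([[1.0], [2.0], [np.nan]])
imp = SimpleImputer(add_indicator=True).fit(X_fit)
Xt = imp.transform(np.array([[3.0], [np.nan]]))
# Xt has shape (2, 2): the imputed feature plus its indicator column
# [[3. , 0. ],
#  [1.5, 1. ]]
print(Xt)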
Y = df['Style'].values
lb = LabelEncoder()
Y = lb.fit_transform(Y)

dataFile = df.drop(['Style'], axis=1)
X = dataFile.values

xTrain, xTest, yTrain, yTest = train_test_split(X, Y, test_size=0.2,
                                                random_state=42)

scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)

# fit the imputer on the scaled training data only; fitting it on the raw,
# unsplit X would leak test information and mix scaled with unscaled values
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(xTrain)
xTrain = pd.DataFrame(imp.transform(xTrain))
xTest = pd.DataFrame(imp.transform(xTest))

randTree = RandomForestClassifier(n_estimators=100, max_depth=7,
                                  random_state=42, warm_start=True)
randTree.fit(xTrain, yTrain)
yPred = randTree.predict(xTest)
accuracy = accuracy_score(yTest, yPred)
print(accuracy)

testFile = pd.read_csv('beers_test_nostyle.csv')
def get_imputer(self):
    """Create the imputer for missing data."""
    imp = IterativeImputer(random_state=self.random_state)
    return imp
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

le = preprocessing.LabelEncoder()

finaldata = pd.read_csv('train.csv')
finaldata = finaldata.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'])
finaldata['Sex'] = le.fit_transform(finaldata['Sex'])

imp = IterativeImputer()

# Separate the target from the other variables
Y_completed = finaldata['Survived']

# Fit the imputer used to fill in the missing data
imp.fit(finaldata.drop(columns='Survived'))

# Fill in the missing data
X_completed = imp.transform(finaldata.drop(columns='Survived'))

# Normalize the data
X_completed = Normalizer().fit_transform(X_completed)

# Split the sample into training and test sets
X_trainCompleted, X_testCompleted, Y_trainCompleted, Y_testCompleted = train_test_split(
    missing_values_per_row.append(x)

missing_values_per_row = np.array(missing_values_per_row)
missing_values_in_rows = missing_values_per_row.sum()
percentage_of_missing_values = (missing_values_in_rows / len(dataframe)) * 100

# output the amount of missing values before imputation
print('missing values:', missing_values_total)
print('missing values per row:', missing_values_in_rows)
print('missing values per row percentage', percentage_of_missing_values, '%')

# impute the missing values with median, iterative and KNN imputers
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imp_iterative = IterativeImputer(max_iter=10, random_state=42)
knn_1 = KNNImputer(n_neighbors=1)
knn_3 = KNNImputer(n_neighbors=3)
knn_5 = KNNImputer(n_neighbors=5)

df_scaled_imp_median = imp_median.fit_transform(df_scaled)
df_scaled_imp_iterative = imp_iterative.fit_transform(df_scaled)
df_scaled_knn1 = knn_1.fit_transform(df_scaled)
df_scaled_knn3 = knn_3.fit_transform(df_scaled)
df_scaled_knn5 = knn_5.fit_transform(df_scaled)

df_scaled_imp_median = pd.DataFrame(data=df_scaled_imp_median,
                                    columns=df.columns.tolist())
df_scaled_imp_iterative = pd.DataFrame(data=df_scaled_imp_iterative,
                                       columns=df.columns.tolist())
df_scaled_knn1 = pd.DataFrame(data=df_scaled_knn1, columns=df.columns.tolist())
df_scaled_knn3 = pd.DataFrame(data=df_scaled_knn3, columns=df.columns.tolist())
df_scaled_knn5 = pd.DataFrame(data=df_scaled_knn5, columns=df.columns.tolist())
def _get_imputer(self):
    return IterativeImputer(max_iter=10)
def load_both_data(project, metric):
    understand_path = ('data/package_level/understand_files_all/' + project +
                       '_understand.csv')
    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis=1, how='all')
    cols_list = understand_df.columns.values.tolist()
    for item in ['Kind', 'Name', 'commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0, item)
    understand_df = understand_df[cols_list]
    cols = understand_df.columns.tolist()
    understand_df = understand_df.drop_duplicates(cols[4:len(cols)])

    commit_guru_file_level_path = ('data/package_level/commit_guru_file/' +
                                   project + '.csv')
    commit_guru_file_level_df = pd.read_csv(commit_guru_file_level_path)
    commit_guru_file_level_df['commit_hash'] = \
        commit_guru_file_level_df.commit_hash.str.strip('"')

    df = understand_df.merge(commit_guru_file_level_df,
                             how='left', on=['commit_hash', 'Name'])
    cols = df.columns.tolist()
    cols.remove('Bugs')
    cols.append('Bugs')
    df = df[cols]
    for item in ['Kind', 'Name', 'commit_hash']:
        if item in cols:
            df = df.drop(labels=[item], axis=1)
    df = df.drop_duplicates()
    df.reset_index(drop=True, inplace=True)

    y = df.Bugs
    X = df.drop('Bugs', axis=1)
    cols = X.columns
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)

    imp_mean = IterativeImputer(random_state=0)
    X = imp_mean.fit_transform(X)
    X = pd.DataFrame(X, columns=cols)

    if metric == 'process':
        X = X[[
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ]]
    elif metric == 'product':
        X = X.drop([
            'file_la', 'file_ld', 'file_lt', 'file_age', 'file_ddev',
            'file_nuc', 'own', 'minor', 'file_ndev', 'file_ncomm', 'file_adev',
            'file_nadev', 'file_avg_nddev', 'file_avg_nadev', 'file_avg_ncomm',
            'file_ns', 'file_exp', 'file_sexp', 'file_rexp', 'file_nd',
            'file_sctr'
        ], axis=1)
    else:
        X = X
    return X, y
ascvd_est = pd.read_csv('../Data/cohort/' + datafile)

#%%
train_est2, test_est2 = split_cohort(ascvd_est, to_exclude, test_ind_col,
                                     drop='all')

test_set_data = pd.get_dummies(
    test_est2,
    columns=[c for c in test_est2.columns if test_est2[c].dtype == 'O'])
train_set_data = pd.get_dummies(
    train_est2,
    columns=[c for c in train_est2.columns if train_est2[c].dtype == 'O'])
train_set_features = train_set_data[[f for f in train_set_data.columns
                                     if f != label]]
test_set_features = test_set_data[[f for f in test_set_data.columns
                                   if f != label]]
train_set_labels = train_est2[label]
test_set_labels = test_est2[label]
train_est2 = test_est2 = ascvd_est = None

imp = IterativeImputer(add_indicator=False,
                       estimator=None,
                       imputation_order='ascending',
                       initial_strategy='mean',
                       max_iter=50,
                       max_value=None,
                       min_value=None,
                       missing_values=np.nan,
                       n_nearest_features=10,
                       random_state=None,
                       sample_posterior=False,
                       tol=0.001,
                       verbose=0)
imp.fit(train_set_features)
train_set_imp_features = imp.transform(train_set_features)
train_set_imp_features = pd.DataFrame(train_set_imp_features,
                                      columns=train_set_features.columns)
test_set_imp_features = imp.transform(test_set_features)
test_set_imp_features = pd.DataFrame(test_set_imp_features,
                                     columns=test_set_features.columns)
train_set_features = test_set_features = None

#%%
#fl2 = [[fl[0]] for fl in feat_list if 'race' not in fl[0]]
#
#fl2.append(['race'])
def FillNA(df, method: str = 'ffill', window: int = 10):
    """Fill NA values using different methods.

    Args:
        method (str):
            'ffill' - fill most recent non-na value forward until another non-na value is reached
            'zero' - fill with zero. Useful for sales and other data where NA does usually mean $0.
            'mean' - fill all missing values with the series' overall average value
            'median' - fill all missing values with the series' overall median value
            'rolling mean' - fill with last n (window) values
            'ffill mean biased' - simple avg of ffill and mean
            'fake date' - shifts forward data over nan, thus values will have incorrect timestamps
            also most `method` values of pd.DataFrame.interpolate()
        window (int): length of rolling windows for filling na, for rolling methods
    """
    method = str(method).replace(" ", "_")

    if method == 'zero':
        return fill_zero(df)
    elif method == 'ffill':
        return fill_forward(df)
    elif method == 'mean':
        return fill_mean(df)
    elif method == 'median':
        return fill_median(df)
    elif method == 'rolling_mean':
        return rolling_mean(df, window=window)
    elif method == 'rolling_mean_24':
        return rolling_mean(df, window=24)
    elif method == 'ffill_mean_biased':
        return biased_ffill(df)
    elif method == 'fake_date':
        return fake_date_fill(df, back_method='slice')
    elif method in df_interpolate_full:
        df = df.interpolate(method=method, order=5).fillna(method='bfill')
        if df.isnull().values.any():
            df = fill_forward(df)
        return df
    elif method == 'IterativeImputer':
        cols = df.columns
        indx = df.index
        try:
            from sklearn.experimental import enable_iterative_imputer  # noqa
        except Exception:
            pass
        from sklearn.impute import IterativeImputer
        df = IterativeImputer(random_state=0, max_iter=100).fit_transform(df)
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df)
        df.index = indx
        df.columns = cols
        return df
    elif method == 'IterativeImputerExtraTrees':
        cols = df.columns
        indx = df.index
        try:
            from sklearn.experimental import enable_iterative_imputer  # noqa
        except Exception:
            pass
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.impute import IterativeImputer
        df = IterativeImputer(
            ExtraTreesRegressor(n_estimators=10, random_state=0),
            random_state=0,
            max_iter=100,
        ).fit_transform(df)
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df)
        df.index = indx
        df.columns = cols
        return df
    elif method == 'KNNImputer':
        cols = df.columns
        indx = df.index
        from sklearn.impute import KNNImputer
        df = KNNImputer(n_neighbors=5).fit_transform(df)
        if not isinstance(df, pd.DataFrame):
            df = pd.DataFrame(df)
        df.index = indx
        df.columns = cols
        return df
    elif method is None or method == 'None':
        return df
    else:
        print(f"FillNA method `{str(method)}` not known, returning original")
        return df
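# A minimal usage sketch (made-up DataFrame) for the IterativeImputer branch
# above; the index and columns survive because FillNA restores them after the
# imputer returns a bare array.
import numpy as np
import pandas as pd

demo = pd.DataFrame(
    {"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]},
    index=pd.date_range("2020-01-01", periods=3),
)
filled = FillNA(demo, method='IterativeImputer')
assert not filled.isnull().values.any()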
def iter(Dataset):
    # note: this helper's name shadows Python's built-in iter()
    it = IterativeImputer(random_state=0, initial_strategy='median')
    it = it.fit_transform(Dataset)
    return it
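# A minimal usage sketch (made-up array) for the helper above; it returns a
# NumPy array with the NaNs replaced.
import numpy as np

completed = iter(np.array([[1.0, np.nan], [3.0, 4.0], [5.0, 6.0]]))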
                                 {'random_state': 10})

print("Performance for best hyperparameters:")
y_train_best = rf_mean_imputed.predict_proba(X_train_mean_imputed)[:, 1]
print(f"- Train C-Index: {cindex(y_train, y_train_best):.4f}")

y_val_best = rf_mean_imputed.predict_proba(X_val_mean_imputed)[:, 1]
print(f"- Val C-Index: {cindex(y_val, y_val_best):.4f}")

y_test_imp = rf_mean_imputed.predict_proba(X_test)[:, 1]
print(f"- Test C-Index: {cindex(y_test, y_test_imp):.4f}")

# Impute using regression on other covariates (another imputation strategy)
imputer = IterativeImputer(random_state=0, sample_posterior=False,
                           max_iter=1, min_value=0)
# train to predict missing values from observed values based on all other features
imputer.fit(X_train)
X_train_imputed = pd.DataFrame(imputer.transform(X_train),
                               columns=X_train.columns)
X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

# Perform a hyperparameter grid search to find the best-performing random
# forest model, and report results on the test set.

# Define ranges for the random forest hyperparameter grid search
hyperparams = {
    # how many trees should be in the forest (int)
    'n_estimators': [100],
                                          y_missing,
                                          scoring='neg_mean_squared_error',
                                          cv=N_SPLITS)

# Estimate the score after iterative imputation of the missing values
# with different estimators
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, n_jobs=-1, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]
score_iterative_imputer = pd.DataFrame()
# use a distinct loop variable so the column label reflects the imputation
# estimator rather than the wrapping Pipeline
for impute_estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator),
        br_estimator)
    score_iterative_imputer[impute_estimator.__class__.__name__] = \
        cross_val_score(
            estimator, X_missing, y_missing,
            scoring='neg_mean_squared_error', cv=N_SPLITS
        )

scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1)

# plot boston results
fig, ax = plt.subplots(figsize=(13, 6))
means = -scores.mean()
errors = scores.std()
def main():
    eval_mode = True
    n_train = 1000
    n_test = 1000
    RFnan = False
    pca = True
    start = time()

    # LOAD DATA
    data = shuffle(pd.read_csv('data.csv'),
                   random_state=seed)[:n_train + n_test]
    y = data['Label']
    y = np.where(y == 's', 1, 0)
    x = data.drop(columns=['Label', "KaggleSet", "KaggleWeight", "EventId"])
    weights = data['Weight'].values
    x = x.drop(columns=['Weight'])
    x = x.replace(-999, np.nan)

    # SPLIT
    X_train, X_test, y_train, y_test, weights_train, weights_test = \
        train_test_split(x, y, weights, random_state=seed, test_size=n_test)

    # PREPROCESS
    transformers = []
    cols_log = [
        "DER_mass_MMC", "DER_mass_transverse_met_lep", "DER_mass_vis",
        "DER_pt_h", "DER_pt_ratio_lep_tau", "DER_pt_tot", "DER_sum_pt",
        "PRI_jet_all_pt", "PRI_lep_pt", "PRI_met", "PRI_met_sumet",
        "PRI_tau_pt"
    ]
    transformers.append(
        make_column_transformer((Shift_log(), cols_log),
                                remainder="passthrough"))
    if RFnan:
        transformers.append(StandardScaler())
        transformers.append(
            SimpleImputer(missing_values=np.nan, fill_value=-999999.0))
    else:
        transformers.append(IterativeImputer(max_iter=int(1e2)))
        transformers.append(StandardScaler())
    if pca:
        print("Using PCA")
        transformers.append(PCA(20))
    for trans in transformers:
        X_train = trans.fit_transform(X_train)
        X_test = trans.transform(X_test)

    if not eval_mode:
        if RFnan:
            results = grid_search_rf((X_train, y_train), weights_train)
            print("RF nan : \n\t{}".format(results))
            # .loc replaces the deprecated .ix indexer
            print("\tBest results : \n\t{}".format(
                results.loc[results['average'].idxmax()]))
        else:
            results_bagging, results_boosting = grid_search(
                (X_train, y_train), weights_train)
            results_rf = grid_search_rf((X_train, y_train), weights_train)
            print("Bagging : \n\t{}".format(
                results_bagging.loc[results_bagging['average'].idxmax()]))
            print("Boosting : \n\t{}".format(
                results_boosting.loc[results_boosting['average'].idxmax()]))
            print("RF : \n\t{}".format(
                results_rf.loc[results_rf['average'].idxmax()]))
    else:
        if RFnan:
            rf_nan = RandomForestClassifier(n_estimators=2000, max_depth=None)
            average, std = eval_best((X_test, y_test), weights_test, rf_nan)
            print("RFNan : %.4f +/- %.4f" % (average, std))
        else:
            clfs = [
                RandomForestClassifier(n_estimators=2000, max_depth=50),
                BaggingClassifier(Perceptron(max_iter=1000),
                                  max_samples=0.5,
                                  max_features=0.5,
                                  n_estimators=1000),
                AdaBoostClassifier(n_estimators=50),
            ]
            for clf in clfs:
                average, std = eval_best((X_test, y_test), weights_test, clf)
                print(clf)
                print("%.4f +/- %.4f" % (average, std))
    print("Total time : {}".format(time() - start))
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

# Count the number of 1 and 0 class labels in the train and test sets
from collections import Counter
z, y = train_labels, test_labels
Counter(y)
Counter(z)

# =============================================================================
# #Imputation
# =============================================================================
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(random_state=0, max_iter=100, imputation_order='random')
imp.fit(train_features)
train_features = imp.transform(train_features)
test_features = imp.transform(test_features)

# You can check the outcomes of imputation by executing the lines below
# Please change the path accordingly
#train_features.to_csv(r'T:\tbase\short\train_feature_imputation.csv')
#test_features.to_csv(r'T:\tbase\short\test_feature_imputation.csv')

# =============================================================================
# #Keep TransplantationID in test data for error analysis
# =============================================================================
test_features = pd.DataFrame(test_features, columns=feature_list)
train_features = pd.DataFrame(train_features, columns=feature_list)
#!/usr/bin/env python
# coding: utf-8
# Copyright 2019 Yuhang Lin

import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor

from base import impute_df

data_folder = './train_data'

estimators = [
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=30, random_state=0),
    KNeighborsRegressor(n_neighbors=5)
]
max_iter = 10
names = ['decisiontree', 'extratrees', 'knn']

for i in range(len(estimators)):
    name = names[i]
    estimator = estimators[i]
    output_folder = "./output/iterative_imputer_{}_iter{}".format(
        name, max_iter)
    imputer = IterativeImputer(max_iter=max_iter, random_state=0,
                               estimator=estimator)
    impute_df(imputer, output_folder, data_folder)
def test_iterative_imputer_transform_stochasticity():
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    n = 100
    d = 10
    X = _sparse_random_matrix(n, d, density=0.10,
                              random_state=rng1).toarray()

    # when sample_posterior=True, two transforms shouldn't be equal
    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               sample_posterior=True,
                               random_state=rng1)
    imputer.fit(X)

    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)

    # sufficient to assert that the means are not the same
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))

    # when sample_posterior=False, and n_nearest_features=None
    # and imputation_order is not random
    # the two transforms should be identical even if rng are different
    imputer1 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng1)
    imputer2 = IterativeImputer(missing_values=0,
                                max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng2)
    imputer1.fit(X)
    imputer2.fit(X)

    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)

    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)
@pytest.mark.parametrize(
    "max_iter, tol, error_type, warning",
    [(-1, 1e-3, ValueError, 'should be a positive integer'),
     (1, -1e-3, ValueError, 'should be a non-negative float')]
)
def test_iterative_imputer_error_param(max_iter, tol, error_type, warning):
    X = np.zeros((100, 2))
    imputer = IterativeImputer(max_iter=max_iter, tol=tol)
    with pytest.raises(error_type, match=warning):
        imputer.fit_transform(X)
dat_org = pd.concat([dat_org, label], axis=1)
dat_org.rename(columns={True: 'label'}, inplace=True)
df1 = dat_org.copy()

# separate independent and dependent variables
X = df1.drop('label', axis=1)
y = df1['label']

# split the dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,
                                                    shuffle=True)

# fill in the missing values using the iterative imputer; fit on the training
# split only and reuse the fitted imputer on the test split to avoid leakage
imp = IterativeImputer(n_nearest_features=15, max_iter=10, random_state=0)
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

# convert back to dataframe
X_train = pd.DataFrame(X_train, columns=df1.columns[:-1])
X_test = pd.DataFrame(X_test, columns=df1.columns[:-1])

# separate these columns from the dataframe and add them back after scaling
misc_xtrain = X_train[[
    'tx_revision_len', 'id_infra_nod', 'dt_submit_date', 'State_Rollback',
    'State_Skipped', 'State_Unknown', 'tm_submit_time'
]]
misc_xtest = X_test[[
    'tx_revision_len', 'id_infra_nod', 'dt_submit_date', 'State_Rollback',
    'State_Skipped', 'State_Unknown', 'tm_submit_time'
                         'Tot_population']
    checker1 = pd.merge(checker1, pd.DataFrame(data=turnouts_r))
    checker_coeff1 = np.mean(
        (checker1['turnout_pred'] - checker1['turnout'])**2)
    return checker_coeff1


turnouts_r = {
    'year': [2000.0, 2004.0, 2008.0, 2012.0, 2016.0],
    'turnout': [51.2, 56.7, 58.2, 54.9, 55.7]
}

coeffs = []
for estim in estimators:
    Imputer = IterativeImputer(estimator=estim)
    dane_ssd_imp1 = pd.DataFrame(
        data=Imputer.fit_transform(dane_ssd.drop(columns_out, 1)),
        columns=dane_ssd.drop(columns_out, 1).columns.tolist())
    # checker() returns the coefficient, so capture its return value
    coeffs.append(checker(dane_ssd_imp1))

coeffs_comp = pd.DataFrame({
    'Estimators':
    ['Bayesian Ridge', 'Decision Tree', 'Extra Trees', 'KNNeighbors'],
    'Turnout Coeff.': coeffs
# 1.4) Missings -> Exotic techniques
################################################################################

# The remaining missings will be imputed via Iterative Imputer:
# Models each feature with missing values as a function of other features, and
# uses that estimate for imputation
X_train = train.drop(columns=Categorical, axis=1)
X_train.drop(columns='TARGET', axis=1, inplace=True)
X_test = test.drop(columns=Categorical, axis=1)

# Impute
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

filler = IterativeImputer()
X_train_filled = filler.fit_transform(X_train)
X_test_filled = filler.transform(X_test)
X_train_filled = pd.DataFrame(X_train_filled, columns=list(X_train))
X_test_filled = pd.DataFrame(X_test_filled, columns=list(X_test))

train = pd.concat([train[Categorical], X_train_filled, train['TARGET']],
                  axis=1)
test = pd.concat([test[Categorical], X_test_filled], axis=1)

# Final check:
miss(train, 1)
miss(test, 1)

# # If we need to standardize data:
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 23 12:40:14 2019

@author: [email protected]

This script explores multivariate imputation for incomplete machine
learning data input.
"""
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# incomplete input data
input_data = [[1, 2, 3], [3, 4, np.nan], [5, 6, 7], [8, 9, np.nan]]

# create imputer object
imp = IterativeImputer(max_iter=100, random_state=0)

# fit imputer to the input data
imp.fit(input_data)

#X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
full_input_data = imp.transform(input_data)
print(full_input_data)
def fit(self, X, y=None):
    """
    Fit the imputers.

    Parameters
    ----------
    X : :class:`pandas.DataFrame`
        Data to use to fit the imputations.
    y : :class:`pandas.Series`
        Target class; optionally specified, and used similarly to `groupby`.
    """
    assert isinstance(X, pd.DataFrame)
    # start = X
    y_present = y is not None
    groupby_present = self.groupby is not None
    self.imputers = []
    if y_present or groupby_present:
        # here works for one or the other, but could technically split for this
        assert not (groupby_present and y_present)
        if y_present:
            classes = np.unique(y)
            gen_mask = lambda c: np.array(y == c)
        if groupby_present:
            classes = X[self.groupby].unique()
            gen_mask = lambda c: np.array(X[self.groupby] == c)  # pd.Series values
        self.imputers = {
            c: {
                "impute": [
                    IterativeImputer(max_iter=self.max_iter,
                                     sample_posterior=True,
                                     random_state=ix,
                                     **self.kwargs)
                    for ix in range(self.multiple)
                ],
                "mask": gen_mask(c),
            }
            for c in classes
        }
        msg = """Imputation transformer: {} imputers x {} classes""".format(
            self.multiple, len(classes))
        logger.info(msg)
        for cls, content in self.imputers.items():
            for imp in content["impute"]:
                imp.fit(X.loc[content["mask"], :])
    else:
        for ix in range(self.multiple):
            self.imputers.append(
                IterativeImputer(max_iter=self.max_iter,
                                 sample_posterior=True,
                                 random_state=ix,
                                 **self.kwargs))
        msg = """Imputation transformer: {} imputers""".format(self.multiple)
        logger.info(msg)
        for ix in range(self.multiple):
            self.imputers[ix].fit(X)
    return self
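# A standalone sketch (made-up data, not this class's API) of the
# multiple-imputation idea the fit method above implements: several
# IterativeImputers with sample_posterior=True and different seeds each
# produce one plausible completion of the data, which can then be combined.
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

X = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0],
                  "b": [2.0, np.nan, 6.0, 8.0]})
imputations = [
    IterativeImputer(sample_posterior=True, random_state=ix).fit_transform(X)
    for ix in range(5)
]
# one common way to combine the posterior draws is to average them
X_mean = np.mean(imputations, axis=0)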
                    subject_dict[ID][ses][(atlas, model, clust, _k, smooth,
                                           hpass)]["topology"])
                vect_all.append(np.concatenate(vects, axis=1))
                del vects
            X_top = np.swapaxes(np.hstack(vect_all), 0, 1)
            Y = np.array(id_list)
            try:
                df_summary.at[i, "grid"] = (atlas, model, clust, _k, smooth,
                                            hpass)
                bad_ixs = [ix[1] for ix in np.argwhere(np.isnan(X_top))]
                for m in set(bad_ixs):
                    if (X_top.shape[0] - bad_ixs.count(m)) / \
                            X_top.shape[0] < 0.50:
                        X_top = np.delete(X_top, m, axis=1)
                imp = IterativeImputer(max_iter=50, random_state=42)
                X_top = imp.fit_transform(X_top)
                scaler = StandardScaler()
                X_top = scaler.fit_transform(X_top)
                discr_stat_val, rdf = discr_stat(X_top, Y)
                df_summary.at[i, "discriminability"] = discr_stat_val
                print(discr_stat_val)
                # print(rdf)
                del discr_stat_val
                i += 1
            except BaseException:
                i += 1
                continue
    elif modality == "dwi":
        gen_hyperparams = ["model", "clust", "_k"]
        for col in cols:
red2 = red2.drop(['ID'], axis=1)

# merging redwine data
redall = pd.concat([red1, red2], axis=1, sort=False)

# dropping the last three rows because they were empty
redall = redall.drop([1598, 1599, 1600])

# dropping column ID from the second whitewine file
white2 = white2.drop(['ID'], axis=1)

# merging whitewine data
whiteall = pd.concat([white1, white2], axis=1, sort=False)

# merging redwine and whitewine data
wineall = pd.concat([redall, whiteall], sort=False)

# initialize deterministic regression imputation
imp = IterativeImputer(max_iter=10, sample_posterior=False)

# create new np.array without missing values
wine = np.round(imp.fit_transform(wineall), 2)

# initialize MaxAbsScaler()
scaler = preprocessing.MaxAbsScaler()

# fit wine np.array to MaxAbsScaler
scaler.fit(wine)

# transform wine np.array to scaled data
wine = scaler.transform(wine)

x = wine[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]]
y = wine[:, 15]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4,
# full['Age'] = full['Age'].map(lambda x: 4 if 41 <= x <= 60 else x)
# full['Age'] = full['Age'].map(lambda x: 5 if 61 <= x <= 80 else x)
# print('Age correlation distribution', full.corr()['Age'])

# Pclass has the highest correlation with Age
# full['Age'] = full['Age'].fillna(full['Age'].mean())
# if full['Pclass'] is 1:
#     full['Age'] = full['Age'].fillna(39)
# elif full['Pclass'] is 2:
#     full['Age'] = full['Age'].fillna(29)
# elif full['Pclass'] is 3:
#     full['Age'] = full['Age'].fillna(24)

# judging by the survival distribution among the missing values, a fixed
# value of 37 is the closest fit
# full['Age'] = full['Age'].fillna(37)

# handle missing values with IterativeImputer, which models each feature
# with missing values as a function of the other features
input_age = full.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch']]
imp = IterativeImputer(RandomForestRegressor(), max_iter=10, random_state=0)
input_age = pd.DataFrame(imp.fit_transform(input_age),
                         columns=input_age.columns)
full.drop('Age', axis=1, inplace=True)
full = pd.concat([full, input_age['Age']], axis=1)

full_age = pd.DataFrame()
full_age['Age'] = full['Age']

# age binning, option one
full_age['Child'] = full_age['Age'].map(lambda x: 1 if 0 <= x <= 12 else 0)
full_age['Teenager'] = full_age['Age'].map(lambda x: 1 if 12 <= x <= 20 else 0)
full_age['Youth'] = full_age['Age'].map(lambda x: 1 if 21 <= x <= 41 else 0)
full_age['Middle_Age'] = full_age['Age'].map(lambda x: 1 if 42 <= x <= 60 else 0)
full_age['Older'] = full_age['Age'].map(lambda x: 1 if 61 <= x <= 80 else 0)

# age binning, option two
# =============================================================================
# #Normalisation
# =============================================================================
from sklearn import preprocessing

temp_features = temp_features.iloc[:, :].values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
temp_features = min_max_scaler.fit_transform(temp_features)
temp_features = pd.DataFrame(temp_features, columns=feature_list)

# =============================================================================
# #Imputation
# =============================================================================
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(random_state=0, max_iter=50, imputation_order='random')
imp.fit(temp_features)
features_imp = imp.transform(temp_features)

imp = None
import gc
gc.collect()

features_imp = pd.DataFrame(features_imp, columns=feature_list)
features = features_imp.copy()
features = features.join(
    pd.DataFrame(temp_features_label,
                 columns=(['Longterm_TransplantOutcome'])))
features = features.join(
    pd.DataFrame(temp_features_tenure, columns=(['tenure'])))
features = features.join(
    pd.DataFrame(temp_features_transplantationIDs,
    random_state=0)
logger.info(
    f'{len(train_clips_df)} training sounds, {len(val_clips_df)} validation sounds'
)

if args.inputs == 'descriptors':
    # The way extract.py / drum_descriptors.py is set up, all descriptor
    # features will start with an underscore
    train_np = train_clips_df.filter(regex='^_', axis=1).to_numpy()
    test_np = val_clips_df.filter(regex='^_', axis=1).to_numpy()

    # There are occasionally random gaps in descriptors, so use imputation
    # to fill in all values
    try:
        imp = pickle.load(open(IMPUTATER_PATH, 'rb'))
    except FileNotFoundError:
        logger.info('No cached imputer found, training')
        imp = IterativeImputer(max_iter=25, random_state=0)
        imp.fit(train_np)
        pickle.dump(imp, open(IMPUTATER_PATH, 'wb'))
    train_np = imp.transform(train_np)
    test_np = imp.transform(test_np)
elif args.inputs == 'cnn_embeddings':
    train_np = np.stack(train_clips_df.cnn_embedding.values)
    test_np = np.stack(val_clips_df.cnn_embedding.values)
    scaler = preprocessing.StandardScaler().fit(train_np)
    train_np = scaler.transform(train_np)
    test_np = scaler.transform(test_np)
    pickle.dump(scaler, open(SCALER_PATH, 'wb'))

train(args.model, train_np, train_clips_df.drum_type_labels, test_np,
      val_clips_df.drum_type_labels, list(unique_labels.values))
ct_2 = ColumnTransformer(remainder='drop',
                         transformers=[('numerical', num_pipe, num_feat)])
model_2 = Pipeline([('ct', ct_2), ('classifier', DecisionTreeClassifier())])
model_2.fit(X_train, y_train)
model_2_score = model_2.score(X_train, y_train)

import numpy as np

# Let's try to include both numerical and categorical features
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

categorical_feat = X_train.select_dtypes(include='object').columns.to_list()

num_pipe_3 = Pipeline([('imputer',
                        IterativeImputer(missing_values=np.nan,
                                         max_iter=15,
                                         random_state=0)),
                       ('scaler', StandardScaler())])
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant')),
                     ('encoder', OneHotEncoder(handle_unknown='ignore'))])
ct_3 = ColumnTransformer(remainder='drop',
                         transformers=[('numerical', num_pipe_3, num_feat),
                                       ('categorical', cat_pipe,
                                        categorical_feat)])

kt = [0.000008]
pt = []
for i in kt:
    model_3 = Pipeline([('ct', ct_3),
                        ('classifier',
                         RandomForestClassifier(n_jobs=-1, n_estimators=200,