def impute_dynamic_dataframe(self, df_dynamic):
    """Impute time-series features and append measurement-indicator dummies.

    Gaps shorter than ``self.missing_gap_thresh`` timesteps are filled by
    linear interpolation (plus a backfill for leading NaNs); longer gaps are
    left missing, then zero-filled after the indicator columns are recorded.

    Parameters
    ----------
    df_dynamic : pd.DataFrame
        Frame containing at least the columns in ``self.feat_name`` plus
        'index', 'pid' and 'ts'.  # assumes these three columns exist — TODO confirm

    Returns
    -------
    pd.DataFrame
        Copy of the input with features imputed, one ``if_<feat>`` dummy per
        feature (1 = originally measured, 0 = originally missing), and
        columns reordered to ['index', 'pid', 'ts', *features, *dummies].
    """
    imputed = df_dynamic.copy()
    # Boolean mask: True where a value may be kept after interpolation,
    # i.e. it was observed or belongs to a short (< threshold) run.
    keep_mask = df_dynamic[self.feat_name].copy()
    for column in self.feat_name:
        col_df = pd.DataFrame(imputed[column])
        # Consecutive runs of (non-)missing values share one group id.
        col_df['new'] = (col_df.notnull() != col_df.shift().notnull()).cumsum()
        col_df['ones'] = 1
        run_lengths = col_df.groupby('new')['ones'].transform('count')
        keep_mask[column] = (
            (run_lengths < self.missing_gap_thresh) | imputed[column].notnull())
    # Interpolate everywhere, then blank out the long gaps again via the mask.
    imputed[self.feat_name] = (
        imputed[self.feat_name].interpolate().bfill()[keep_mask])

    # Dummy variables: 1 where the value is measured, 0 where still missing.
    indicator = MissingIndicator(missing_values=np.nan, features='all')
    if_measured = 1 - indicator.fit_transform(
        imputed[self.feat_name].values).astype(int)
    dummy_names = ['if_' + feat for feat in self.feat_name]
    for col_idx, dummy in enumerate(dummy_names):
        imputed[dummy] = if_measured[:, col_idx]

    # Remaining (long-gap) missing values are imputed with 0.
    imputed = imputed.fillna(value=0)
    return imputed.reindex(
        ['index', 'pid', 'ts'] + self.feat_name + dummy_names, axis=1)
def data_missing_indicator(data_train, var_type_dict, data_test=None):
    """Derive missing-value indicator features.

    Parameters
    ----------
    data_train : pd.DataFrame
        Training set to transform.
    var_type_dict : dict
        Variable-type registry with 'numeric_var' / 'category_var' lists;
        updated in place and also returned.
    data_test : pd.DataFrame, optional
        Test set; when omitted no test-set transform is performed.

    Returns
    -------
    tuple
        ``(data_train_completed, var_type_dict)`` or, when ``data_test`` is
        given, ``(data_train_completed, var_type_dict, data_test_completed)``.
    """
    numeric_feature = var_type_dict.get('numeric_var', [])
    category_feature = var_type_dict.get('category_var', [])
    print('开始进行特缺失值标记变量衍生'.center(50, '='))
    base_cols = numeric_feature + category_feature
    # Name of the derived indicator column, one per source feature.
    is_miss_feature = ['is_' + col + '_missing' for col in base_cols]
    print('原始数据维度:', data_train.shape)
    print('新增数据维度:', len(is_miss_feature))
    check_unique(numeric_feature + is_miss_feature)
    # Fit on the training data, flagging every column.
    miss_indicator = MissingIndicator(features='all')
    train_flags = miss_indicator.fit_transform(data_train[base_cols])
    data_train_completed = pd.concat(
        [data_train, pd.DataFrame(train_flags, columns=is_miss_feature)],
        axis=1)
    print('变量衍生完成:', data_train_completed.shape)
    # All indicator columns are registered as numeric variables.
    var_type_dict['numeric_var'] = numeric_feature + is_miss_feature
    # When test data is supplied, apply the fitted indicator to it as well.
    if data_test is not None:
        test_flags = miss_indicator.transform(data_test[base_cols])
        data_test_completed = pd.concat(
            [data_test, pd.DataFrame(test_flags, columns=is_miss_feature)],
            axis=1)
        return data_train_completed, var_type_dict, data_test_completed
    return data_train_completed, var_type_dict
def impute_data(X, feature_name_in):
    """Impute numeric data with the column mean and append missing flags.

    String cells equal to 'na' are first converted to np.nan.  For every
    column that actually contains missing values a binary "<name>_miss"
    indicator column is appended to the output; indicator columns that would
    be constant zero are dropped.

    Parameters
    ----------
    X : pd.DataFrame
        Input frame containing the columns in ``feature_name_in``.
        NOTE(review): modified in place when 'na' strings are replaced.
    feature_name_in : list of str
        Names of the feature columns (the list itself is not modified).

    Returns
    -------
    X_out : np.ndarray
        Mean-imputed data horizontally stacked with the kept indicators.
    feature_name_out : list of str
        ``feature_name_in`` plus one "<name>_miss" entry per column that
        contained missing values.
    """
    to_replace_dict = {'na': np.nan}
    for col in feature_name_in:
        if pd.api.types.is_string_dtype(X[col]):
            na_cnt = X[col].str.contains('na').sum()
            if na_cnt > 0:
                # BUG FIX: the original assigned the whole-frame replace
                # result to a single column; replace this column only.
                X[col] = X[col].replace(to_replace=to_replace_dict, value=None)
    indicator = MissingIndicator(error_on_new=True, features='all',
                                 missing_values=np.nan, sparse=False)
    X_binary_miss = indicator.fit_transform(X).astype(int)
    X_binary_miss_sum = np.sum(X_binary_miss, axis=0)
    feature_name_out = feature_name_in.copy()
    to_del = []
    for idx, miss_count in enumerate(X_binary_miss_sum):
        if miss_count > 0:
            feature_name_out.append(feature_name_in[idx] + "_miss")
        else:
            # No missing values: drop the constant-zero indicator column.
            to_del.append(idx)
    X_binary_miss = np.delete(X_binary_miss, to_del, axis=1)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_tr = imp.fit_transform(X)
    X_out = np.concatenate((X_tr, X_binary_miss), axis=1)
    return X_out, feature_name_out
def test_missing_indicator_sparse_param(arr_type, missing_values, param_sparse):
    """Check the output container format for each value of ``sparse``."""
    X_fit = np.array([[missing_values, missing_values, 1],
                      [4, missing_values, 2]])
    X_trans = np.array([[missing_values, missing_values, 1],
                        [4, 12, 10]])
    X_fit = arr_type(X_fit).astype(np.float64)
    X_trans = arr_type(X_trans).astype(np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    X_fit_mask = indicator.fit_transform(X_fit)
    X_trans_mask = indicator.transform(X_trans)

    def assert_dense(*masks):
        # Dense output is expected as a plain ndarray.
        for mask in masks:
            assert isinstance(mask, np.ndarray)

    def assert_csc(*masks):
        # Sparse output is always emitted in CSC format.
        for mask in masks:
            assert mask.format == 'csc'

    if param_sparse is True:
        assert_csc(X_fit_mask, X_trans_mask)
    elif param_sparse == 'auto' and missing_values == 0:
        assert_dense(X_fit_mask, X_trans_mask)
    elif param_sparse is False:
        assert_dense(X_fit_mask, X_trans_mask)
    else:
        # sparse='auto' mirrors the input container type.
        if sparse.issparse(X_fit):
            assert_csc(X_fit_mask, X_trans_mask)
        else:
            assert_dense(X_fit_mask, X_trans_mask)
def test_missing_indicator_sparse_param(arr_type, missing_values, param_sparse):
    """Verify fit/transform output format for every ``sparse`` setting."""
    fit_mat = arr_type(np.array([[missing_values, missing_values, 1],
                                 [4, missing_values, 2]])).astype(np.float64)
    trans_mat = arr_type(np.array([[missing_values, missing_values, 1],
                                   [4, 12, 10]])).astype(np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    fit_mask = indicator.fit_transform(fit_mat)
    trans_mask = indicator.transform(trans_mat)

    # Decide whether a sparse (CSC) mask is expected.
    if param_sparse is True:
        expect_sparse = True
    elif param_sparse is False or (param_sparse == 'auto'
                                   and missing_values == 0):
        expect_sparse = False
    else:
        # sparse='auto' with non-zero sentinel follows the input container.
        expect_sparse = sparse.issparse(fit_mat)

    if expect_sparse:
        assert fit_mask.format == 'csc'
        assert trans_mask.format == 'csc'
    else:
        assert isinstance(fit_mask, np.ndarray)
        assert isinstance(trans_mask, np.ndarray)
def transform(self, X):
    """Perform imputation using interpolation.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_timestamps)
        Data with missing values.

    Returns
    -------
    X_new : array-like, shape = (n_samples, n_timestamps)
        Data without missing values.
    """
    missing_values, force_all_finite = self._check_params()
    X = check_array(X, dtype='float64', force_all_finite=force_all_finite)
    n_samples, n_timestamps = X.shape
    # Boolean mask of observed (non-missing) entries, one row per sample.
    missing_mask = MissingIndicator(
        missing_values=missing_values,
        features='all',
        sparse=False,
    ).fit_transform(X)
    observed = ~missing_mask
    x_new = np.arange(n_timestamps)
    # Impute each sample independently against the full timestamp grid.
    imputed_rows = []
    for row_idx in range(n_samples):
        imputed_rows.append(
            self._impute_one_sample(X[row_idx], observed[row_idx], x_new))
    return np.asarray(imputed_rows)
def test_missing_indicator_no_missing():
    """All features are dropped when none is ever missing and
    features='missing-only' (#13491)."""
    X = np.array([[1, 1], [1, 1]])
    indicator = MissingIndicator(features='missing-only', missing_values=-1)
    mask = indicator.fit_transform(X)
    assert mask.shape[1] == 0
def test_missing_indicator_sparse_no_explicit_zeros():
    """Sparse masks must not store explicit zeros for non-missing cells
    (#13491)."""
    X = sparse.csr_matrix([[0, 1, 2], [1, 2, 0], [2, 0, 1]])
    indicator = MissingIndicator(features='all', missing_values=1)
    mask = indicator.fit_transform(X)
    # If every stored entry is True, nnz equals the number of True values.
    assert mask.getnnz() == mask.sum()
def test_missing_indicator_no_missing():
    # Regression test for #13491: when no value matches missing_values and
    # features='missing-only', the transformed mask has zero columns.
    data = np.array([[1, 1], [1, 1]])
    transformed = MissingIndicator(
        features='missing-only', missing_values=-1).fit_transform(data)
    assert transformed.shape[1] == 0
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    """End-to-end check of MissingIndicator over containers and dtypes."""
    X_fit = np.array([[missing_values, missing_values, 1],
                      [4, missing_values, 2]])
    X_trans = np.array([[missing_values, missing_values, 1],
                        [4, 12, 10]])
    X_fit_expected = np.array([[1, 1, 0], [0, 1, 0]])
    X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]])

    # Convert inputs and expectations to the requested container and dtype.
    X_fit = arr_type(X_fit).astype(dtype)
    X_trans = arr_type(X_trans).astype(dtype)
    X_fit_expected = X_fit_expected.astype(dtype)
    X_trans_expected = X_trans_expected.astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features, sparse=False)
    fit_mask = indicator.fit_transform(X_fit)
    trans_mask = indicator.transform(X_trans)

    # Shape, selected features and values of the dense boolean masks.
    assert fit_mask.shape[1] == n_features
    assert trans_mask.shape[1] == n_features
    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(fit_mask, X_fit_expected[:, features_indices])
    assert_allclose(trans_mask, X_trans_expected[:, features_indices])
    assert fit_mask.dtype == bool
    assert trans_mask.dtype == bool
    assert isinstance(fit_mask, np.ndarray)
    assert isinstance(trans_mask, np.ndarray)

    # Switching to sparse output must yield equivalent CSC boolean masks.
    indicator.set_params(sparse=True)
    fit_mask_sparse = indicator.fit_transform(X_fit)
    trans_mask_sparse = indicator.transform(X_trans)
    assert fit_mask_sparse.dtype == bool
    assert trans_mask_sparse.dtype == bool
    assert fit_mask_sparse.format == 'csc'
    assert trans_mask_sparse.format == 'csc'
    assert_allclose(fit_mask_sparse.toarray(), fit_mask)
    assert_allclose(trans_mask_sparse.toarray(), trans_mask)
def get_indicators(data):
    """Return missing-value flags for every column of ``data`` but the last.

    The result columns are renamed to 1-based indices with an 'ind_' prefix.
    """
    indicator = MissingIndicator(missing_values=np.nan, features='all')
    flags = pd.DataFrame(indicator.fit_transform(data.iloc[:, :-1]))
    # Shift to 1-based column numbering before prefixing.
    flags.columns = flags.columns + 1
    return flags.add_prefix('ind_')
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    """Dense and sparse indicator masks across array types and dtypes."""
    fit_data = np.array([[missing_values, missing_values, 1],
                         [4, 2, missing_values]])
    trans_data = np.array([[missing_values, missing_values, 1],
                           [4, 12, 10]])
    fit_expected = np.array([[1, 1, 0], [0, 0, 1]])
    trans_expected = np.array([[1, 1, 0], [0, 0, 0]])

    # Coerce inputs and expectations to the requested container and dtype.
    fit_data = arr_type(fit_data).astype(dtype)
    trans_data = arr_type(trans_data).astype(dtype)
    fit_expected = fit_expected.astype(dtype)
    trans_expected = trans_expected.astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features, sparse=False)
    fit_mask = indicator.fit_transform(fit_data)
    trans_mask = indicator.transform(trans_data)

    assert fit_mask.shape[1] == n_features
    assert trans_mask.shape[1] == n_features
    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(fit_mask, fit_expected[:, features_indices])
    assert_allclose(trans_mask, trans_expected[:, features_indices])
    assert fit_mask.dtype == bool
    assert trans_mask.dtype == bool
    assert isinstance(fit_mask, np.ndarray)
    assert isinstance(trans_mask, np.ndarray)

    # Re-run with sparse output: same values, CSC boolean matrices.
    indicator.set_params(sparse=True)
    fit_mask_sp = indicator.fit_transform(fit_data)
    trans_mask_sp = indicator.transform(trans_data)
    assert fit_mask_sp.dtype == bool
    assert trans_mask_sp.dtype == bool
    assert fit_mask_sp.format == 'csc'
    assert trans_mask_sp.format == 'csc'
    assert_allclose(fit_mask_sp.toarray(), fit_mask)
    assert_allclose(trans_mask_sp.toarray(), trans_mask)
def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type):
    """Sparse input combined with missing_values=0 must raise ValueError."""
    missing_values = 0
    X_fit = np.array([[missing_values, missing_values, 1],
                      [4, missing_values, 2]])
    X_trans = np.array([[missing_values, missing_values, 1],
                        [4, 12, 10]])

    indicator = MissingIndicator(missing_values=missing_values)

    # fit_transform on a sparse container is rejected...
    with pytest.raises(ValueError, match="Sparse input with missing_values=0"):
        indicator.fit_transform(arr_type(X_fit))

    # ...while the dense equivalent fits fine, after which a sparse
    # transform is still rejected.
    indicator.fit_transform(X_fit)
    with pytest.raises(ValueError, match="Sparse input with missing_values=0"):
        indicator.transform(arr_type(X_trans))
def test_missing_indicator_sparse_no_explicit_zeros():
    # Non-missing values must not appear as explicit zeros in the sparse
    # indicator mask (#13491).
    data = sparse.csr_matrix([[0, 1, 2], [1, 2, 0], [2, 0, 1]])
    result = MissingIndicator(
        features='all', missing_values=1).fit_transform(data)
    assert result.getnnz() == result.sum()
def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type):
    # missing_values=0 cannot be handled on a sparse container, so both
    # fit_transform and transform must reject sparse input with ValueError.
    X_fit_dense = np.array([[0, 0, 1], [4, 0, 2]])
    X_trans_dense = np.array([[0, 0, 1], [4, 12, 10]])
    X_fit_sparse = arr_type(X_fit_dense)
    X_trans_sparse = arr_type(X_trans_dense)

    mi = MissingIndicator(missing_values=0)
    with pytest.raises(ValueError, match="Sparse input with missing_values=0"):
        mi.fit_transform(X_fit_sparse)
    # Dense input is accepted; a later sparse transform still raises.
    mi.fit_transform(X_fit_dense)
    with pytest.raises(ValueError, match="Sparse input with missing_values=0"):
        mi.transform(X_trans_sparse)
def encode_with_labels_and_impute(df, strategy='mean') -> pd.DataFrame:
    """Label-encode ``df`` and impute NaNs on the label scale.

    NaN positions are recorded first, the frame is constant-imputed so that
    label encoding succeeds, the NaNs are restored, and finally the requested
    imputation strategy is applied to the encoded labels.
    """
    # Remember where the original NaNs were.
    nan_mask = MissingIndicator(features='all').fit_transform(df)
    df = impute(df, strategy='constant')  # temporary dummy fill for encoding
    df = encode_with_labels(df)
    # Put the real NaNs back so they can be imputed on the label scale.
    for col_idx in range(df.shape[1]):
        df.iloc[nan_mask[:, col_idx], col_idx] = np.nan
    return impute(df, strategy=strategy)
def fit_transform_missing_indicator(input_data: pd.DataFrame, db_name: str,
                                    sql: None) -> pd.DataFrame:
    """Append missing-value indicator columns and persist their names.

    Fits a MissingIndicator on ``input_data``, adds one ``missing_<column>``
    0/1 column per flagged feature, and stores the pickled list of new
    column names in the ``features`` table of the SQLite database.

    Parameters
    ----------
    input_data : pd.DataFrame
        Frame to derive indicators from.
    db_name : str
        Path to the SQLite database file.
    sql : None
        Unused; kept for interface compatibility.

    Returns
    -------
    pd.DataFrame
        ``input_data`` merged (on the index) with the indicator columns.
    """
    indicator = MissingIndicator()
    mask = indicator.fit_transform(input_data)
    missing_features = [
        f"missing_{input_data.columns[ii]}" for ii in list(indicator.features_)
    ]
    missing_indicator_df = pd.DataFrame(mask, columns=missing_features)
    # BUG FIX: the original discarded the replace() result, leaving the
    # columns boolean; assign it back so they become 0/1 integers.
    missing_indicator_df[missing_features] = (
        missing_indicator_df[missing_features].replace({True: 1, False: 0}))
    with sqlite3.connect(db_name) as conn:
        query = "INSERT INTO features VALUES (?,?)"
        conn.execute(query, ("missing", cloudpickle.dumps(missing_features)))
    return input_data.merge(missing_indicator_df,
                            left_index=True, right_index=True)
print(countNaN, end='\n\n')

# Build the names of the indicator columns: every column with at least one
# NaN gets its name suffixed with '_bool_NaN'.  The list is used below to
# name the columns of the indicator DataFrame.
colNan = []
for nan_count, col_name in zip(countNaN, countNaN.keys()):
    if nan_count > 0:  # this column holds at least one NaN
        colNan.append(col_name + '_bool_NaN')

# MissingIndicator configured so that np.NaN is the missing sentinel;
# fit_transform returns a boolean array marking the NaN cells of `data`.
indicator_nan = MissingIndicator(missing_values=np.NaN)
indicator_nan = indicator_nan.fit_transform(data)
dataFrame_indicatorNan = pd.DataFrame(indicator_nan, columns=colNan)
print(dataFrame_indicatorNan, end='\n\n')

# 'dataFrame_indicatorNan' records where the NaNs occur; it can be used
# later to check whether these flags help build a better model.  It is worth
# saving them before the NaN occurrences are imputed away (and thus lost).
imputeNaN = SimpleImputer(missing_values=np.NaN, strategy='median')
data_imputed_NaN = imputeNaN.fit_transform(data)
data_imputed_NaN = pd.DataFrame(data_imputed_NaN, columns=data.columns.values)
print(data_imputed_NaN, end='\n\n')  # NaNs replaced by each column's median
def missing_values_mask(X):
    """Return a boolean mask flagging missing entries in every column of X."""
    return MissingIndicator(features='all').fit_transform(X)
for feat in num_feats_imp_df:
    print(feat)
    # Clip Tukey outliers on both tails to the computed fence values.
    left_out, right_out, fence_lo, fence_hi = f.TurkyOutliers(
        imp_df, feat, drop=False)
    if (len(left_out) | len(right_out)) > 0:
        imp_df[feat].loc[left_out] = round(fence_lo, 3)
        imp_df[feat].loc[right_out] = round(fence_hi, 3)

#################################### MISSING VALUES #############################
# The numerical univariate distributions are symmetrical now, with no
# difference between median and mean, so impute all numerical missing
# values with the mean.

# Record missing values for further validations.
indicator = MissingIndicator(missing_values=np.nan)
mask_missing_values_only = indicator.fit_transform(imp_df)
mask_missing_values_only.shape

# Numerical missing-value imputation with the column means.
imp_df[num_feats_imp_df] = imp_df[num_feats_imp_df].fillna(
    value=imp_df[num_feats_imp_df].mean())

# The remaining missing values are categorical.
missing_feats_cat = f.get_missing_value_feats(imp_df)
par_num_df, par_cat_df = f.get_params(imp_df, num_feats_imp_df,
                                      cat_feats_imp_df)

# Categorical values where the mode frequency is more than 80%: impute NA
# with the mode; otherwise use the KNN model to impute the values.
mode_threshold = 80
inplace=True) # Drop rows with less than 1 sample not null X.reset_index(inplace=True) # To update the index column (remove a hole) # N.B. When we reset the index, the old index is added as a column, and a new sequential index is used X.drop(['index'], axis=1, inplace=True) # To drop the old index print(X.head()) # Map of missing values # MissingIndicator - Indicator of missing values, per column from sklearn.impute import MissingIndicator X.replace( {999.0: np.NaN}, inplace=True ) # 999 => NaN, as multimple type of missing values are not supported indicator = MissingIndicator(missing_values=np.NaN) indicator = indicator.fit_transform(X) #print(indicator) indicator = pd.DataFrame( indicator, columns=['m1', 'm3']) # The only two columns in which missing values are print(indicator) # MissingIndicator - more in depth import numpy as np from sklearn.impute import MissingIndicator X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]]) X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]]) indicator = MissingIndicator() indicator.fit(X1) # Creates the possible indicator columns (i.e., not all)
def ex_13_is_missing(df):
    """Render the missing-value mask of ``df`` as a black/white image."""
    mask = MissingIndicator(missing_values=numpy.nan).fit_transform(df)
    # True -> 255 (white pixel), False -> 0 (black pixel).
    cv2.imwrite(folder_out + 'missing.png', 255 * mask)
    return
df_info['D_types'] = df_info.index.map(df.dtypes) df_info['Blank_count'] = df_info.index.map((df=='').sum()) return df_info data_info(train) data_info(test) ###checking event rate from collections import Counter Counter(train.target) train.target.value_counts(normalize=True) ##creating NA indicator for all the columns containing NAs mindicator = MissingIndicator(missing_values=np.nan,error_on_new=False) z = mindicator.fit_transform(train.drop('target',axis = 1)) cols_na_ind = [x+'_na_ind' for x in train.columns[mindicator.features_]] train = pd.concat([train,pd.DataFrame(1*z,columns = cols_na_ind)],axis = 1) train.head(1) z = mindicator.transform(test) cols_na_ind = [x+'_na_ind' for x in test.columns[mindicator.features_]] test = pd.concat([test,pd.DataFrame(1*z,columns = cols_na_ind)],axis = 1) test.head(1) ## Treating Null values var = 'gender' #f'count of NULLs in {var} : {train[[var]].isna().sum()[0]}' train[var].value_counts(dropna = False,normalize = True) pd.crosstab(index = train[var].fillna('Nan'), columns = train.target,margins = True,normalize='index',) var = 'enrolled_university'
#features=pd.read_csv(r'C:\Users\vznam\Downloads\PredictingRatings-master\data\features.csv')
print(features.info())
print(features.shape)
features.describe()

# #### Searching for missing values

# In[6]:

from sklearn.impute import MissingIndicator

indicator = MissingIndicator(missing_values=np.NaN)
indicator = indicator.fit_transform(features)
indicator

# In[7]:

features.isnull().values.any()

# #### Standardization

# Standardization is a transformation that centres the data by removing the
# mean of each feature and then scales it by dividing the (non-constant)
# features by their standard deviation.  After standardization the mean is
# zero and the standard deviation is one.
#
# Standardization can dramatically improve model performance.  For example,
# many elements of a learning algorithm's objective function (such as the RBF
# kernel of support vector machines, or the l1 and l2 regularizers of linear
# models) assume that all features are centred around zero and have variance
# of the same order.  If one feature has a variance orders of magnitude
# larger than the others, it may dominate the objective function and prevent
# the estimator from learning correctly from the remaining features.
def test_missing_indicator_string():
    """A string sentinel works with features='all' on object arrays."""
    X = np.array([['a', 'b', 'c'], ['b', 'c', 'a']], dtype=object)
    mask = MissingIndicator(missing_values='a',
                            features='all').fit_transform(X)
    expected = np.array([[True, False, False], [False, False, True]])
    assert_array_equal(mask, expected)
def test_missing_indicator_string():
    # Object-dtype input with a string sentinel: the mask flags exactly the
    # cells equal to 'a'.
    data = np.array([['a', 'b', 'c'], ['b', 'c', 'a']], dtype=object)
    indicator = MissingIndicator(missing_values='a', features='all')
    result = indicator.fit_transform(data)
    assert_array_equal(result,
                       np.array([[True, False, False],
                                 [False, False, True]]))
def test_missing_indicator_string():
    """String missing_values are supported for object arrays."""
    X = np.array([["a", "b", "c"], ["b", "c", "a"]], dtype=object)
    X_trans = MissingIndicator(missing_values="a",
                               features="all").fit_transform(X)
    assert_array_equal(
        X_trans, np.array([[True, False, False], [False, False, True]]))
imp_mean.get_params()
imp_mean.transform(X)

# MissingIndicator: boolean mask of the missing entries.
from sklearn.impute import MissingIndicator

X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
indicator = MissingIndicator()
indicator.fit(X1)
indicator.features_
X1
indicator.transform(X1)
X2
indicator.transform(X2)

# features='all' keeps an indicator column for every input column.
indicator_all = MissingIndicator(features='all')
indicator_all.fit_transform(X1)
indicator_all.fit_transform(X2)
indicator_all.features_

# Binarizer: threshold numeric values to 0/1.
from sklearn.preprocessing import Binarizer

X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
transformer = Binarizer()
type(transformer)
transformer.fit(X)
transformer.transform(X)

# MinMaxScaler: scale each feature to the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler

data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
scaler.fit(data)