def fill_data(train_data, test_data):
    """Impute missing values with a distance-weighted 3-NN imputer.

    The imputer is fit on the training data only and then applied to both
    sets, so no information leaks from test into the fit.
    """
    knn = KNNImputer(n_neighbors=3, weights='distance')
    knn.fit(train_data)
    return knn.transform(train_data), knn.transform(test_data)
def knn2(X, x_supp, neighbors=1):
    """KNN-impute X (and optionally x_supp) with an imputer fit on both.

    Parameters
    ----------
    X : pd.DataFrame
        Primary frame to impute.
    x_supp : pd.DataFrame or None
        Supplementary frame; its columns are relabeled to X's before fitting.
    neighbors : int
        Number of neighbors for the imputer.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame or None)
        Imputed X, and imputed x_supp (None when x_supp is None).
    """
    if x_supp is not None:
        # Align column labels so concat stacks rows instead of adding columns.
        x_supp.columns = X.columns
    imp = KNNImputer(missing_values=np.nan, weights='distance',
                     n_neighbors=neighbors)
    # pd.concat silently ignores None entries, so this also works for x_supp=None.
    imp.fit(pd.concat([X, x_supp], ignore_index=True))
    X_imputed = pd.DataFrame(imp.transform(X), columns=X.columns)
    if x_supp is None:
        # Bug fix: the original guarded only the column relabeling and then
        # unconditionally transformed x_supp, crashing when it was None.
        return X_imputed, None
    return X_imputed, pd.DataFrame(imp.transform(x_supp),
                                   columns=x_supp.columns)
def fillMissingValues(trainx_df, testx_df):
    """Fit a 2-NN imputer on the training frame and apply it to both frames.

    Returns two DataFrames carrying the original column labels; the test
    frame's index is reset to a fresh RangeIndex.
    """
    knn = KNNImputer(n_neighbors=2)
    knn.fit(trainx_df)
    train_filled = pd.DataFrame(knn.transform(trainx_df),
                                columns=trainx_df.columns)
    test_filled = pd.DataFrame(knn.transform(testx_df),
                               columns=testx_df.columns)
    test_filled.reset_index(drop=True, inplace=True)
    return train_filled, test_filled
def impute_regionidcity(train, validate, test):
    """Impute missing `regionidcity` values with a 5-NN imputer.

    The imputer is fit on the train split only, then the column is replaced
    in place on train, validate, and test (the input frames are mutated).
    Returns the fitted imputer followed by the three frames.
    """
    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(train[["regionidcity"]])
    for split in (train, validate, test):
        split["regionidcity"] = imputer.transform(split[["regionidcity"]])
    return imputer, train, validate, test
def sample_knn_prediction(matrix, test_data):
    """Returns knn prediction using sample of test_data.

    Fits one KNN imputer on a bootstrap sample of the rows (students) and a
    second on a bootstrap sample of the columns (items), averages the two
    imputed matrices, and thresholds the result at 0.5.
    """
    matrix_c = np.copy(matrix.T)
    # Generalization: derive bootstrap sizes from the matrix shape instead of
    # the hard-coded 542 students / 1774 items of one specific dataset.
    n_students = matrix.shape[0]
    idx = np.random.randint(n_students, size=n_students)
    nbsr = KNNImputer(n_neighbors=11)
    nbsr.fit(matrix[idx, :])
    mat_student = nbsr.transform(matrix)
    n_items = matrix_c.shape[0]
    idx = np.random.randint(n_items, size=n_items)
    nbsr = KNNImputer(n_neighbors=21)
    nbsr.fit(matrix_c[idx, :])
    mat_item = nbsr.transform(matrix_c).T
    mat_avg = (mat_item + mat_student) * 0.5
    return sparse_matrix_predictions(test_data, mat_avg, threshold=0.5)
class KNNReplacerIQR(KNNImputer):
    """Pipeline-compliant KNNReplacer, based on IQR.

    Per-column values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as
    missing and replaced by an internal KNNImputer fitted on the
    outlier-masked training data.
    """

    def __init__(self, n_neighbors=5):
        # Subclassing KNNImputer keeps sklearn's get_params/clone machinery
        # working; the actual imputation is delegated to the composed imputer.
        super().__init__(n_neighbors=n_neighbors)
        self.lower_bound = None  # per-column lower IQR fence, set in fit()
        self.upper_bound = None  # per-column upper IQR fence, set in fit()
        self.imputer = KNNImputer(n_neighbors=n_neighbors)

    def fit(self, x, y=None):
        """Computes IQR bound and fits the imputer on the data."""
        x = pd.DataFrame(x)
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound = q1 - (1.5 * iqr)
        self.upper_bound = q3 + (1.5 * iqr)
        # Mask outliers to NaN so the imputer learns from inliers only.
        self.imputer.fit(
            x.where(~((x < self.lower_bound) | (x > self.upper_bound)),
                    np.nan))
        return self

    def transform(self, x, y=None):
        """Detects outliers and replaces them with the imputer."""
        x = pd.DataFrame(x)
        # NOTE(review): inplace masking on a frame built from the input may
        # share memory with (and mutate) the caller's data — confirm intended.
        x.where(~((x < self.lower_bound) | (x > self.upper_bound)),
                np.nan,
                inplace=True)
        return self.imputer.transform(x)
class Data:
    """Loads the train/test CSVs, KNN-imputes and robust-scales the features,
    and one-hot encodes the `target_class` labels.

    All transformers are fit on the training split and re-applied to test.
    """

    def __init__(self):
        self.label_encoder = OneHotEncoder(sparse=False)
        self.imputer = KNNImputer()
        self.scaler = RobustScaler()
        # Training split: fit every transformer here.
        train = pd.read_csv('data/train_data.csv')
        features = self.imputer.fit_transform(train.iloc[:, :-1])
        self.X_train = self.scaler.fit_transform(features)
        labels = np.array(train['target_class']).reshape(-1, 1)
        self.Y_train = self.label_encoder.fit_transform(labels)
        # Test split: apply the already-fitted transformers only.
        test = pd.read_csv('data/test_data.csv')
        features = self.imputer.transform(test.iloc[:, :-1])
        self.X_test = self.scaler.transform(features)
        labels = np.array(test['target_class']).reshape(-1, 1)
        self.Y_test = self.label_encoder.transform(labels)

    def get_training_data(self):
        """Return (X_train, Y_train)."""
        return self.X_train, self.Y_train

    def get_test_data(self):
        """Return (X_test, Y_test)."""
        return self.X_test, self.Y_test
def imputeData(self, X, imputerModel=None):
    """KNN-impute X, creating a default KNNImputer when none is supplied.

    Note: a passed-in model is re-fit on X before transforming.
    Returns (imputed DataFrame with X's columns, fitted imputer).
    """
    model = KNNImputer() if imputerModel is None else imputerModel
    model.fit(X)
    X_imp = pd.DataFrame(model.transform(X), columns=X.columns)
    return (X_imp, model)
def experiment_setting_2(X, y, runs=5, missingness=0.1):
    """Repeated stratified-CV experiment on randomly-ablated data.

    For each of `runs` repetitions, a fresh random missingness mask is applied
    to X; within each fold a KNNImputer fit on the training fold imputes the
    test fold, a second imputer builds a partially-imputed training copy
    (only originally-missing cells filled), and an EIGDecisionTree is trained
    on the (raw, imputed) training pair. Returns per-fold accuracy scores.
    """
    results = []
    for i in range(runs):
        np.random.seed(i)  # reproducible missingness pattern per run
        X_missing = make_missing_random(X, missingness)
        ss = StratifiedKFold(shuffle=True, random_state=i)
        for train_index, test_index in ss.split(X, y):
            X_train = X_missing[train_index]
            y_train = y[train_index]
            # Test-fold imputation uses an imputer fit on the training fold only.
            imputer = KNNImputer()
            imputer.fit(X_train)
            X_test = imputer.transform(X_missing[test_index])
            y_test = y[test_index]
            knnimp = KNNImputer()
            X_knn_full_imputed = knnimp.fit_transform(X_train)
            # Keep imputed values only at the originally-missing positions;
            # observed positions remain NaN in X_train_imputed.
            X_train_imputed = np.ones(X_train.shape) * np.nan
            for idx in np.argwhere(np.isnan(X_train)):
                X_train_imputed[idx[0], idx[1]] = X_knn_full_imputed[idx[0], idx[1]]
            hdt = EIGDecisionTree(max_depth=20)
            hdt.fit(X_train, X_train_imputed, y_train)
            results.append(accuracy_score(hdt.predict(X_test), y_test))
            #print(get_depth(hdt.tree), get_size(hdt.tree))
    return results
def test_knn_imputer_removes_all_na_features(na):
    """A feature entirely missing at fit time must be dropped by transform."""
    X = np.array([
        [1, 1, na, 1, 1, 1.0],
        [2, 3, na, 2, 2, 2],
        [3, 4, na, 3, 3, na],
        [6, 4, na, na, 6, 6],
    ])
    knn = KNNImputer(missing_values=na, n_neighbors=2).fit(X)
    imputed = knn.transform(X)
    assert not np.isnan(imputed).any()
    assert imputed.shape == (4, 5)
    # The all-NaN column (index 2) must also be dropped from new data.
    X_test = np.arange(0, 12).reshape(2, 6)
    imputed = knn.transform(X_test)
    assert_allclose(X_test[:, [0, 1, 3, 4, 5]], imputed)
def mvt_knn(df):
    """Streamlit helper: report missingness, 1-NN impute float64 columns,
    mode-impute the remaining NaNs, and display summaries and charts.

    Returns the cleaned frame, or the original df if anything raises.
    """
    try:
        # NOTE(review): the denominator counts non-null cells, not total
        # cells, so the reported percentage is inflated — confirm intent.
        st.info("The Percenatge of Value Missing in Given Data is : {:.2f}%".
                format(((df.isna().sum().sum()) / (df.count().sum()) * 100)))
        num_col = list(df.select_dtypes(include='float64').columns)
        # add_indicator=True appends missing-indicator columns; only the
        # first len(num_col) columns are written back below.
        knn = KNNImputer(n_neighbors=1, add_indicator=True)
        knn.fit(df[num_col])
        knn_impute = pd.DataFrame(knn.transform(df[num_col]))
        df[num_col] = knn_impute.iloc[:, :df[num_col].shape[1]]
        clean_df = df
        # Remaining (categorical) NaNs are filled with each column's mode.
        clean_df = (df.fillna(df.mode().iloc[0]))
        st.dataframe(clean_df)
        st.write("\nEmpty rows after imputing the data: \n",
                 clean_df.isnull().sum())
        st.info("Numerical data : {}".format(list(dict(df.median()).keys())))
        st.info("Categorical data : {}".format(
            list(df.select_dtypes(include='object').mode())))
        st.write('Shape of dataframe (Rows, Columns): ', df.shape)
        st.write('Data description : ', df.describe())
        st.line_chart(clean_df)
        st.info(
            "Only Numerical Data is treated using K-NN Method , Categorical Data is trreated using Mode"
        )
        return clean_df
    except Exception as e:
        # Best-effort UI path: surface the error class and return the input.
        st.write("Oops!", e.__class__, "occurred.")
        return df
def handleNull():
    """Streamlit page: show null counts for categorical/numerical columns,
    then either KNN-impute numeric nulls or list z-score (>3) outliers.

    Relies on the module-level `df` DataFrame and streamlit (`st`).
    """
    st.write(df.head())
    col1, col2 = st.beta_columns(2)
    cat_data = df.select_dtypes(include=['object']).copy()
    col1.header("Categorical data: ")
    col1.write(cat_data.head())
    col1.write('Null values: ')
    col1.write(cat_data.isna().sum())
    num_data = df.select_dtypes(include=['int64','float64']).copy()
    col2.header("Numerical data: ")
    col2.write(num_data.head())
    action = st.sidebar.selectbox(
        label="Select the action",
        options=['Handle null values', 'Handle outliers'])
    if action == 'Handle null values':
        col2.write('Null values: ')
        col2.write(num_data.isna().sum())
        imputer = KNNImputer(n_neighbors=4)
        imputer.fit(num_data)
        Xtrans = imputer.transform(num_data)
        st.write("Imputed values: ")
        st.dataframe(Xtrans)
    elif action == 'Handle outliers':
        outliers = []
        # Fix: DataFrame.iteritems() was removed in pandas 2.0; items() is
        # the supported, behavior-identical replacement.
        for (columnName, columnData) in num_data.items():
            z = np.abs(stats.zscore(columnData.values))
            outliers.append(np.where(z > 3))
        st.write(outliers)
def impute(train_df, test_df):
    """KNN-impute the non-interview columns of train/test for one period.

    The imputer is fit on the training frame only; interview-named columns
    are copied through untouched and the original indexes are preserved.

    Parameters
    ----------
    train_df: dataframe
        feature names and interview-based names
    test_df: dataframe
        feature names and interview-based names

    Returns
    ------
    imputed dataframe train
    imputed dataframe test
    """
    knnimpute = KNNImputer(n_neighbors=ut.neighbors)
    feature_cols = [c for c in train_df.columns if not re.search('interview', c)]
    interview_cols = [c for c in train_df.columns if re.search('interview', c)]
    imputed_tr = pd.DataFrame(knnimpute.fit_transform(train_df[feature_cols]),
                              columns=feature_cols, index=train_df.index)
    imputed_ts = pd.DataFrame(knnimpute.transform(test_df[feature_cols]),
                              columns=feature_cols, index=test_df.index)
    for c in interview_cols:
        imputed_tr[c] = train_df[c]
        imputed_ts[c] = test_df[c]
    return imputed_tr, imputed_ts
class FeatureExtractor(BaseEstimator):
    """Imputes a mixed categorical/numerical frame with one-hot + KNNImputer.

    Categorical columns are dummified with a *_nan indicator that is spread
    back into NaNs across that category's dummy columns, so the KNN imputer
    treats the whole category as missing; transform() reverts the dummies to
    single categorical columns, then simplifies `pdays`/`previous` and drops
    `previous`/`loan`.
    """

    def __init__(self, imputer_neighbors: int = 5):
        self.imputer = KNNImputer(n_neighbors=imputer_neighbors)
        self.cat_cols = None
        self.num_cols = None

    def _dummify(self, X):
        """One-hot encode X; convert each *_nan indicator row into NaNs over
        that category's dummy columns so KNNImputer sees them as missing."""
        X_dummy = pd.get_dummies(X, dummy_na=True)
        for col in self.cat_cols:
            X_dummy.loc[X_dummy[col + "_nan"] == 1,
                        X_dummy.columns.str.startswith(col)] = np.nan
            del X_dummy[col + "_nan"]
        return X_dummy

    def fit(self, X, y=None):
        """Learn the categorical/numerical column split and fit the imputer."""
        self.cat_cols = [
            column_name for column_name in X.columns
            if str(X[column_name].dtype) == 'object'
        ]
        self.num_cols = [
            column_name for column_name in X.columns
            if column_name not in self.cat_cols
        ]
        X[self.cat_cols] = X[self.cat_cols].astype('category')
        self.imputer.fit(self._dummify(X))
        # Bug fix: fit() must return self to satisfy the sklearn estimator
        # contract (pipelines chain fit(X).transform(X)); original returned None.
        return self

    def transform(self, X):
        """Impute X and return a frame with the original column layout."""
        X_dummy = self._dummify(X)
        X_dummy = pd.DataFrame(self.imputer.transform(X_dummy.values),
                               columns=X_dummy.columns)
        # Revert dummification: the argmax dummy column names the category.
        for col in self.cat_cols:
            X_dummy[col] = X_dummy.loc[:, X_dummy.columns.str.
                                       startswith(col)].idxmax(
                                           axis=1).str.replace(col + "_", '')
            X_dummy = X_dummy.loc[:, ~X_dummy.columns.str.startswith(col + "_")]
        # Reset categorical column types.
        X_dummy[self.cat_cols] = X_dummy[self.cat_cols].astype('category')
        # Simplify pdays & previous into binary indicators.
        X_dummy.pdays = np.where(X_dummy.pdays != 999., 1, 0)
        X_dummy.previous = np.where(X_dummy.previous >= 1., 1, 0)
        X_dummy.drop(columns=['previous','loan'], inplace=True)
        return X_dummy
def perform_imputation(X, imputer=None):
    """Impute X with the given imputer, or a fresh 5-NN uniform imputer.

    Note: a supplied imputer is also re-fit on X before transforming.
    Returns (imputed DataFrame with X's columns, the imputer used).
    """
    feature_names = X.columns
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5,
                             weights='uniform',
                             metric='nan_euclidean')
    imputer.fit(X)
    X = pd.DataFrame(imputer.transform(X), columns=feature_names)
    return X, imputer
def missing_data_imputer(X: pd.DataFrame) -> pd.DataFrame:
    """Return X with missing values filled by a default (n=5) KNNImputer,
    keeping X's column labels."""
    knn = KNNImputer()
    knn.fit(X)
    filled = pd.DataFrame(knn.transform(X))
    filled.columns = X.columns
    return filled
def test_knn_imputer_drops_all_nan_features(na):
    """A feature that is all-missing at fit time is dropped on transform."""
    X1 = np.array([[na, 1], [na, 2]])
    knn = KNNImputer(missing_values=na, n_neighbors=1)
    assert_allclose(knn.fit_transform(X1), np.array([[1], [2]]))
    # On new data the dropped column vanishes and remaining NaNs are imputed.
    X2 = np.array([[1, 2], [3, na]])
    assert_allclose(knn.transform(X2), np.array([[2], [1.5]]))
def impute_knn():
    """Small demo: fit a 2-NN uniform imputer on X_train, impute X_test,
    and print both."""
    imp = KNNImputer(n_neighbors=2, weights="uniform")
    X_train = [[1, 2], [np.nan, 3], [7, 6]]
    imp.fit(X_train)
    X_test = [[np.nan, 2], [6, np.nan], [7, 6]]
    print("X_train")
    print(X_train)
    print("imputed X_test")
    print(imp.transform(X_test))
def KNNimpute_DF(df):
    """Fill missing values in df with a 10-NN imputer.

    Returns a DataFrame whose column labels and index match the input
    (the original rebuilt the frame without metadata, silently renaming
    every column to 0..n-1).
    """
    imputer_knn = KNNImputer(n_neighbors=10)
    imputer_knn.fit(df)
    imputed = imputer_knn.transform(df)
    # Preserve the caller's column names and index on the rebuilt frame.
    return pd.DataFrame(imputed, columns=df.columns, index=df.index)
def impute_missing(df, type='knn'):
    """Impute df with the requested strategy.

    Parameters
    ----------
    df : pd.DataFrame
    type : str
        'knn' for KNNImputer or 'iterative' for IterativeImputer.

    Returns
    -------
    pd.DataFrame with the original index and columns.

    Raises
    ------
    ValueError
        For an unrecognized `type` (the original fell through to an
        unhelpful NameError on the unbound `imputer`).
    """
    if type == 'knn':
        imputer = KNNImputer()
    elif type == 'iterative':
        imputer = IterativeImputer(random_state=0)
    else:
        raise ValueError(f"unknown imputation type: {type!r}")
    imputer.fit(df)
    imputed_df = imputer.transform(df)
    return pd.DataFrame(imputed_df, index=df.index, columns=df.columns)
def test_knn_imputer_not_enough_valid_distances(na, weights):
    """When the needed feature yields only nan distances, the imputer must
    fall back to the column mean."""
    X1 = np.array([[na, 11], [na, 1], [3, na]])
    expected1 = np.array([[3, 11], [3, 1], [3, 6]])
    knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
    assert_allclose(knn.fit_transform(X1), expected1)
    assert_allclose(knn.transform(np.array([[4, na]])), np.array([[4, 6]]))
def predict(givencity): givencity = city_day[(city_day.AQI.notnull()) & (city_day.City == givencity)] #tell_me_null(givencity) corr = givencity.corr().AQI.sort_values(ascending=False) related = list(corr[corr > 0.6].index) #print(related) inter = givencity.loc[:, related].interpolate(method='linear') givencity.loc[:, related] = inter knn_imputer = KNNImputer(n_neighbors=3) imputing_cols = [ 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'B_X_O3_NH3', 'ParticulateMatters' ] # we eliminated city, date, Year_Month and AQI_Bucket because # they either were unique or had numerical substitutes in other fields(AQI_bucket) knn_imputer.fit(givencity[imputing_cols]) imputed = knn_imputer.transform(givencity[imputing_cols]) #givencity.loc[:, imputing_cols] = imputed #tell_me_null(givencity) givencity_aqi = givencity[['Date', 'AQI']] givencity_aqi.reset_index(inplace=True, drop=True) train_df = givencity_aqi train_df.rename(mapper={'Date': 'ds', 'AQI': 'y'}, axis=1, inplace=True) train_df m = Prophet(holidays_prior_scale=0, seasonality_prior_scale=20, n_changepoints=50) m.fit(train_df) future = m.make_future_dataframe(periods=365) #future.tail() forecast = m.predict(future) forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail() #df_cv = cross_validation(m, initial='1100 days', period='121 days', horizon = '365 days') #df_p = performance_metrics(df_cv) #print('Cross Validation accuracy:', (1 - df_p['mape'].mean())*100) predictions_df = pd.DataFrame(forecast, columns=['ds', 'yhat']) return predictions_df, m
def imputation(train_data, test_data):
    '''Substitute NaN values instead of deleting them.

    A 3-NN uniform imputer is fit on the train set only and then applied to
    both sets, so the model is never informed by the test data.

    :param train_data: training set
    :param test_data: test set
    :return: (imputed_train, imputed_test) as numpy arrays
    '''
    knn = KNNImputer(n_neighbors=3, weights="uniform")
    imputed_train = knn.fit_transform(train_data)
    return imputed_train, knn.transform(test_data)
def remove_missing(df, missing_type=np.nan, nan_threshold=40, impute=False):
    """Drop columns whose percentage of `missing_type` values meets or exceeds
    nan_threshold; optionally KNN-impute the remaining missing values.

    Returns the reduced (and possibly imputed) DataFrame with its original
    index and column labels.
    """
    percentages = get_percentages(df, missing_type)
    keep = percentages[percentages['percent_missing'] < nan_threshold].index.tolist()
    df = df[keep]
    if impute:
        knn = KNNImputer()
        knn.fit(df)
        df = pd.DataFrame(knn.transform(df), index=df.index, columns=df.columns)
    return df
def test_onnxt_knnimputer(self):
    """KNNImputer exported to ONNX must reproduce sklearn's transform."""
    train = numpy.array([[1, 2, numpy.nan, 12], [3, numpy.nan, 3, 13],
                         [1, 4, numpy.nan, 1], [numpy.nan, 4, 3, 12]],
                        dtype=numpy.float32)
    test = numpy.array(
        [[1.3, 2.4, numpy.nan, 1], [-1.3, numpy.nan, 3.1, numpy.nan]],
        dtype=numpy.float32)
    imputer = KNNImputer(n_neighbors=3, metric='nan_euclidean')
    imputer.fit(train)
    model_def = to_onnx(imputer, train)
    session = OnnxInference(model_def, runtime='python')
    got = session.run({'X': test})
    self.assertEqual(list(sorted(got)), ['variable'])
    self.assertEqualArray(imputer.transform(test), got['variable'], decimal=6)
class KNNKeepDf(BaseEstimator, TransformerMixin):
    """KNN imputer, but returns DF and retains column names"""

    def __init__(self):
        self.colnames_ = []
        self.knn = KNNImputer()

    def fit(self, X, y=None):
        """Remember X's column labels and fit the wrapped imputer."""
        self.colnames_ = X.columns
        self.knn.fit(X)
        return self

    def transform(self, X, y=None, **fit_params):
        """Impute X and rewrap the result as a DataFrame."""
        return pd.DataFrame(self.knn.transform(X), columns=self.colnames_)
def preprocess_with_knn_imputer_minmax_scaler(
    train_data: np.ndarray,
    test_data: np.ndarray,
    n_neighbors: int = 5,
) -> Tuple[np.ndarray, np.ndarray]:
    """KNN-impute then min-max scale train/test arrays.

    Both transformers are fit on the training data only and applied to test.
    Returns the scaled (train, test) pair.
    """
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_train = imputer.fit_transform(train_data)
    imputed_test = imputer.transform(test_data)
    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(imputed_train)
    scaled_test = scaler.transform(imputed_test)
    return scaled_train, scaled_test
def impute_values(df, imp_strategy, neighbors, numeric_vars):
    """Impute df's numeric columns ('knn' or any SimpleImputer strategy) and
    rejoin them with the untouched non-numeric columns.

    NOTE(review): the convert_to_numeric result is immediately overwritten by
    the .to_numpy() line below — it is dead unless convert_to_numeric mutates
    df in place; confirm the intent.
    """
    X = convert_to_numeric(df, numeric_vars)
    X = df[numeric_vars].to_numpy()
    other_vars = list(set(df.columns) - set(numeric_vars) )
    # Non-numeric columns pass through untouched (index reset for the join).
    X_strings = df[other_vars].reset_index(drop=True)
    if imp_strategy == "knn":
        imputer = KNNImputer(n_neighbors = neighbors) #weights = weight_type
        imputed = imputer.fit_transform(X)
        # This is very costly
        # from here https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
        # https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
        # imputed = fast_knn(X, k= neighbors)
    else:
        # Any other strategy string is passed straight to SimpleImputer.
        imputer = SimpleImputer(missing_values = np.nan, strategy = imp_strategy)
        imputer.fit(X)
        imputed = imputer.transform(X)
    X_imputed = pd.DataFrame.from_records(imputed, columns = numeric_vars)
    rv = X_strings.join(X_imputed)
    return rv
def naKNN(train_x, test_x):
    """Replace missing values in the training and test sets with KNNImputer().

    The fitted imputer is persisted to 'imputer.pkl' for the later final
    evaluation.

    :param train_x: training set wrapper (exposes a .data DataFrame)
    :param test_x: test set wrapper, or None
    :return: None
    """
    getNaCount(train_x)  # report the NaN counts for the training set
    imputer = KNNImputer(n_neighbors=3)
    filled = imputer.fit_transform(train_x.data)
    train_x.data = pd.DataFrame(filled, columns=train_x.data.columns)
    # Persist the fitted imputer for the final test run.
    save_object(
        imputer, 'imputer.pkl'
    )
    if test_x is not None:
        test_x.data = pd.DataFrame(imputer.transform(test_x.data),
                                   columns=test_x.data.columns)
def cv_preprocessing(X_train, X_test=None, random_state=None):
    """CV-fold preprocessing: KNN-impute, then clip outliers per feature group.

    Feature-group definitions are loaded from variables.json; the imputer is
    fit on X_train only and applied to X_test when provided.

    Returns (X_train, X_test) when X_test is not None, otherwise X_train.
    NOTE(review): `random_state` is currently unused.
    """
    variables_path = r"variables.json"
    with open(variables_path) as f:
        variables = json.load(f)
    t1_features, cogni = variables['t1_features'], variables['cogni']
    pcl = variables['questionnaires']['PCL'][:17]
    # Despite the name, `mice` is a KNNImputer.
    mice = KNNImputer()
    columns = X_train.columns
    X_train = pd.DataFrame(mice.fit_transform(X_train), columns=columns)
    #X_train = stds(X_train)
    #X_train = stats(X_train)
    #X_train = removal_correlated(X_train)
    # ss = StandardScaler()
    # X_train = ss.fit_transform(X_train)
    # X_train = pd.DataFrame(ss.fit_transform(X_train), columns=columns)
    if X_test is not None:
        X_test = pd.DataFrame(mice.transform(X_test), columns=columns)
        #X_test = stds(X_test)
        #X_test = stats(X_train, X_test)
        #_, X_test = removal_correlated(X_train, X_test)
        # X_test = ss.transform(X_test)
        # X_test = pd.DataFrame(ss.transform(X_test), columns=columns)
        # Outlier clipping is applied group-by-group (PHQ-9 items, cognitive
        # features, T1 features); the PCL group is currently disabled.
        X_train, X_test = outliers(
            X_train,
            X_test,
            features=[f"T1q5.{i}" for i in range(1, 10)],
            name='phq9')
        #X_train, X_test = outliers(X_train, X_test, features=pcl, name='PCL')
        X_train, X_test = outliers(X_train, X_test, features=cogni, name='cogni')
        X_train, X_test = outliers(X_train, X_test, features=t1_features, name='t1')
        return X_train, X_test
    else:
        return X_train