def imputeData(self, X, imputerModel=None):
    """Impute the missing values of ``X`` and return them as a DataFrame.

    Parameters
    ----------
    X : pd.DataFrame
        Data to impute; its column labels are reused for the result.
    imputerModel : object, optional
        Imputer exposing ``transform``. When omitted, a default
        ``KNNImputer`` is created and fitted on ``X``. A supplied imputer is
        used as-is and is not refitted.

    Returns
    -------
    tuple
        ``(X_imp, imputerModel)``: the imputed DataFrame, built from the
        transformed values with the column labels of ``X``, and the imputer
        that produced it.
    """
    model = imputerModel
    if model is None:
        model = KNNImputer()
        model.fit(X)
    X_imp = pd.DataFrame(model.transform(X), columns=X.columns)
    return (X_imp, model)
def experiment_setting_2(X, y, runs=5, missingness=0.1):
    """Cross-validate ``EIGDecisionTree`` on data with injected missing values.

    For each of ``runs`` repetitions the NumPy global seed is set to the run
    index, missing entries are injected into ``X`` via
    ``make_missing_random`` and a shuffled ``StratifiedKFold`` (seeded with
    the run index) is evaluated. In every fold the tree is fitted on the raw
    training rows plus a companion matrix that carries KNN-imputed values
    only at the positions that were NaN (NaN everywhere else). Test rows are
    imputed with a ``KNNImputer`` fitted on the training rows.

    Parameters
    ----------
    X, y : array-like
        Features and labels; indexed with the integer arrays produced by the
        splitter.
    runs : int
        Number of repetitions.
    missingness : float
        Passed through to ``make_missing_random``.

    Returns
    -------
    list
        One accuracy score per fold, over all runs.
    """
    scores = []
    for run in range(runs):
        np.random.seed(run)
        X_missing = make_missing_random(X, missingness)
        folds = StratifiedKFold(shuffle=True, random_state=run)
        for train_index, test_index in folds.split(X, y):
            X_train, y_train = X_missing[train_index], y[train_index]

            test_imputer = KNNImputer()
            test_imputer.fit(X_train)
            X_test = test_imputer.transform(X_missing[test_index])
            y_test = y[test_index]

            X_knn_full_imputed = KNNImputer().fit_transform(X_train)
            # Keep imputed values only where the training data was missing.
            X_train_imputed = np.full(X_train.shape, np.nan)
            rows, cols = np.nonzero(np.isnan(X_train))
            X_train_imputed[rows, cols] = X_knn_full_imputed[rows, cols]

            tree = EIGDecisionTree(max_depth=20)
            tree.fit(X_train, X_train_imputed, y_train)
            scores.append(accuracy_score(tree.predict(X_test), y_test))
    return scores
class KNNReplacerIQR(KNNImputer):
    """Pipeline-compliant KNN replacer for IQR outliers.

    ``fit`` learns per-column bounds ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR``. Values outside those bounds are treated as missing
    and filled in by an internal ``KNNImputer``.
    """

    def __init__(self, n_neighbors=5):
        super().__init__(n_neighbors=n_neighbors)
        self.lower_bound = None
        self.upper_bound = None
        self.imputer = KNNImputer(n_neighbors=n_neighbors)

    def _mask_outliers(self, x):
        """Return ``x`` as a new DataFrame with out-of-bounds values set to NaN."""
        frame = pd.DataFrame(x)
        is_outlier = (frame < self.lower_bound) | (frame > self.upper_bound)
        # Non-inplace ``where`` builds a new frame, so data handed in by the
        # caller is never modified.
        return frame.where(~is_outlier, np.nan)

    def fit(self, x, y=None):
        """Compute the IQR bounds and fit the imputer on outlier-masked data.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Training data.
        y : ignored

        Returns
        -------
        KNNReplacerIQR
            ``self``.
        """
        frame = pd.DataFrame(x)
        q1 = frame.quantile(0.25)
        q3 = frame.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound = q1 - (1.5 * iqr)
        self.upper_bound = q3 + (1.5 * iqr)
        self.imputer.fit(self._mask_outliers(frame))
        return self

    def transform(self, x, y=None):
        """Replace outliers (per the fitted bounds) with imputed values.

        Returns whatever the internal imputer's ``transform`` returns for the
        outlier-masked data.
        """
        return self.imputer.transform(self._mask_outliers(x))
def handleNull():
    """Render the null-value / outlier page for the module-level ``df``.

    Shows the head of ``df``, then the categorical (``object``) and numeric
    (``int64``/``float64``) parts side by side. Depending on the sidebar
    selection it either displays the KNN-imputed numeric data
    (``n_neighbors=4``) or, per numeric column, the positions whose absolute
    z-score exceeds 3.

    Returns
    -------
    None
        Everything is written to the Streamlit page.
    """
    st.write(df.head())
    # ``st.beta_columns`` was renamed to ``st.columns``; prefer the current
    # name and fall back to the beta one on old Streamlit releases.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2 = make_columns(2)

    cat_data = df.select_dtypes(include=['object']).copy()
    col1.header("Categorical data: ")
    col1.write(cat_data.head())
    col1.write('Null values: ')
    col1.write(cat_data.isna().sum())

    num_data = df.select_dtypes(include=['int64', 'float64']).copy()
    col2.header("Numerical data: ")
    col2.write(num_data.head())

    action = st.sidebar.selectbox(
        label="Select the action",
        options=['Handle null values', 'Handle outliers'])
    if action == 'Handle null values':
        col2.write('Null values: ')
        col2.write(num_data.isna().sum())
        imputer = KNNImputer(n_neighbors=4)
        imputer.fit(num_data)
        Xtrans = imputer.transform(num_data)
        st.write("Imputed values: ")
        st.dataframe(Xtrans)
    elif action == 'Handle outliers':
        # ``DataFrame.items`` is the supported spelling; ``iteritems`` was
        # removed in pandas 2.0.
        outliers = [
            np.where(np.abs(stats.zscore(column_data.values)) > 3)
            for _, column_data in num_data.items()
        ]
        st.write(outliers)
def mvt_knn(df):
    """Impute missing values and report the result on the Streamlit page.

    ``float64`` columns are imputed with a 1-nearest-neighbour
    ``KNNImputer`` and written back into ``df`` (the caller's frame is
    modified). Remaining gaps are filled with each column's mode in the
    returned copy.

    Parameters
    ----------
    df : pd.DataFrame
        Data to clean.

    Returns
    -------
    pd.DataFrame
        The cleaned frame, or ``df`` itself if any step raised (the
        exception class is then shown on the page).
    """
    try:
        total_cells = df.size
        missing_cells = df.isna().sum().sum()
        # Share of missing cells among *all* cells, not among the
        # non-missing ones.
        missing_pct = missing_cells / total_cells * 100 if total_cells else 0.0
        st.info("The Percenatge of Value Missing in Given Data is : {:.2f}%".
                format(missing_pct))

        num_col = list(df.select_dtypes(include='float64').columns)
        knn = KNNImputer(n_neighbors=1, add_indicator=True)
        knn.fit(df[num_col])
        # ``add_indicator=True`` appends indicator columns after the imputed
        # features; only the leading feature columns are kept.
        imputed_values = knn.transform(df[num_col])[:, :len(num_col)]
        # Assign the raw array so rows are matched by position; going through
        # a freshly indexed DataFrame would re-align on a 0..n-1 index.
        df[num_col] = imputed_values

        clean_df = df.fillna(df.mode().iloc[0])
        st.dataframe(clean_df)
        st.write("\nEmpty rows after imputing the data: \n",
                 clean_df.isnull().sum())
        # ``numeric_only=True`` keeps this working on frames that also hold
        # non-numeric columns.
        st.info("Numerical data : {}".format(
            list(dict(df.median(numeric_only=True)).keys())))
        st.info("Categorical data : {}".format(
            list(df.select_dtypes(include='object').mode())))
        st.write('Shape of dataframe (Rows, Columns): ', df.shape)
        st.write('Data description : ', df.describe())
        st.line_chart(clean_df)
        st.info(
            "Only Numerical Data is treated using K-NN Method , Categorical Data is trreated using Mode"
        )
        return clean_df
    except Exception as e:
        # UI boundary: report the failure and hand the input back.
        st.write("Oops!", e.__class__, "occurred.")
        return df
def fill_data(train_data, test_data):
    """Impute both data sets with a KNN imputer fitted on the training data.

    The imputer uses 3 neighbours with distance weighting.

    Parameters
    ----------
    train_data, test_data : array-like
        Training data (used for fitting) and test data.

    Returns
    -------
    tuple
        ``(train, test)``: the imputer's ``transform`` output for each input.
    """
    knn = KNNImputer(n_neighbors=3, weights='distance')
    knn.fit(train_data)
    train, test = (knn.transform(part) for part in (train_data, test_data))
    return train, test
def fit(X, y, output_dir, **kwargs):
    """DataRobot training hook: fit a KNN imputer and store it as an artifact.

    DataRobot runs this hook when the task is trained inside a blueprint.
    A ``KNNImputer`` (5 neighbours, no missing-indicator columns) is fitted
    on ``X.values`` and pickled to ``knn.pkl`` inside ``output_dir`` so that
    it can be reused later to impute new data.

    Parameters
    -------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
        Not used here.
    output_dir: str
        Path to the output folder that receives the ``knn.pkl`` artifact.

    Returns
    -------
    None
        The fitted imputer is only written to disk. Nothing is written when
        ``output_dir`` does not exist or is not a directory.
    """
    knn = KNNImputer(n_neighbors=5, add_indicator=False)
    knn.fit(X.values)

    target_dir = Path(output_dir)
    if not (target_dir.exists() and target_dir.is_dir()):
        return
    # Persist the fitted imputer for the scoring side.
    with open("{}/knn.pkl".format(output_dir), "wb") as fp:
        pickle.dump(knn, fp)
class FeatureExtractor(BaseEstimator):
    """KNN-impute a mixed categorical/numeric frame and simplify some columns.

    Categorical (``object``) columns are one-hot encoded so that
    ``KNNImputer`` can process them, imputed together with the numeric
    columns, and then collapsed back to a single categorical column each.

    Parameters
    ----------
    imputer_neighbors : int
        ``n_neighbors`` of the internal ``KNNImputer``.
    """

    def __init__(self, imputer_neighbors: int = 5):
        # Stored under the constructor argument's name so that
        # ``BaseEstimator.get_params`` / ``clone`` can read it back.
        self.imputer_neighbors = imputer_neighbors
        self.imputer = KNNImputer(n_neighbors=imputer_neighbors)
        self.cat_cols = None
        self.num_cols = None

    def _one_hot_with_nan(self, X):
        """One-hot encode ``X``; rows with a missing category get NaN dummies."""
        # Float dummies can hold NaN without a dtype change.
        X_dummy = pd.get_dummies(X, dummy_na=True, dtype=float)
        for col in self.cat_cols:
            prefix = col + "_"
            nan_col = prefix + "nan"
            # Match on "<col>_" so that unrelated columns whose names merely
            # start with ``col`` are left alone.
            X_dummy.loc[X_dummy[nan_col] == 1,
                        X_dummy.columns.str.startswith(prefix)] = np.nan
            del X_dummy[nan_col]
        return X_dummy

    def fit(self, X, y=None):
        """Record the column types of ``X`` and fit the imputer.

        ``X`` is not modified.

        Returns
        -------
        FeatureExtractor
            ``self``.
        """
        self.cat_cols = [
            column_name for column_name in X.columns
            if str(X[column_name].dtype) == 'object'
        ]
        self.num_cols = [
            column_name for column_name in X.columns
            if column_name not in self.cat_cols
        ]
        self.imputer.fit(self._one_hot_with_nan(X))
        return self

    def transform(self, X):
        """Impute ``X`` and return the simplified feature frame.

        The result is rebuilt from the imputer's output, so it carries a
        fresh default index. It contains the numeric columns, the restored
        categorical columns (``category`` dtype) and a binarized ``pdays``
        (1 when different from 999, else 0); ``previous`` and ``loan`` are
        dropped.
        """
        X_dummy = self._one_hot_with_nan(X)
        X_dummy = pd.DataFrame(self.imputer.transform(X_dummy.values),
                               columns=X_dummy.columns)

        # Revert the one-hot encoding: the strongest dummy wins.
        for col in self.cat_cols:
            prefix = col + "_"
            is_dummy = X_dummy.columns.str.startswith(prefix)
            labels = X_dummy.loc[:, is_dummy].idxmax(axis=1).str[len(prefix):]
            X_dummy = X_dummy.loc[:, ~is_dummy].copy()
            X_dummy[col] = labels

        X_dummy[self.cat_cols] = X_dummy[self.cat_cols].astype('category')

        X_dummy["pdays"] = np.where(X_dummy["pdays"] != 999., 1, 0)
        # NOTE(review): 'previous' is binarized and then dropped right away;
        # confirm which of the two is intended.
        X_dummy["previous"] = np.where(X_dummy["previous"] >= 1., 1, 0)
        X_dummy.drop(columns=['previous', 'loan'], inplace=True)
        return X_dummy
def knn_imputer(X, args=None):
    """Fit a KNN imputer on ``X`` and return it.

    Parameters
    ----------
    X : array-like
        Data the imputer is fitted on.
    args : dict, optional
        Keyword arguments forwarded to ``KNNImputer``; defaults to none.

    Returns
    -------
    sklearn.impute.KNNImputer
        The fitted imputer.
    """
    from sklearn.impute import KNNImputer

    # ``None`` sentinel instead of a shared mutable ``{}`` default.
    params = {} if args is None else args
    imp = KNNImputer(**params)
    imp.fit(X)
    return imp
def perform_imputation(X, imputer=None):
    """Impute ``X`` and return it as a DataFrame together with the imputer.

    Parameters
    ----------
    X : pd.DataFrame
        Data to impute; its column labels are reused for the result.
    imputer : object, optional
        Imputer exposing ``transform``. When omitted, a ``KNNImputer``
        (5 neighbours, uniform weights, ``nan_euclidean`` metric) is created
        and fitted on ``X``. A supplied imputer is not refitted.

    Returns
    -------
    tuple
        ``(X, imputer)``: the imputed DataFrame and the imputer used.
    """
    feature_names = X.columns
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5,
                             weights='uniform',
                             metric='nan_euclidean')
        imputer.fit(X)
    imputed = pd.DataFrame(imputer.transform(X), columns=feature_names)
    return imputed, imputer
def fillMissingValues(trainx_df, testx_df):
    """Impute train and test frames with a 2-NN imputer fitted on the train frame.

    Parameters
    ----------
    trainx_df, testx_df : pd.DataFrame
        Training features (used for fitting) and test features.

    Returns
    -------
    tuple
        ``(trainx_df_filled, testx_df_filled)``: imputed DataFrames that keep
        the column labels of their inputs.
    """
    knn = KNNImputer(n_neighbors=2)
    knn.fit(trainx_df)
    trainx_df_filled, testx_df_filled = (
        pd.DataFrame(knn.transform(frame), columns=frame.columns)
        for frame in (trainx_df, testx_df)
    )
    testx_df_filled = testx_df_filled.reset_index(drop=True)
    return trainx_df_filled, testx_df_filled
def knn2(X, x_supp, neighbors=1):
    """KNN-impute ``X`` and a supplementary frame with one shared imputer.

    The imputer (distance weighting) is fitted on the row-wise concatenation
    of ``X`` and ``x_supp``.

    Parameters
    ----------
    X : pd.DataFrame
        Main data.
    x_supp : pd.DataFrame or None
        Supplementary data. Its column labels are overwritten in place with
        those of ``X``. When ``None``, the imputer is fitted on ``X`` alone.
    neighbors : int
        ``n_neighbors`` of the imputer.

    Returns
    -------
    tuple
        ``(X_imputed, x_supp_imputed)`` as DataFrames; the second element is
        ``None`` when ``x_supp`` is ``None``.
    """
    frames = [X]
    if x_supp is not None:
        x_supp.columns = X.columns
        frames.append(x_supp)

    imp = KNNImputer(missing_values=np.nan,
                     weights='distance',
                     n_neighbors=neighbors)
    imp.fit(pd.concat(frames, ignore_index=True))

    X_imputed = pd.DataFrame(imp.transform(X), columns=X.columns)
    if x_supp is None:
        # Nothing supplementary to transform.
        return X_imputed, None
    return X_imputed, pd.DataFrame(imp.transform(x_supp),
                                   columns=x_supp.columns)
def missing_data_imputer(X: pd.DataFrame) -> pd.DataFrame:
    """Return a KNN-imputed version of ``X`` with the same column labels.

    Uses ``KNNImputer`` with its default settings, fitted on ``X`` itself.
    """
    knn = KNNImputer()
    knn.fit(X)
    return pd.DataFrame(knn.transform(X), columns=X.columns)
def impute_missing(df, type='knn'):
    """Impute ``df`` with a KNN or iterative imputer.

    Parameters
    ----------
    df : pd.DataFrame
        Data to impute.
    type : {'knn', 'iterative'}
        Imputer to use: ``KNNImputer()`` or
        ``IterativeImputer(random_state=0)``.

    Returns
    -------
    pd.DataFrame
        Imputed data with the index and columns of ``df``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported names.
    """
    if type == 'knn':
        imputer = KNNImputer()
    elif type == 'iterative':
        imputer = IterativeImputer(random_state=0)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "Unknown imputation type {!r}; expected 'knn' or 'iterative'".
            format(type))
    imputer.fit(df)
    imputed_df = imputer.transform(df)
    return pd.DataFrame(imputed_df, index=df.index, columns=df.columns)
def KNNimpute_DF(df):
    """Fill missing values of ``df`` with a 10-neighbour KNN imputer.

    Returns
    -------
    pd.DataFrame
        The imputed values wrapped in a new DataFrame. Column labels and
        index of ``df`` are not passed on to it.
    """
    knn = KNNImputer(n_neighbors=10)
    knn.fit(df)
    # Wrap the imputer's array output in a DataFrame.
    return pd.DataFrame(knn.transform(df))
def impute_knn():
    """Demonstrate KNN imputation on a small hard-coded example.

    Fits a 2-neighbour, uniformly weighted ``KNNImputer`` on a 3x2 training
    list and prints the training data followed by the imputed test data.
    """
    X_train = [[1, 2], [np.nan, 3], [7, 6]]
    X_test = [[np.nan, 2], [6, np.nan], [7, 6]]

    knn = KNNImputer(n_neighbors=2, weights="uniform")
    knn.fit(X_train)

    print("X_train", X_train, "imputed X_test", sep="\n")
    print(knn.transform(X_test))
def _get_imputer(self):
    """Build a KNN imputer for NaN values, fitted on ``self.X_train``.

    The imputer uses 2 neighbours with uniform weights.

    Returns
    -------
    sklearn.impute.KNNImputer
        The fitted imputer.
    """
    knn = KNNImputer(n_neighbors=2, weights="uniform")
    knn.fit(self.X_train)
    return knn
def predict(givencity):
    """Forecast a city's AQI one year ahead with Prophet.

    Rows of the module-level ``city_day`` frame that belong to the city and
    have a non-null AQI are selected. Only their ``Date`` and ``AQI``
    columns are fed to the model.

    Parameters
    ----------
    givencity
        Value compared against ``city_day.City``.

    Returns
    -------
    tuple
        ``(predictions_df, m)``: a DataFrame with the columns ``ds`` and
        ``yhat`` taken from the forecast (history plus 365 future periods),
        and the fitted ``Prophet`` model.
    """
    # Copy the filtered rows so that later writes never target a slice of
    # the shared ``city_day`` frame.
    city_rows = city_day[(city_day.AQI.notnull())
                         & (city_day.City == givencity)].copy()

    # Correlate numeric/bool columns only; recent pandas no longer drops
    # non-numeric columns (City, Date, ...) silently in ``corr``.
    numeric_part = city_rows.select_dtypes(include=['number', 'bool'])
    corr = numeric_part.corr().AQI.sort_values(ascending=False)
    related = list(corr[corr > 0.6].index)
    city_rows.loc[:, related] = city_rows.loc[:, related].interpolate(
        method='linear')
    # A KNN imputation of the pollutant columns is deliberately not run
    # here: nothing downstream reads those columns.

    train_df = (city_rows[['Date', 'AQI']]
                .reset_index(drop=True)
                .rename(columns={'Date': 'ds', 'AQI': 'y'}))

    m = Prophet(holidays_prior_scale=0,
                seasonality_prior_scale=20,
                n_changepoints=50)
    m.fit(train_df)
    future = m.make_future_dataframe(periods=365)
    forecast = m.predict(future)

    predictions_df = pd.DataFrame(forecast, columns=['ds', 'yhat'])
    return predictions_df, m
class KNNKeepDf(BaseEstimator, TransformerMixin):
    """KNN imputer whose output is a DataFrame labelled with the fitted columns."""

    def __init__(self):
        self.colnames_ = []
        self.knn = KNNImputer()

    def fit(self, X, y=None):
        """Remember the column labels of ``X`` and fit the imputer on it.

        Returns ``self``.
        """
        self.colnames_ = X.columns
        self.knn.fit(X)
        return self

    def transform(self, X, y=None, **fit_params):
        """Impute ``X`` and return a DataFrame with the remembered column labels.

        ``y`` and ``fit_params`` are accepted but not used.
        """
        imputed = self.knn.transform(X)
        return pd.DataFrame(imputed, columns=self.colnames_)
def test_onnxt_knnimputer(self):
    """Check that the ONNX-converted KNNImputer reproduces ``transform``.

    A 3-neighbour imputer (``nan_euclidean`` metric) is fitted on a small
    float32 matrix, converted with ``to_onnx`` and run through
    ``OnnxInference`` with the python runtime. The single output
    ``variable`` must match the imputer's own result to 6 decimals.
    """
    nan = numpy.nan
    x_train = numpy.array(
        [[1, 2, nan, 12],
         [3, nan, 3, 13],
         [1, 4, nan, 1],
         [nan, 4, 3, 12]],
        dtype=numpy.float32)
    x_test = numpy.array(
        [[1.3, 2.4, nan, 1],
         [-1.3, nan, 3.1, nan]],
        dtype=numpy.float32)

    kn = KNNImputer(n_neighbors=3, metric='nan_euclidean')
    kn.fit(x_train)

    model_def = to_onnx(kn, x_train)
    oinf = OnnxInference(model_def, runtime='python')
    got = oinf.run({'X': x_test})

    self.assertEqual(sorted(got), ['variable'])
    self.assertEqualArray(kn.transform(x_test), got['variable'], decimal=6)
def remove_missing(df, missing_type=np.nan, nan_threshold=40, impute=False):
    """Drop sparse columns and optionally KNN-impute the remainder.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    missing_type
        Forwarded to ``get_percentages``.
    nan_threshold : number
        Columns whose ``percent_missing`` value is below this threshold are
        kept.
    impute : bool
        When true, the kept columns are imputed with a default
        ``KNNImputer``.

    Returns
    -------
    pd.DataFrame
        The kept columns of ``df``, imputed if requested (index and columns
        preserved).
    """
    # ``get_percentages`` is expected to return a frame indexed by column
    # name with a 'percent_missing' column.
    percentages = get_percentages(df, missing_type)
    below_threshold = percentages['percent_missing'] < nan_threshold
    df = df[percentages[below_threshold].index.tolist()]
    if not impute:
        return df

    imputer = KNNImputer()
    imputer.fit(df)
    return pd.DataFrame(imputer.transform(df),
                        index=df.index,
                        columns=df.columns)
def impute_regionidcity(train, validate, test):
    """Impute the ``regionidcity`` feature of all three splits.

    A 5-neighbour ``KNNImputer`` is fitted on the ``regionidcity`` column of
    ``train`` only and then applied to ``train``, ``validate`` and ``test``.
    The column is overwritten in each of the passed frames.

    Returns
    -------
    tuple
        ``(imputer, train, validate, test)``.
    """
    feature = ["regionidcity"]
    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(train[feature])
    for split in (train, validate, test):
        split["regionidcity"] = imputer.transform(split[feature])
    return imputer, train, validate, test
def sample_knn_prediction(matrix, test_data):
    """Return a KNN prediction built from bootstrap samples of ``matrix``.

    Two imputations are averaged:

    * row-wise: an 11-neighbour imputer fitted on rows of ``matrix`` drawn
      with replacement (as many draws as there are rows);
    * column-wise: a 21-neighbour imputer fitted the same way on the rows of
      the transposed matrix, with the result transposed back.

    The sample sizes follow the shape of ``matrix`` instead of fixed
    dimensions. Sampling uses NumPy's global random state.

    Parameters
    ----------
    matrix : np.ndarray
        2-D matrix with NaN for missing entries.
    test_data
        Passed through to ``sparse_matrix_predictions``.

    Returns
    -------
    The result of ``sparse_matrix_predictions(test_data, average,
    threshold=0.5)``.
    """
    matrix_c = np.copy(matrix.T)

    n_rows = matrix.shape[0]
    nbsr = KNNImputer(n_neighbors=11)
    idx = np.random.randint(n_rows, size=n_rows)
    nbsr.fit(matrix[idx, :])
    mat_student = nbsr.transform(matrix)

    n_cols = matrix_c.shape[0]
    idx = np.random.randint(n_cols, size=n_cols)
    nbsr = KNNImputer(n_neighbors=21)
    nbsr.fit(matrix_c[idx, :])
    mat_item = nbsr.transform(matrix_c).T

    mat_avg = (mat_item + mat_student) * 0.5
    return sparse_matrix_predictions(test_data, mat_avg, threshold=0.5)
def MVKNN(f, g, x, col):
    """Streamlit step: KNN-impute either selected columns or the whole table.

    Parameters
    ----------
    f, g
        Widget keys for the radio button and the column multiselect.
    x : pd.DataFrame
        Table to impute. In the "Yes" branch the selected columns are
        overwritten in place.
    col
        Column names offered in the multiselect.

    Returns
    -------
    pd.DataFrame or array
        ``x`` with the selected columns imputed ("Yes"), or the imputer's
        ``transform`` output for the whole table ("No").
    """
    st.text("KNN Imputer")
    from sklearn.impute import KNNImputer
    imp = KNNImputer(n_neighbors=2)
    ch6 = st.radio("Do you want to slice the table:", ("Yes", "No"), key=f)
    if ch6 == "Yes":
        try:
            col_sel = st.multiselect("Please select the columns", col, key=g)
            imputer = imp.fit(x[col_sel])
            x[col_sel] = imputer.transform(x[col_sel])
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit and other BaseException-based
            # control flow are not swallowed.
            st.info("Select atlest one column")
    else:
        imputer = imp.fit(x)
        x = imputer.transform(x)
    return x
def impute_values(df, imp_strategy, neighbors, numeric_vars):
    """Impute the numeric columns of ``df`` and re-attach the other columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    imp_strategy : str
        ``"knn"`` selects ``KNNImputer``; any other value is passed to
        ``SimpleImputer`` as its ``strategy``.
    neighbors : int
        ``n_neighbors`` for the KNN imputer (ignored otherwise).
    numeric_vars : list
        Names of the columns to impute.

    Returns
    -------
    pd.DataFrame
        The non-numeric columns (in their original order, index reset)
        joined with the imputed numeric columns.
    """
    # NOTE(review): the helper's return value is not used; the matrix below
    # is read straight from ``df``. Presumably the helper converts ``df`` in
    # place; confirm.
    convert_to_numeric(df, numeric_vars)
    X = df[numeric_vars].to_numpy()

    # Preserve the column order of ``df``; a set difference would make the
    # output order depend on hash ordering.
    numeric_set = set(numeric_vars)
    other_vars = [
        name for name in dict.fromkeys(df.columns) if name not in numeric_set
    ]
    X_strings = df[other_vars].reset_index(drop=True)

    if imp_strategy == "knn":
        # KNN imputation is costly on large inputs.
        imputer = KNNImputer(n_neighbors=neighbors)
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy=imp_strategy)
    imputed = imputer.fit_transform(X)

    X_imputed = pd.DataFrame.from_records(imputed, columns=numeric_vars)
    return X_strings.join(X_imputed)
def transform(self, X):
    """Impute and encode ``X``.

    ``object`` columns are imputed with their most frequent value and
    one-hot encoded. ``float`` columns are imputed with a 5-neighbour
    ``KNNImputer``. Both imputers are fitted on the data passed in. Columns
    of any other dtype are not part of the result.

    Parameters
    ----------
    X : pd.DataFrame
        Input data; it is copied, not modified.

    Returns
    -------
    pd.DataFrame
        The imputed float columns joined with the dummy columns, on a fresh
        default index.
    """
    data = X.copy()
    df_str = data.select_dtypes(include=['object'])
    df_num = data.select_dtypes(include=['float'])

    # ``verbose`` is intentionally not passed: it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, and 0 was its default.
    impute_str = SimpleImputer(
        missing_values=np.nan,  # missing values are np.nan (pandas standard)
        strategy='most_frequent',  # replace with the most frequent category
        copy=True)
    impute_str.fit(X=df_str)
    df_str = pd.DataFrame.from_records(
        data=impute_str.transform(X=df_str),  # array of imputed rows
        columns=df_str.columns  # keep the original column names
    )
    str_columns = df_str.columns.values.tolist()
    print(str_columns)

    # Every string column is replaced by its dummy columns.
    df_str = pd.get_dummies(df_str, columns=str_columns)

    impute_nums = KNNImputer(missing_values=np.nan,
                             n_neighbors=5,
                             weights='uniform',
                             metric='nan_euclidean',
                             copy=True,
                             add_indicator=False)
    impute_nums.fit(X=df_num)
    df_num = pd.DataFrame.from_records(
        data=impute_nums.transform(X=df_num),  # array of imputed rows
        columns=df_num.columns  # keep the original column names
    )
    return df_num.join(df_str)
def imputer(self, method='knn'):
    '''
    Impute the missing data of ``self.dataframe``.

    Parameters
    ----------
    method : string
        Imputation method; only ``'knn'`` is implemented (5 neighbours,
        uniform weights, with missing-indicator columns appended).

    Returns
    -------
    dataframe : DataFrame
        The updated ``self.dataframe``: imputed values plus the indicator
        columns, named via ``DataMethod.get_new_column_name`` with the
        prefix ``kNN_NaN_indicator``.
    imp : object
        The fitted imputer.

    Raises
    ------
    ValueError
        If ``method`` is not ``'knn'``.
    '''
    print("Impute missing data using: " + method)
    if method != 'knn':
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            "Unsupported imputation method {!r}; only 'knn' is implemented".
            format(method))

    imp = KNNImputer(n_neighbors=5, weights="uniform", add_indicator=True)
    imp.fit(self.dataframe)
    transformed_data = imp.transform(self.dataframe)

    # Columns beyond the original width are the missing-value indicators.
    new_length_added = transformed_data.shape[1] - self.dataframe.shape[1]
    new_column_name = DataMethod.get_new_column_name(
        length_new_matrix=new_length_added, prefix="kNN_NaN_indicator")
    self.dataframe = pd.DataFrame(transformed_data,
                                  columns=list(self.dataframe.columns) +
                                  new_column_name)
    return self.dataframe, imp
def zillow_impute_city(df):
    """Impute ``regionidcity`` from a latitude/longitude-derived distance.

    Adds a ``haversine_distance`` column to ``df`` (computed by ``haversine``
    from latitude and longitude, each divided by 1,000,000), imputes
    ``regionidcity`` with a 1-nearest-neighbour ``KNNImputer`` over those two
    columns and merges the result back on ``parcelid``.

    Returns
    -------
    pd.DataFrame
        The merged frame with the imputed ``regionid_city`` column; the
        original ``regionidcity`` and the helper ``haversine`` column are
        dropped.
    """
    scaled_lat = df.latitude / 1000000
    scaled_lon = df.longitude / 1000000
    df['haversine_distance'] = [
        haversine(x, y) for x, y in zip(scaled_lat, scaled_lon)
    ]

    features = ['haversine_distance', 'regionidcity']
    knn_imputer = KNNImputer(n_neighbors=1)
    knn_imputer.fit(df[features])
    imputed = pd.DataFrame(knn_imputer.transform(df[features]),
                           columns=['haversine', 'regionid_city'],
                           index=df.parcelid)

    merged = pd.merge(df, imputed, left_on='parcelid', right_on='parcelid')
    return merged.drop(columns=['haversine', 'regionidcity'])
def handleNull(df):
    """Render the null-value / outlier page for ``df``.

    Shows the categorical (``object``) and numeric (``int64``/``float64``)
    parts of ``df`` side by side. Depending on the sidebar selection it
    either displays the KNN-imputed numeric data (``n_neighbors=4``) or a box
    plot with an optional outlier-removal step. The plot options come from
    the module-level ``non_numeric_columns`` and ``numeric_columns``.

    Outlier removal works per numeric column: values whose absolute deviation
    from the column median exceeds 4.5 times the median absolute deviation
    are replaced by NaN in the displayed copy.

    Returns
    -------
    None
        Everything is written to the Streamlit page.
    """
    # ``st.beta_columns`` was renamed to ``st.columns``; prefer the current
    # name and fall back to the beta one on old Streamlit releases.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2 = make_columns(2)

    cat_data = df.select_dtypes(include=['object']).copy()
    col1.header("Categorical data: ")
    col1.write(cat_data.head())
    col1.write('Null values: ')
    col1.write(cat_data.isna().sum())

    num_data = df.select_dtypes(include=['int64', 'float64']).copy()
    col2.header("Numerical data: ")
    col2.write(num_data.head())

    action = st.sidebar.selectbox(
        label="Select the action",
        options=['Handle null values', 'Handle outliers'])
    if action == 'Handle null values':
        col2.write('Null values: ')
        col2.write(num_data.isna().sum())
        imputer = KNNImputer(n_neighbors=4)
        imputer.fit(num_data)
        Xtrans = imputer.transform(num_data)
        st.write("Imputed values: ")
        st.dataframe(Xtrans)
    elif action == 'Handle outliers':
        st.sidebar.write("Outlier plot settings: ")
        x_val = st.sidebar.selectbox(label="Select x-axis value",
                                     options=non_numeric_columns)
        y_val = st.sidebar.selectbox(label="Select y-axis value",
                                     options=numeric_columns)
        colour = st.sidebar.selectbox(label="Select color value",
                                      options=non_numeric_columns)
        plot = px.box(df, x=x_val, y=y_val, color=colour)
        st.plotly_chart(plot)
        if st.button('Remove Outliers'):
            st.write(df.shape)
            for column in num_data:
                values = num_data[column]
                # Work on the single column: deviation from its own median,
                # cut off at 4.5 times the median absolute deviation.
                deviation = (values - values.median()).abs()
                cutoff = deviation.median() * 4.5
                num_data[column] = values.where(~(deviation > cutoff))
            st.write("Modified dataset")
            st.dataframe(num_data)
            st.write(num_data.shape)
def zillow_impute(df):
    """Fill or drop missing Zillow values and impute ``regionidcity``.

    Steps, in order:

    * ``heatingorsystemdesc`` gaps become ``'None'``,
      ``heatingorsystemtypeid`` gaps become 13 and
      ``buildingqualitytypeid`` gaps become the column median (written into
      the passed frame);
    * ``calculatedbathnbr``, ``propertyzoningdesc`` and ``unitcnt`` are
      dropped;
    * a ``haversine_distance`` column is computed by ``haversine`` from
      latitude and longitude (each divided by 1,000,000);
    * ``regionidcity`` is imputed with a 1-nearest-neighbour ``KNNImputer``
      over distance and city id, merged back on ``parcelid`` as
      ``regionid_city``;
    * rows that still contain missing values are dropped.

    Returns
    -------
    pd.DataFrame
        The cleaned frame.
    """
    df['heatingorsystemdesc'] = df['heatingorsystemdesc'].fillna('None')
    df['heatingorsystemtypeid'] = df['heatingorsystemtypeid'].fillna(13)
    quality_median = df['buildingqualitytypeid'].median()
    df['buildingqualitytypeid'] = df['buildingqualitytypeid'].fillna(
        quality_median)

    df = df.drop(
        columns=['calculatedbathnbr', 'propertyzoningdesc', 'unitcnt'])

    scaled_lat = df.latitude / 1000000
    scaled_lon = df.longitude / 1000000
    df['haversine_distance'] = [
        haversine(x, y) for x, y in zip(scaled_lat, scaled_lon)
    ]

    features = ['haversine_distance', 'regionidcity']
    knn_imputer = KNNImputer(n_neighbors=1)
    knn_imputer.fit(df[features])
    imputed = pd.DataFrame(knn_imputer.transform(df[features]),
                           columns=['haversine', 'regionid_city'],
                           index=df.parcelid)

    merged = pd.merge(df, imputed, left_on='parcelid', right_on='parcelid')
    merged = merged.drop(columns=['haversine', 'regionidcity'])
    return merged.dropna()