Ejemplo n.º 1
0
 def imputeData(self, X, imputerModel=None):
     """Fill missing values of ``X`` and hand back the imputer that was used.

     Parameters
     ----------
     X : pd.DataFrame
         Data to impute; its column labels are reused for the result.
     imputerModel : object, optional
         Already-fitted imputer exposing ``transform``. When omitted, a
         ``KNNImputer`` with default settings is fitted on ``X``.

     Returns
     -------
     tuple
         ``(X_imp, imputerModel)``: the imputed frame (built without passing
         ``X``'s index on) and the imputer, so it can be reused on other data.
     """
     model = imputerModel
     if model is None:
         model = KNNImputer().fit(X)
     X_imp = pd.DataFrame(model.transform(X), columns=X.columns)
     return X_imp, model
Ejemplo n.º 2
0
def experiment_setting_2(X, y, runs=5, missingness=0.1):
    """Cross-validate ``EIGDecisionTree`` on data with injected missing values.

    For every run the NumPy RNG is seeded with the run index,
    ``make_missing_random`` is asked for a version of ``X`` with missing
    entries, and a shuffled ``StratifiedKFold`` split is evaluated.

    Parameters
    ----------
    X, y : array-like
        Features and labels; both are indexed with integer index arrays.
    runs : int
        Number of seeded repetitions.
    missingness : float
        Passed through to ``make_missing_random``.

    Returns
    -------
    list
        One accuracy score per fold per run, in evaluation order.
    """
    results = []
    for run in range(runs):
        np.random.seed(run)
        X_missing = make_missing_random(X, missingness)

        folds = StratifiedKFold(shuffle=True, random_state=run)

        for train_index, test_index in folds.split(X, y):
            X_train = X_missing[train_index]
            y_train = y[train_index]
            y_test = y[test_index]

            # A single imputer fitted on the training fold serves both the
            # test fold and the training fold itself; fitting a second,
            # identically configured KNNImputer on the same data only
            # repeated the work.
            imputer = KNNImputer().fit(X_train)
            X_test = imputer.transform(X_missing[test_index])
            X_knn_full_imputed = imputer.transform(X_train)

            # Keep imputed values only in cells that were missing in the
            # training data; every observed cell stays NaN.
            missing = np.isnan(X_train)
            X_train_imputed = np.full(X_train.shape, np.nan)
            X_train_imputed[missing] = X_knn_full_imputed[missing]

            hdt = EIGDecisionTree(max_depth=20)
            hdt.fit(X_train, X_train_imputed, y_train)
            # accuracy_score expects (y_true, y_pred); the value is the same
            # either way, but the conventional order avoids confusion.
            results.append(accuracy_score(y_test, hdt.predict(X_test)))

    return results
Ejemplo n.º 3
0
class KNNReplacerIQR(KNNImputer):
    """Pipeline-compliant KNNReplacer, based on IQR.

    Values outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds learned per
    column in ``fit``) are treated as missing and filled by an internal
    ``KNNImputer``.
    """
    def __init__(self, n_neighbors=5):
        super().__init__(n_neighbors=n_neighbors)
        self.lower_bound = None
        self.upper_bound = None
        self.imputer = KNNImputer(n_neighbors=n_neighbors)

    def _mask_outliers(self, x):
        """Return a new frame in which out-of-bounds values are NaN."""
        x = pd.DataFrame(x)
        is_outlier = (x < self.lower_bound) | (x > self.upper_bound)
        # Non-inplace on purpose: pd.DataFrame(x) may share memory with the
        # caller's data, so an in-place mask could write NaNs into the input.
        return x.where(~is_outlier, np.nan)

    def fit(self, x, y=None):
        """Computes IQR bound and fits the imputer on the data."""
        x = pd.DataFrame(x)
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound = q1 - (1.5 * iqr)
        self.upper_bound = q3 + (1.5 * iqr)
        self.imputer.fit(self._mask_outliers(x))
        return self

    def transform(self, x, y=None):
        """Detects outliers and replaces them with the imputer.

        The input is left untouched; the imputer's output is returned.
        """
        return self.imputer.transform(self._mask_outliers(x))
Ejemplo n.º 4
0
def handleNull():
    """Streamlit view for inspecting nulls and outliers of the module-level ``df``.

    Shows the categorical and numerical parts of ``df`` side by side and,
    depending on the sidebar selection, either displays the KNN-imputed
    numerical data or lists, per numerical column, the positions whose
    absolute z-score exceeds 3.
    """
    st.write(df.head())
    # ``st.beta_columns`` is the older name of ``st.columns``; use whichever
    # this Streamlit release provides.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2 = make_columns(2)
    cat_data = df.select_dtypes(include=['object']).copy()
    col1.header("Categorical data: ")
    col1.write(cat_data.head())
    col1.write('Null values: ')
    col1.write(cat_data.isna().sum())
    num_data = df.select_dtypes(include=['int64', 'float64']).copy()
    col2.header("Numerical data: ")
    col2.write(num_data.head())
    action = st.sidebar.selectbox(
        label="Select the action",
        options=['Handle null values', 'Handle outliers'])

    if action == 'Handle null values':
        col2.write('Null values: ')
        col2.write(num_data.isna().sum())
        imputer = KNNImputer(n_neighbors=4)
        imputer.fit(num_data)
        Xtrans = imputer.transform(num_data)
        st.write("Imputed values: ")
        st.dataframe(Xtrans)
    elif action == 'Handle outliers':
        # ``DataFrame.iteritems`` no longer exists in pandas 2.x; ``items``
        # yields the same (label, Series) pairs on old and new releases.
        outliers = [
            np.where(np.abs(stats.zscore(column_data.values)) > 3)
            for _, column_data in num_data.items()
        ]
        st.write(outliers)
Ejemplo n.º 5
0
def mvt_knn(df):
    """Impute ``df`` (KNN for float64 columns, mode for the rest) and render it.

    The float64 columns of ``df`` are overwritten in place with 1-nearest-
    neighbour imputed values; remaining gaps are filled with each column's
    mode in the returned frame. Summary information is written to the
    Streamlit page along the way.

    Parameters
    ----------
    df : pd.DataFrame
        Data to clean; its float64 columns are modified in place.

    Returns
    -------
    pd.DataFrame
        The cleaned frame, or ``df`` itself if any step raised.
    """
    try:
        # Share of missing cells among ALL cells (df.count() only counts the
        # non-missing ones, which overstated the percentage).
        total_cells = df.size
        missing_pct = (df.isna().sum().sum() / total_cells *
                       100) if total_cells else 0.0
        st.info("The Percenatge of Value Missing in Given Data is : {:.2f}%".
                format(missing_pct))
        num_col = list(df.select_dtypes(include='float64').columns)
        knn = KNNImputer(n_neighbors=1, add_indicator=True)
        knn.fit(df[num_col])
        # add_indicator appends indicator columns after the imputed features;
        # keep only the leading feature columns. Assigning the raw array
        # avoids index alignment, which produced NaNs for any df whose index
        # is not 0..n-1.
        df[num_col] = knn.transform(df[num_col])[:, :len(num_col)]
        clean_df = df.fillna(df.mode().iloc[0])
        st.dataframe(clean_df)
        st.write("\nEmpty rows  after imputing the data: \n",
                 clean_df.isnull().sum())
        # numeric_only=True: a plain median() raises on non-numeric columns
        # in pandas 2.x.
        st.info("Numerical data : {}".format(
            list(df.median(numeric_only=True).index)))
        st.info("Categorical data : {}".format(
            list(df.select_dtypes(include='object').mode())))
        st.write('Shape of dataframe (Rows, Columns): ', df.shape)
        st.write('Data description : ', df.describe())
        st.line_chart(clean_df)
        st.info(
            "Only Numerical Data is treated using K-NN Method , Categorical Data is trreated using Mode"
        )
        return clean_df

    except Exception as e:
        # Best-effort UI helper: report the failure and return the input.
        st.write("Oops!", e.__class__, "occurred.")
        return df
Ejemplo n.º 6
0
def fill_data(train_data, test_data):
    """Impute both data sets with a 3-NN, distance-weighted imputer.

    The imputer is fitted on ``train_data`` only and then applied to the
    training and the test data.

    Returns
    -------
    tuple
        ``(train, test)`` as returned by the imputer's ``transform``.
    """
    knn = KNNImputer(n_neighbors=3, weights='distance').fit(train_data)
    filled_train = knn.transform(train_data)
    filled_test = knn.transform(test_data)
    return filled_train, filled_test
Ejemplo n.º 7
0
def fit(X, y, output_dir, **kwargs):
    """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data
    DataRobot runs this hook when the task is being trained inside a blueprint.
    As an output, this hook is expected to create an artifact containing a trained object [in this example - a fitted KNN imputer], that is then used to transform new data.
    The input parameters are passed by DataRobot based on project and blueprint configuration.

    Parameters
    -------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        A path to the output folder; the artifact [in this example - knn.pkl, the pickled imputer] must be saved into this folder to be re-used in transform().

    Returns
    -------
    None
        fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir
        so that the trained object can be used during scoring inside transform()
    """

    # Fit a 5-nearest-neighbour imputer on the raw training values
    knn = KNNImputer(n_neighbors=5, add_indicator=False)
    knn.fit(X.values)

    # dump the trained object
    # into an artifact [in this example - knn.pkl]
    # and save it into output_dir so that it can be used later to impute on new data
    output_dir_path = Path(output_dir)
    # NOTE: nothing is written (and nothing is reported) when output_dir is
    # missing or is not a directory.
    if output_dir_path.is_dir():
        with open(output_dir_path / "knn.pkl", "wb") as fp:
            pickle.dump(knn, fp)
Ejemplo n.º 8
0
class FeatureExtractor(BaseEstimator):
    """KNN-impute mixed numeric/categorical frames and simplify a few columns.

    Categorical (object-dtype) columns are one-hot encoded so that
    ``KNNImputer`` can fill them, then collapsed back to a single
    categorical column per feature. ``transform`` additionally binarises
    ``pdays`` and drops the ``previous`` and ``loan`` columns.
    """

    def __init__(self, imputer_neighbors: int = 5):
        # Stored under the constructor argument's name so that
        # BaseEstimator.get_params() / clone() can read it back.
        self.imputer_neighbors = imputer_neighbors
        self.imputer = KNNImputer(n_neighbors=imputer_neighbors)
        self.cat_cols = None
        self.num_cols = None

    def _dummify(self, X):
        """One-hot encode ``X``; dummies of a missing category become NaN."""
        # dtype=float keeps the dummy columns able to hold NaN.
        X_dummy = pd.get_dummies(X, dummy_na=True, dtype=float)
        for col in self.cat_cols:
            prefix = col + "_"
            nan_col = prefix + "nan"
            # Matching on "<col>_" (not just "<col>") keeps unrelated columns
            # that merely share a name prefix out of the mask.
            X_dummy.loc[X_dummy[nan_col] == 1,
                        X_dummy.columns.str.startswith(prefix)] = np.nan
            del X_dummy[nan_col]
        return X_dummy

    def fit(self, X, y=None):
        """Record the column roles of ``X`` and fit the imputer.

        Returns ``self`` so the estimator can be chained or used in a
        Pipeline. The caller's frame is not modified.
        """
        self.cat_cols = [
            column_name for column_name in X.columns
            if str(X[column_name].dtype) == 'object'
        ]
        self.num_cols = [
            column_name for column_name in X.columns
            if column_name not in self.cat_cols
        ]
        # convert categorical columns to categorical type (on a copy)
        X = X.copy()
        X[self.cat_cols] = X[self.cat_cols].astype('category')

        # one hot encode to be able to use KNNImputation; fit on the raw
        # values because transform() also passes raw values
        self.imputer.fit(self._dummify(X).values)
        return self

    def transform(self, X):
        """Impute ``X`` and return the simplified feature frame.

        The result is a new frame whose index is not taken from ``X``.
        """
        # one hot encode to be able to use KNNImputation
        X_dummy = self._dummify(X)

        X_dummy = pd.DataFrame(self.imputer.transform(X_dummy.values),
                               columns=X_dummy.columns)

        # revert dummification: pick the strongest dummy per feature and
        # strip the "<col>_" prefix to recover the category label
        for col in self.cat_cols:
            prefix = col + "_"
            dummies = X_dummy.loc[:, X_dummy.columns.str.startswith(prefix)]
            X_dummy[col] = dummies.idxmax(axis=1).str[len(prefix):]
            X_dummy = X_dummy.loc[:, ~X_dummy.columns.str.startswith(prefix)]

        # reset categorical column types
        X_dummy[self.cat_cols] = X_dummy[self.cat_cols].astype('category')

        # simplify pdays & previous
        X_dummy.pdays = np.where(X_dummy.pdays != 999., 1, 0)
        # NOTE(review): 'previous' is recoded here and then dropped on the
        # next line - confirm which of the two is intended.
        X_dummy.previous = np.where(X_dummy.previous >= 1., 1, 0)
        X_dummy.drop(columns=['previous', 'loan'], inplace=True)

        return X_dummy
Ejemplo n.º 9
0
def knn_imputer(X, args=None):
    """
    KNN imputation: build a ``KNNImputer`` and fit it on ``X``.

    Parameters
    ----------
    X : array-like
        Data the imputer is fitted on.
    args : dict, optional
        Keyword arguments forwarded to ``KNNImputer``; ``None`` means the
        library defaults (a ``None`` default avoids a shared mutable dict).

    Returns
    -------
    KNNImputer
        The fitted imputer.
    """
    from sklearn.impute import KNNImputer
    imp = KNNImputer(**(args or {}))
    imp.fit(X)
    return imp
Ejemplo n.º 10
0
def perform_imputation(X, imputer=None):
    """Impute ``X`` and return it as a DataFrame together with the imputer.

    Parameters
    ----------
    X : pd.DataFrame
        Data to impute; its column labels are reused for the result.
    imputer : object, optional
        Fitted imputer exposing ``transform``. When ``None`` a uniform
        5-NN ``KNNImputer`` with the ``nan_euclidean`` metric is fitted on
        ``X``.

    Returns
    -------
    tuple
        ``(imputed_frame, imputer)``.
    """
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5,
                             weights='uniform',
                             metric='nan_euclidean').fit(X)
    feature_names = X.columns
    filled = pd.DataFrame(imputer.transform(X), columns=feature_names)
    return filled, imputer
Ejemplo n.º 11
0
def fillMissingValues(trainx_df,testx_df):
    """Impute train and test features with a 2-NN imputer fitted on train.

    Returns
    -------
    tuple
        ``(train_filled, test_filled)`` as DataFrames carrying the column
        labels of their respective inputs; the test frame's index is reset.
    """
    knn = KNNImputer(n_neighbors=2).fit(trainx_df)

    def _filled_frame(source):
        # Wrap the imputer output back into a frame with the source's labels.
        return pd.DataFrame(knn.transform(source), columns=source.columns)

    train_filled = _filled_frame(trainx_df)
    test_filled = _filled_frame(testx_df).reset_index(drop=True)
    return train_filled, test_filled
Ejemplo n.º 12
0
def knn2(X, x_supp, neighbors=1):
    """Impute ``X`` (and optionally ``x_supp``) with a distance-weighted KNN.

    The imputer is fitted on the rows of ``X`` stacked with those of
    ``x_supp`` when the latter is given.

    Parameters
    ----------
    X : pd.DataFrame
        Primary data.
    x_supp : pd.DataFrame or None
        Supplementary data; it is treated as having the same columns as
        ``X`` (the labels of ``X`` are applied to a copy).
    neighbors : int
        Number of neighbours used by the imputer.

    Returns
    -------
    tuple
        ``(X_imputed, x_supp_imputed)``; the second item is ``None`` when
        no supplementary data was passed.
    """
    frames = [X]
    if x_supp is not None:
        # Relabel a copy so the caller's frame keeps its own column names.
        x_supp = x_supp.copy()
        x_supp.columns = X.columns
        frames.append(x_supp)
    imp = KNNImputer(missing_values=np.nan,
                     weights='distance',
                     n_neighbors=neighbors)
    imp.fit(pd.concat(frames, ignore_index=True))
    X_imputed = pd.DataFrame(imp.transform(X), columns=X.columns)
    if x_supp is None:
        # The entry guard allows None, so finish cleanly instead of failing
        # on imp.transform(None).
        return X_imputed, None
    return X_imputed, pd.DataFrame(imp.transform(x_supp),
                                   columns=x_supp.columns)
Ejemplo n.º 13
0
    def missing_data_imputer(X: pd.DataFrame) -> pd.DataFrame:
        """Return a KNN-imputed copy of ``X`` (KNNImputer defaults, n=5).

        The result reuses the column labels of ``X``.
        """
        knn = KNNImputer().fit(X)
        filled = pd.DataFrame(knn.transform(X))
        filled.columns = X.columns
        return filled
def impute_missing(df, type='knn'):
    """Return an imputed copy of ``df`` with the same index and columns.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing missing values.
    type : str
        ``'knn'`` for ``KNNImputer`` (defaults) or ``'iterative'`` for
        ``IterativeImputer(random_state=0)``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported strategies.
    """
    if type == 'knn':
        imputer = KNNImputer()
    elif type == 'iterative':
        imputer = IterativeImputer(random_state=0)
    else:
        # Previously an unknown value fell through to an unbound ``imputer``.
        raise ValueError(
            "Unsupported imputation type {!r}; expected 'knn' or "
            "'iterative'".format(type))
    imputer.fit(df)
    imputed_df = imputer.transform(df)
    return pd.DataFrame(imputed_df, index=df.index, columns=df.columns)
def KNNimpute_DF(df):
    """Fill missing values of ``df`` with a 10-NN imputer.

    Returns a new DataFrame built from the imputer's output array; no index
    or column labels are passed to it.
    """
    knn = KNNImputer(n_neighbors=10)
    filled_values = knn.fit(df).transform(df)
    # wrap the numpy output in a DataFrame
    return pd.DataFrame(filled_values)
Ejemplo n.º 16
0
def impute_knn():
    """Demo: fit a uniform 2-NN imputer on a toy set and print the imputed test rows."""
    X_train = [[1, 2], [np.nan, 3], [7, 6]]
    X_test = [[np.nan, 2], [6, np.nan], [7, 6]]
    knn = KNNImputer(n_neighbors=2, weights="uniform").fit(X_train)

    for heading, payload in (("X_train", X_train),
                             ("imputed X_test", knn.transform(X_test))):
        print(heading)
        print(payload)
Ejemplo n.º 17
0
    def _get_imputer(self):
        """Build a uniform 2-NN imputer fitted on ``self.X_train``.

        Returns
        -------
        sklearn.impute.KNNImputer
            The fitted imputer, used to fill nan values.
        """
        return KNNImputer(n_neighbors=2, weights="uniform").fit(self.X_train)
def predict(givencity):
    """Forecast one year of AQI for a city with Prophet.

    Rows of the module-level ``city_day`` frame that belong to ``givencity``
    and have a non-null AQI are selected; the columns whose correlation
    with AQI exceeds 0.6 are linearly interpolated; and a Prophet model is
    trained on the ``Date`` / ``AQI`` pair and used to predict 365 periods
    past the training data.

    Parameters
    ----------
    givencity : str
        Value matched against ``city_day.City``.

    Returns
    -------
    tuple
        ``(predictions_df, m)``: a frame with columns ``ds`` and ``yhat``,
        and the fitted Prophet model.
    """
    # .copy(): the writes below must go to an independent frame, not to a
    # slice of city_day (chained assignment).
    givencity = city_day[(city_day.AQI.notnull())
                         & (city_day.City == givencity)].copy()

    # Correlate numeric columns only; corr() on a frame that still holds
    # text columns raises in pandas 2.x.
    corr = (givencity.select_dtypes(include=['number', 'bool'])
            .corr().AQI.sort_values(ascending=False))
    related = list(corr[corr > 0.6].index)

    givencity.loc[:, related] = givencity.loc[:, related].interpolate(
        method='linear')

    # The KNN imputation that used to run here was dropped: its result was
    # never written back, and only Date and AQI feed the model below.

    train_df = (givencity[['Date', 'AQI']]
                .reset_index(drop=True)
                .rename(columns={'Date': 'ds', 'AQI': 'y'}))

    m = Prophet(holidays_prior_scale=0,
                seasonality_prior_scale=20,
                n_changepoints=50)

    m.fit(train_df)
    future = m.make_future_dataframe(periods=365)
    forecast = m.predict(future)

    predictions_df = pd.DataFrame(forecast, columns=['ds', 'yhat'])

    return predictions_df, m
Ejemplo n.º 19
0
class KNNKeepDf(BaseEstimator, TransformerMixin):
    """``KNNImputer`` wrapper whose output is a DataFrame with the fitted column names."""

    def __init__(self):
        self.colnames_ = []
        self.knn = KNNImputer()

    def fit(self, X, y=None):
        """Remember the column labels of ``X`` and fit the wrapped imputer."""
        self.colnames_ = X.columns
        self.knn.fit(X)
        return self

    def transform(self, X, y=None, **fit_params):
        """Impute ``X`` and label the result with the columns seen in ``fit``."""
        imputed_values = self.knn.transform(X)
        return pd.DataFrame(imputed_values, columns=self.colnames_)
 def test_onnxt_knnimputer(self):
     """The ONNX conversion of a fitted KNNImputer must match sklearn's output."""
     nan = numpy.nan
     x_train = numpy.array(
         [[1, 2, nan, 12],
          [3, nan, 3, 13],
          [1, 4, nan, 1],
          [nan, 4, 3, 12]],
         dtype=numpy.float32)
     x_test = numpy.array(
         [[1.3, 2.4, nan, 1],
          [-1.3, nan, 3.1, nan]],
         dtype=numpy.float32)

     imputer = KNNImputer(n_neighbors=3, metric='nan_euclidean').fit(x_train)
     onnx_model = to_onnx(imputer, x_train)
     session = OnnxInference(onnx_model, runtime='python')
     outputs = session.run({'X': x_test})

     # A single output named 'variable' is expected, equal to sklearn's result.
     self.assertEqual(sorted(outputs), ['variable'])
     self.assertEqualArray(imputer.transform(x_test),
                           outputs['variable'],
                           decimal=6)
def remove_missing(df, missing_type=np.nan, nan_threshold=40, impute=False):
    """Drop columns with too many missing values, optionally imputing the rest.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    missing_type :
        Forwarded to ``get_percentages`` together with ``df``.
    nan_threshold : number
        Columns whose ``percent_missing`` is below this value are kept.
    impute : bool
        When true, the kept columns are filled with a default ``KNNImputer``.

    Returns
    -------
    pd.DataFrame
        The reduced (and possibly imputed) frame with the original index.
    """
    percentages = get_percentages(df, missing_type)
    keep = percentages['percent_missing'] < nan_threshold
    df = df[percentages[keep].index.tolist()]

    if not impute:
        return df

    knn = KNNImputer().fit(df)
    return pd.DataFrame(knn.transform(df), index=df.index, columns=df.columns)
Ejemplo n.º 22
0
def impute_regionidcity(train, validate, test):
    """
    Impute the ``regionidcity`` column of all three splits.

    A 5-neighbour KNNImputer is fitted on the ``regionidcity`` column of
    ``train`` and then used to overwrite that column, in place, in
    ``train``, ``validate`` and ``test``.

    Returns the fitted imputer followed by the three (modified) datasets.
    """
    column = "regionidcity"
    imputer = KNNImputer(n_neighbors=5).fit(train[[column]])
    for split in (train, validate, test):
        split[column] = imputer.transform(split[[column]])
    return imputer, train, validate, test
Ejemplo n.º 23
0
def sample_knn_prediction(matrix, test_data):
    """Returns knn prediction for test_data from bootstrap-fitted imputers.

    Two KNN imputers are fitted on rows sampled with replacement: one on
    the rows of ``matrix`` (11 neighbours) and one on the rows of its
    transpose (21 neighbours). Each fills the complete matrix, the two
    results are averaged and passed to ``sparse_matrix_predictions`` with a
    0.5 threshold.

    Parameters
    ----------
    matrix : np.ndarray
        2-D array with NaN for missing entries.
    test_data :
        Forwarded unchanged to ``sparse_matrix_predictions``.
    """
    # Sample sizes follow the matrix instead of the former hard-coded
    # 542 / 1774, so inputs of any shape are handled.
    n_rows, n_cols = matrix.shape
    matrix_c = np.copy(matrix.T)

    row_sample = np.random.randint(n_rows, size=n_rows)
    row_imputer = KNNImputer(n_neighbors=11)
    row_imputer.fit(matrix[row_sample, :])
    mat_student = row_imputer.transform(matrix)

    col_sample = np.random.randint(n_cols, size=n_cols)
    col_imputer = KNNImputer(n_neighbors=21)
    col_imputer.fit(matrix_c[col_sample, :])
    mat_item = col_imputer.transform(matrix_c).T

    mat_avg = (mat_item + mat_student) * 0.5
    return sparse_matrix_predictions(test_data, mat_avg, threshold=0.5)
Ejemplo n.º 24
0
def MVKNN(f, g, x, col):
    """Streamlit step: fill missing values of ``x`` with a 2-NN imputer.

    Parameters
    ----------
    f, g :
        Widget keys for the radio button and the multiselect.
    x : pd.DataFrame
        Data to impute.
    col :
        Options offered in the column multiselect.

    Returns
    -------
    ``x`` with the selected columns imputed in place, or, when the table is
    not sliced, the imputer's output for the whole table.
    """
    st.text("KNN Imputer")

    from sklearn.impute import KNNImputer
    imp = KNNImputer(n_neighbors=2)
    ch6 = st.radio("Do you want to slice the table:", ("Yes", "No"), key=f)
    if ch6 == "Yes":
        try:
            col_sel = st.multiselect("Please select the columns", col, key=g)
            imputer = imp.fit(x[col_sel])
            x[col_sel] = imputer.transform(x[col_sel])
        except Exception:
            # A bare ``except:`` would also trap BaseException subclasses
            # such as KeyboardInterrupt and SystemExit.
            st.info("Select atlest one column")

    else:
        imputer = imp.fit(x)
        x = imputer.transform(x)
    return x
Ejemplo n.º 25
0
def impute_values(df, imp_strategy, neighbors, numeric_vars):
    """Impute the numeric columns of ``df`` and re-attach the other columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    imp_strategy : str
        ``"knn"`` for ``KNNImputer``; any other value is passed to
        ``SimpleImputer`` as its ``strategy``.
    neighbors : int
        Neighbour count for the KNN strategy.
    numeric_vars : list
        Names of the columns to impute.

    Returns
    -------
    pd.DataFrame
        Non-numeric columns (in their original order, index reset) followed
        by the imputed numeric columns.
    """
    # NOTE(review): the result of convert_to_numeric was always discarded;
    # presumably it coerces df in place - confirm.
    convert_to_numeric(df, numeric_vars)
    X = df[numeric_vars].to_numpy()

    # Preserve df's column order; list(set(...)) yielded an arbitrary order
    # that could change from run to run.
    numeric_names = set(numeric_vars)
    other_vars = [name for name in df.columns if name not in numeric_names]
    X_strings = df[other_vars].reset_index(drop=True)

    if imp_strategy == "knn":
        imputer = KNNImputer(n_neighbors=neighbors)
        imputed = imputer.fit_transform(X)  # This is very costly
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy=imp_strategy)
        imputer.fit(X)
        imputed = imputer.transform(X)

    X_imputed = pd.DataFrame.from_records(imputed, columns=numeric_vars)
    return X_strings.join(X_imputed)
    def transform(self, X):
        """Impute and encode ``X``.

        Object columns are filled with their most frequent value and
        one-hot encoded; float columns are filled with a uniform 5-NN
        imputer. Both imputers are fitted on ``X`` itself.

        Returns
        -------
        pd.DataFrame
            The imputed float columns joined with the dummy columns.
        """
        data = X.copy()

        df_str = data.select_dtypes(include=['object'])
        df_num = data.select_dtypes(include=['float'])

        # ``verbose`` is no longer passed: it was 0 (the default) and the
        # parameter has been removed from newer scikit-learn releases.
        impute_str = SimpleImputer(
            missing_values=np.nan,  # missing values are ``np.nan`` (pandas standard)
            strategy='most_frequent',  # replace with the most frequent value of each column
            copy=True
        )

        impute_str.fit(X=df_str)

        df_str = pd.DataFrame.from_records(
            data=impute_str.transform(X=df_str),
            columns=df_str.columns  # the original column labels must be kept
        )
        print(df_str.columns.values.tolist())

        # One-hot encode every string column; the encoded frame replaces the
        # original string columns.
        df_str = pd.get_dummies(df_str, columns=df_str.columns.values.tolist())

        impute_nums = KNNImputer(missing_values=np.nan,
                                 n_neighbors=5,
                                 weights='uniform',
                                 metric='nan_euclidean',
                                 copy=True,
                                 add_indicator=False)
        impute_nums.fit(X=df_num)

        df_num = pd.DataFrame.from_records(
            data=impute_nums.transform(X=df_num),
            columns=df_num.columns  # the original column labels must be kept
        )

        df = df_num.join(df_str)

        return df
    def imputer(self, method='knn'):
        '''
        Impute the missing data of ``self.dataframe``.

        The frame is replaced by its imputed version, extended with the
        missing-value indicator columns produced by the imputer.

        Parameters
        ----------
        method : string
            Imputation method; only 'knn' (uniform 5-NN with indicator
            columns) is currently supported.

        Returns
        -------
        dataframe : DataFrame
            Return updated dataframe (also stored on ``self.dataframe``).
        imp : object
            imputer created with the data.

        Raises
        ------
        ValueError
            If ``method`` is not supported.
        '''
        print("Impute missing data using: " + method)
        # TODO: 'simple' (SimpleImputer) and 'iterative' (IterativeImputer)
        # strategies, chosen per feature type, are not implemented yet.
        if method != 'knn':
            # Previously this fell through to an unbound ``imp``.
            raise ValueError(
                "Unsupported imputation method {!r}; expected 'knn'".format(
                    method))

        imp = KNNImputer(n_neighbors=5,
                         weights="uniform",
                         add_indicator=True)

        imp.fit(self.dataframe)
        transformed_data = imp.transform(self.dataframe)

        # Columns beyond the original width are the indicator columns.
        new_length_added = len(transformed_data[0]) - self.dataframe.shape[1]
        new_column_name = DataMethod.get_new_column_name(
            length_new_matrix=new_length_added, prefix="kNN_NaN_indicator")

        self.dataframe = pd.DataFrame(transformed_data,
                                      columns=list(self.dataframe.columns) +
                                      new_column_name)

        return self.dataframe, imp
def zillow_impute_city(df):
    """Impute ``regionidcity`` from a haversine feature with a 1-NN imputer.

    A ``haversine_distance`` column is added to ``df`` (computed by
    ``haversine`` from latitude / longitude divided by 1,000,000). The
    imputed values are merged back on ``parcelid`` as ``regionid_city``;
    the original ``regionidcity`` column is dropped from the result.
    """
    scaled_lat = df.latitude / 1000000
    scaled_lon = df.longitude / 1000000
    df['haversine_distance'] = [
        haversine(x, y) for x, y in zip(scaled_lat, scaled_lon)
    ]

    features = df[['haversine_distance', 'regionidcity']]
    knn_imputer = KNNImputer(n_neighbors=1).fit(features)
    imputed = pd.DataFrame(knn_imputer.transform(features),
                           columns=['haversine', 'regionid_city'],
                           index=df.parcelid)

    merged = df.merge(imputed, left_on='parcelid', right_on='parcelid')
    return merged.drop(columns=['haversine', 'regionidcity'])
Ejemplo n.º 29
0
def handleNull(df):
    """Streamlit view for inspecting nulls and outliers of ``df``.

    Shows the categorical and numerical parts of ``df`` side by side. With
    "Handle null values" selected, the KNN-imputed numerical data is
    displayed. With "Handle outliers" selected, a box plot is drawn and the
    "Remove Outliers" button blanks (sets to NaN) every numerical value
    whose absolute deviation from its column median exceeds 4.5 times the
    column's median absolute deviation.
    """
    # ``st.beta_columns`` is the older name of ``st.columns``; use whichever
    # this Streamlit release provides.
    make_columns = getattr(st, "columns", None) or st.beta_columns
    col1, col2 = make_columns(2)
    cat_data = df.select_dtypes(include=['object']).copy()
    col1.header("Categorical data: ")
    col1.write(cat_data.head())
    col1.write('Null values: ')
    col1.write(cat_data.isna().sum())
    num_data = df.select_dtypes(include=['int64', 'float64']).copy()
    col2.header("Numerical data: ")
    col2.write(num_data.head())
    action = st.sidebar.selectbox(
        label="Select the action",
        options=['Handle null values', 'Handle outliers'])

    if action == 'Handle null values':
        col2.write('Null values: ')
        col2.write(num_data.isna().sum())
        imputer = KNNImputer(n_neighbors=4)
        imputer.fit(num_data)
        Xtrans = imputer.transform(num_data)
        st.write("Imputed values: ")
        st.dataframe(Xtrans)
    elif action == 'Handle outliers':
        st.sidebar.write("Outlier plot settings: ")
        x_val = st.sidebar.selectbox(label="Select x-axis value", options=non_numeric_columns)
        y_val = st.sidebar.selectbox(label="Select y-axis value", options=numeric_columns)
        colour = st.sidebar.selectbox(label="Select color value", options=non_numeric_columns)
        plot = px.box(df, x=x_val, y=y_val, color=colour)
        st.plotly_chart(plot)
        if st.button('Remove Outliers'):
            st.write(df.shape)
            for column in num_data:
                # Work on the current column only and keep the ORIGINAL
                # values for inliers (the old loop used the whole frame and
                # stored absolute deviations instead of the data).
                values = num_data[column]
                deviation = (values - values.median()).abs()
                cutoff = deviation.median() * 4.5
                num_data[column] = values.where(~(deviation > cutoff))

            st.write("Modified dataset")
            st.dataframe(num_data)
            st.write(num_data.shape)
Ejemplo n.º 30
0
def zillow_impute(df):
    """Fill or drop the sparse Zillow columns and impute ``regionidcity``.

    Steps, in order: fixed-value fills for the two heating columns, median
    fill for ``buildingqualitytypeid`` (all three written to the passed
    frame), removal of three columns, a 1-NN imputation of ``regionidcity``
    from a ``haversine``-derived feature merged back on ``parcelid`` as
    ``regionid_city``, and finally removal of every row that still
    contains a missing value.
    """
    df.heatingorsystemdesc = df.heatingorsystemdesc.fillna('None')
    df.heatingorsystemtypeid = df.heatingorsystemtypeid.fillna(13)
    quality_median = df.buildingqualitytypeid.median()
    df.buildingqualitytypeid = df.buildingqualitytypeid.fillna(quality_median)

    df = df.drop(
        columns=['calculatedbathnbr', 'propertyzoningdesc', 'unitcnt'])

    scaled_lat = df.latitude / 1000000
    scaled_lon = df.longitude / 1000000
    df['haversine_distance'] = [
        haversine(x, y) for x, y in zip(scaled_lat, scaled_lon)
    ]

    features = df[['haversine_distance', 'regionidcity']]
    knn_imputer = KNNImputer(n_neighbors=1).fit(features)
    imputed = pd.DataFrame(knn_imputer.transform(features),
                           columns=['haversine', 'regionid_city'],
                           index=df.parcelid)

    merged = df.merge(imputed, left_on='parcelid', right_on='parcelid')
    return merged.drop(columns=['haversine', 'regionidcity']).dropna()