Code example #1
File: knn.py Project: georgSquared/ML-Assignments
def fill_data(train_data, test_data):
    imputer = KNNImputer(n_neighbors=3, weights='distance')
    imputer.fit(train_data)
    train = imputer.transform(train_data)
    test = imputer.transform(test_data)

    return train, test
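The snippet above relies on imports from elsewhere in the project; a minimal self-contained sketch of the same fit-on-train, transform-both pattern (the toy arrays are illustrative, not the project's data) could look like this:

import numpy as np
from sklearn.impute import KNNImputer

def fill_data(train_data, test_data):
    # fit the imputer on the training split only, then apply it to both splits
    imputer = KNNImputer(n_neighbors=3, weights='distance')
    imputer.fit(train_data)
    return imputer.transform(train_data), imputer.transform(test_data)

train = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, 6.0]])
test = np.array([[np.nan, 5.0], [4.0, np.nan]])
train_filled, test_filled = fill_data(train, test)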
Code example #2
File: Imputation.py Project: LukeAndrewSmith/AML
def knn2(X, x_supp, neighbors=1):
    if x_supp is not None:
        x_supp.columns = X.columns
    imp = KNNImputer(missing_values=np.nan,
                     weights='distance',
                     n_neighbors=neighbors)
    # pd.concat silently drops a None entry, so this fits on X alone when x_supp is None
    imp.fit(pd.concat([X, x_supp], ignore_index=True))
    X_imp = pd.DataFrame(imp.transform(X), columns=X.columns)
    if x_supp is None:
        return X_imp, None
    return X_imp, pd.DataFrame(imp.transform(x_supp), columns=x_supp.columns)
Code example #3
def fillMissingValues(trainx_df, testx_df):
    imputer = KNNImputer(n_neighbors=2)
    imputer.fit(trainx_df)
    trainx_df_filled = imputer.transform(trainx_df)
    trainx_df_filled = pd.DataFrame(trainx_df_filled, columns=trainx_df.columns)
    testx_df_filled = imputer.transform(testx_df)
    testx_df_filled = pd.DataFrame(testx_df_filled, columns=testx_df.columns)
    testx_df_filled.reset_index(drop=True, inplace=True)
    return trainx_df_filled, testx_df_filled
Code example #4
def impute_regionidcity(train, validate, test):
    """
    This function does the following:
    1. Takes in the train, validate, and test datasets
    2. Creates the KNNImputer object
    3. Fits the object to the regionidcity feature in the train dataset
    4. Transforms the regionidcity feature in the train, validate, and test datasets
    """
    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(train[["regionidcity"]])
    train["regionidcity"] = imputer.transform(train[["regionidcity"]])
    validate["regionidcity"] = imputer.transform(validate[["regionidcity"]])
    test["regionidcity"] = imputer.transform(test[["regionidcity"]])
    return imputer, train, validate, test
Code example #5
File: ensemble.py Project: anind99/ML-Models
def sample_knn_prediction(matrix, test_data):
    """Returns KNN predictions for test_data, averaging student-based and
    item-based imputations fit on bootstrap resamples of the matrix."""
    matrix_c = np.copy(matrix.T)
    # student-based imputation: fit on a bootstrap resample of the 542 student rows
    nbsr = KNNImputer(n_neighbors=11)
    idx = np.random.randint(542, size=542)
    mat1 = matrix[idx, :]
    nbsr.fit(mat1)
    mat_student = nbsr.transform(matrix)
    # item-based imputation: fit on a bootstrap resample of the 1774 item rows of the transposed matrix
    idx = np.random.randint(1774, size=1774)
    nbsr = KNNImputer(n_neighbors=21)
    mat2 = matrix_c[idx, :]
    nbsr.fit(mat2)
    mat_item = nbsr.transform(matrix_c).T
    mat_avg = (mat_item + mat_student) * 0.5
    return sparse_matrix_predictions(test_data, mat_avg, threshold=0.5)
Code example #6
class KNNReplacerIQR(KNNImputer):
    """Pipeline-compliant KNNReplacer, based on IQR."""
    def __init__(self, n_neighbors=5):
        super().__init__(n_neighbors=n_neighbors)
        self.lower_bound = None
        self.upper_bound = None
        self.imputer = KNNImputer(n_neighbors=n_neighbors)

    def fit(self, x, y=None):
        """Computes IQR bound and fits the imputer on the data."""
        x = pd.DataFrame(x)
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound = q1 - (1.5 * iqr)
        self.upper_bound = q3 + (1.5 * iqr)
        self.imputer.fit(
            x.where(~((x < self.lower_bound) | (x > self.upper_bound)),
                    np.nan))
        return self

    def transform(self, x, y=None):
        """Detects outliers and replaces them with the imputer."""
        x = pd.DataFrame(x)
        x.where(~((x < self.lower_bound) | (x > self.upper_bound)),
                np.nan,
                inplace=True)
        return self.imputer.transform(x)
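Because the class above exposes fit/transform, it can be dropped into a scikit-learn Pipeline. A brief usage sketch, assuming the KNNReplacerIQR definition above is in scope and using made-up toy data:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X = pd.DataFrame({'a': [1.0, 2.0, 3.0, 100.0],   # 100.0 falls outside the IQR fences
                  'b': [10.0, 11.0, 12.0, 13.0]})
pipe = Pipeline([
    ('outliers', KNNReplacerIQR(n_neighbors=2)),  # mask IQR outliers, impute them with KNN
    ('scale', StandardScaler()),
])
X_clean = pipe.fit_transform(X)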
Code example #7
File: Data.py Project: RecklessCrow/PulsarGore
class Data:
    def __init__(self):
        self.label_encoder = OneHotEncoder(sparse=False)
        self.imputer = KNNImputer()
        self.scaler = RobustScaler()

        train = pd.read_csv('data/train_data.csv')
        self.X_train = train.iloc[:, :-1]
        self.X_train = self.imputer.fit_transform(self.X_train)
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.Y_train = np.array(train['target_class']).reshape(-1, 1)
        self.Y_train = self.label_encoder.fit_transform(self.Y_train)

        test = pd.read_csv('data/test_data.csv')
        self.X_test = test.iloc[:, :-1]
        self.X_test = self.imputer.transform(self.X_test)
        self.X_test = self.scaler.transform(self.X_test)
        self.Y_test = np.array(test['target_class']).reshape(-1, 1)
        self.Y_test = self.label_encoder.transform(self.Y_test)

    def get_training_data(self):
        return self.X_train, self.Y_train

    def get_test_data(self):
        return self.X_test, self.Y_test
Code example #8
 def imputeData(self, X, imputerModel=None):
     if imputerModel is None:
         imputerModel = KNNImputer()
         imputerModel.fit(X)
     imputedData = imputerModel.transform(X)
     X_imp = pd.DataFrame(imputedData, columns=X.columns)
     return (X_imp, imputerModel)
Code example #9
File: experiment_def.py Project: cschreck/dt-eig
def experiment_setting_2(X, y, runs=5, missingness=0.1):
    results = []
    for i in range(runs):
        np.random.seed(i)
        X_missing = make_missing_random(X, missingness)

        ss = StratifiedKFold(shuffle=True, random_state=i)

        for train_index, test_index in ss.split(X, y):
            X_train = X_missing[train_index]
            y_train = y[train_index]
            imputer = KNNImputer()
            imputer.fit(X_train)
            X_test = imputer.transform(X_missing[test_index])
            y_test = y[test_index]

            knnimp = KNNImputer()
            X_knn_full_imputed = knnimp.fit_transform(X_train)
            X_train_imputed = np.ones(X_train.shape) * np.nan
            for idx in np.argwhere(np.isnan(X_train)):
                X_train_imputed[idx[0], idx[1]] = X_knn_full_imputed[idx[0],
                                                                     idx[1]]

            hdt = EIGDecisionTree(max_depth=20)
            hdt.fit(X_train, X_train_imputed, y_train)
            results.append(accuracy_score(hdt.predict(X_test), y_test))
            #print(get_depth(hdt.tree), get_size(hdt.tree))

    return results
Code example #10
File: test_knn.py Project: Aathi410/Pro123
def test_knn_imputer_removes_all_na_features(na):
    X = np.array([
        [1, 1, na, 1, 1, 1.0],
        [2, 3, na, 2, 2, 2],
        [3, 4, na, 3, 3, na],
        [6, 4, na, na, 6, 6],
    ])
    knn = KNNImputer(missing_values=na, n_neighbors=2).fit(X)

    X_transform = knn.transform(X)
    assert not np.isnan(X_transform).any()
    assert X_transform.shape == (4, 5)

    X_test = np.arange(0, 12).reshape(2, 6)
    X_transform = knn.transform(X_test)
    assert_allclose(X_test[:, [0, 1, 3, 4, 5]], X_transform)
Code example #11
File: machine.py Project: satyender765/streamlit
def mvt_knn(df):
    try:

        st.info("The Percenatge of Value Missing in Given Data is : {:.2f}%".
                format(((df.isna().sum().sum()) / (df.count().sum()) * 100)))
        num_col = list(df.select_dtypes(include='float64').columns)
        knn = KNNImputer(n_neighbors=1, add_indicator=True)
        knn.fit(df[num_col])
        knn_impute = pd.DataFrame(knn.transform(df[num_col]))
        df[num_col] = knn_impute.iloc[:, :df[num_col].shape[1]]
        clean_df = df
        clean_df = (df.fillna(df.mode().iloc[0]))
        st.dataframe(clean_df)
        st.write("\nEmpty rows  after imputing the data: \n",
                 clean_df.isnull().sum())
        st.info("Numerical data : {}".format(list(dict(df.median()).keys())))
        st.info("Categorical data : {}".format(
            list(df.select_dtypes(include='object').mode())))
        st.write('Shape of dataframe (Rows, Columns): ', df.shape)
        st.write('Data description : ', df.describe())
        st.line_chart(clean_df)
        st.info(
            "Only numerical data is treated with the K-NN method; categorical data is treated with the mode"
        )
        return clean_df

    except Exception as e:
        st.write("Oops!", e.__class__, "occurred.")
        return df
Code example #12
def handleNull():
    st.write(df.head())
    col1, col2 = st.beta_columns(2)
    cat_data = df.select_dtypes(include=['object']).copy()
    col1.header("Categorical data: ")
    col1.write(cat_data.head())
    col1.write('Null values: ') 
    col1.write(cat_data.isna().sum())
    num_data = df.select_dtypes(include=['int64','float64']).copy()
    col2.header("Numerical data: ")
    col2.write(num_data.head())
    action = st.sidebar.selectbox(label="Select the action",
                                  options=['Handle null values', 'Handle outliers'])

    if action == 'Handle null values':
        col2.write('Null values: ')
        col2.write(num_data.isna().sum())
        imputer = KNNImputer(n_neighbors=4)
        imputer.fit(num_data)
        Xtrans = imputer.transform(num_data)
        st.write("Imputed values: ")
        st.dataframe(Xtrans)
    elif action == 'Handle outliers':
        outliers = []
        for (columnName, columnData) in num_data.iteritems():
            z = np.abs(stats.zscore(columnData.values))
            outliers.append(np.where(z > 3))
        st.write(outliers)
Code example #13
File: imputation.py Project: landiisotta/NDAR_data
def impute(train_df, test_df):
    """
    Function that perform missing data imputation
    on both train and test for a unique interview period.

    Parameters
    ----------
    train_df: dataframe feature names and interview-based names
    test_df: dataframe feature names and interview-based names
    Returns
    ------
    imputed dataframe train
    imputed dataframe test
    """
    knnimpute = KNNImputer(n_neighbors=ut.neighbors)
    col_n = [nc for nc in train_df.columns if not re.search('interview', nc)]
    col_out = [nc for nc in train_df.columns if re.search('interview', nc)]
    tmp_tr = pd.DataFrame(knnimpute.fit_transform(train_df[col_n]),
                          columns=col_n)
    tmp_ts = pd.DataFrame(knnimpute.transform(test_df[col_n]), columns=col_n)
    tmp_tr.index = train_df.index
    tmp_ts.index = test_df.index
    for c in col_out:
        tmp_tr[c] = train_df[c]
        tmp_ts[c] = test_df[c]
    return tmp_tr, tmp_ts
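The manual split-and-reattach of the interview columns above could also be expressed with a ColumnTransformer that passes the excluded columns through untouched. A rough sketch under that assumption, reusing col_n, train_df and test_df from the function above (note the passthrough columns end up after the imputed ones in the output array):

from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer

ct = ColumnTransformer(
    [('impute', KNNImputer(n_neighbors=ut.neighbors), col_n)],  # numeric, non-interview columns
    remainder='passthrough')                                    # interview columns pass through unchanged
train_imputed = ct.fit_transform(train_df)
test_imputed = ct.transform(test_df)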
Code example #14
class FeatureExtractor(BaseEstimator):
    def __init__(self, imputer_neighbors: int = 5):
        self.imputer = KNNImputer(n_neighbors=imputer_neighbors)
        self.cat_cols = None
        self.num_cols = None

    def fit(self, X, y=None):
        # convert categorical columns to categorical type
        self.cat_cols = [
            column_name for column_name in X.columns
            if str(X[column_name].dtype) == 'object'
        ]
        self.num_cols = [
            column_name for column_name in X.columns
            if column_name not in self.cat_cols
        ]
        X[self.cat_cols] = X[self.cat_cols].astype('category')

        # one hot encode to be able to use KNNImputation
        X_dummy = pd.get_dummies(X, dummy_na=True)
        for col in self.cat_cols:
            X_dummy.loc[X_dummy[col + "_nan"] == 1,
                        X_dummy.columns.str.startswith(col)] = np.nan
            del X_dummy[col + "_nan"]

        # fit imputer
        self.imputer.fit(X_dummy)
        return self

    def transform(self, X):
        # one hot encode to be able to use KNNImputation
        X_dummy = pd.get_dummies(X, dummy_na=True)
        for col in self.cat_cols:
            X_dummy.loc[X_dummy[col + "_nan"] == 1,
                        X_dummy.columns.str.startswith(col)] = np.nan
            del X_dummy[col + "_nan"]

        X_dummy = pd.DataFrame(self.imputer.transform(X_dummy.values),
                               columns=X_dummy.columns)

        # revert dummification
        for col in self.cat_cols:
            X_dummy[col] = X_dummy.loc[:,
                                       X_dummy.columns.str.
                                       startswith(col)].idxmax(
                                           axis=1).str.replace(col + "_", '')
            X_dummy = X_dummy.loc[:,
                                  ~X_dummy.columns.str.startswith(col + "_")]

        # reset categorical column types
        X_dummy[self.cat_cols] = X_dummy[self.cat_cols].astype('category')

        # simplify pdays & previous
        X_dummy.pdays = np.where(X_dummy.pdays != 999., 1, 0)
        X_dummy.previous = np.where(X_dummy.previous >= 1., 1, 0)
        X_dummy.drop(columns=['previous','loan'], inplace=True)

        return X_dummy
Code example #15
def perform_imputation(X, imputer=None):
    X_feat_list = X.columns
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
        imputer.fit(X)
    np_array = imputer.transform(X)
    X = pd.DataFrame(np_array, columns=X_feat_list)
    return X, imputer
Code example #16
    def missing_data_imputer(X: pd.DataFrame) -> pd.DataFrame:
        """ default n=5 for KNN Imputer """
        imputer = KNNImputer()
        imputer.fit(X)
        X_transform = imputer.transform(X)
        df_temp = pd.DataFrame(X_transform)
        df_temp.columns = X.columns

        return df_temp
Code example #17
File: test_knn.py Project: Aathi410/Pro123
def test_knn_imputer_drops_all_nan_features(na):
    X1 = np.array([[na, 1], [na, 2]])
    knn = KNNImputer(missing_values=na, n_neighbors=1)
    X1_expected = np.array([[1], [2]])
    assert_allclose(knn.fit_transform(X1), X1_expected)

    X2 = np.array([[1, 2], [3, na]])
    X2_expected = np.array([[2], [1.5]])
    assert_allclose(knn.transform(X2), X2_expected)
Code example #18
def impute_knn():
    imp = KNNImputer(n_neighbors=2, weights="uniform")
    X_train = [[1, 2], [np.nan, 3], [7, 6]]
    imp.fit(X_train)
    X_test = [[np.nan, 2], [6, np.nan], [7, 6]]

    print("X_train")
    print(X_train)
    print("imputed X_test")
    print(imp.transform(X_test))
Code example #19
def KNNimpute_DF(df):
    # fill in missing values with a KNN imputer
    imputer_knn = KNNImputer(n_neighbors=10)
    imputer_knn.fit(df)

    x = imputer_knn.transform(df)

    # cast the numpy array back to a dataframe, keeping the original columns and index
    df = pd.DataFrame(x, columns=df.columns, index=df.index)
    return df
Code example #20
def impute_missing(df, type='knn'):
    if type == 'knn':
        imputer = KNNImputer()
    elif type == 'iterative':
        imputer = IterativeImputer(random_state=0)
    else:
        raise ValueError(f"Unknown imputation type: {type!r}")
    imputer.fit(df)
    imputed_df = imputer.transform(df)
    df = pd.DataFrame(imputed_df, index=df.index, columns=df.columns)
    return df
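IterativeImputer is gated behind an experimental flag in scikit-learn, so a script using the function above would need imports along these lines (KNNImputer itself needs only sklearn.impute):

import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, enables IterativeImputer
from sklearn.impute import IterativeImputer, KNNImputer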
Code example #21
File: test_knn.py Project: Aathi410/Pro123
def test_knn_imputer_not_enough_valid_distances(na, weights):
    # Samples with the needed feature have a nan distance
    X1 = np.array([[na, 11], [na, 1], [3, na]])
    X1_imputed = np.array([[3, 11], [3, 1], [3, 6]])

    knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
    assert_allclose(knn.fit_transform(X1), X1_imputed)

    X2 = np.array([[4, na]])
    X2_imputed = np.array([[4, 6]])
    assert_allclose(knn.transform(X2), X2_imputed)
Code example #22
def predict(givencity):
    givencity = city_day[(city_day.AQI.notnull())
                         & (city_day.City == givencity)]
    #tell_me_null(givencity)

    corr = givencity.corr().AQI.sort_values(ascending=False)
    related = list(corr[corr > 0.6].index)
    #print(related)

    inter = givencity.loc[:, related].interpolate(method='linear')
    givencity.loc[:, related] = inter
    knn_imputer = KNNImputer(n_neighbors=3)

    imputing_cols = [
        'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
        'Benzene', 'Toluene', 'Xylene', 'AQI', 'B_X_O3_NH3',
        'ParticulateMatters'
    ]
    # we eliminated city, date, Year_Month and AQI_Bucket because they were
    # either unique or had numerical substitutes in other fields (AQI_Bucket)

    knn_imputer.fit(givencity[imputing_cols])

    imputed = knn_imputer.transform(givencity[imputing_cols])

    #givencity.loc[:, imputing_cols] = imputed

    #tell_me_null(givencity)

    givencity_aqi = givencity[['Date', 'AQI']]
    givencity_aqi.reset_index(inplace=True, drop=True)

    train_df = givencity_aqi
    train_df.rename(mapper={'Date': 'ds', 'AQI': 'y'}, axis=1, inplace=True)

    m = Prophet(holidays_prior_scale=0,
                seasonality_prior_scale=20,
                n_changepoints=50)

    m.fit(train_df)
    future = m.make_future_dataframe(periods=365)
    #future.tail()
    forecast = m.predict(future)
    forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

    #df_cv = cross_validation(m, initial='1100 days', period='121 days', horizon = '365 days')
    #df_p = performance_metrics(df_cv)
    #print('Cross Validation accuracy:', (1 - df_p['mape'].mean())*100)

    predictions_df = pd.DataFrame(forecast, columns=['ds', 'yhat'])

    return predictions_df, m
Code example #23
def imputation(train_data, test_data):
    '''In order to substitute the NaN values rather than delete them, a kNN imputer is used to
    impute the missing data. The imputer is fitted on the train set and subsequently applied to
    the test set, which ensures the imputation model is trained only on the train set and never
    on the test set. The inputs are the train set and the test set; the outputs are the same
    sets with imputed values.
    '''
    # impute the still existing NaN's
    imputer = KNNImputer(n_neighbors=3, weights="uniform")
    imputed_train = imputer.fit_transform(train_data)
    imputed_test = imputer.transform(test_data)

    return imputed_train, imputed_test
Code example #24
def remove_missing(df, missing_type=np.nan, nan_threshold=40, impute=False):
    missing_values = get_percentages(df, missing_type)
    df_features = missing_values[
        missing_values['percent_missing'] < nan_threshold].index.tolist()

    df = df[df_features]

    if impute:
        imputer = KNNImputer()
        imputer.fit(df)
        imputed_df = imputer.transform(df)
        df = pd.DataFrame(imputed_df, index=df.index, columns=df.columns)

    return df
Code example #25
 def test_onnxt_knnimputer(self):
     x_train = numpy.array([[1, 2, numpy.nan, 12], [3, numpy.nan, 3, 13],
                            [1, 4, numpy.nan, 1], [numpy.nan, 4, 3, 12]],
                           dtype=numpy.float32)
     x_test = numpy.array(
         [[1.3, 2.4, numpy.nan, 1], [-1.3, numpy.nan, 3.1, numpy.nan]],
         dtype=numpy.float32)
     kn = KNNImputer(n_neighbors=3, metric='nan_euclidean')
     kn.fit(x_train)
     model_def = to_onnx(kn, x_train)
     oinf = OnnxInference(model_def, runtime='python')
     got = oinf.run({'X': x_test})
     self.assertEqual(list(sorted(got)), ['variable'])
     self.assertEqualArray(kn.transform(x_test), got['variable'], decimal=6)
Code example #26
class KNNKeepDf(BaseEstimator, TransformerMixin):
    """KNN imputer, but returns DF and retains column names"""
    def __init__(self):
        self.colnames_ = []
        self.knn = KNNImputer()

    def fit(self, X, y=None):
        self.colnames_ = X.columns
        self.knn.fit(X)
        return self

    def transform(self, X, y=None, **fit_params):
        output = pd.DataFrame(self.knn.transform(X), columns=self.colnames_)
        return output
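On scikit-learn 1.2 or later, a similar effect (keeping the column names without a wrapper class) can usually be achieved with the set_output API; a brief sketch, assuming X is a pandas DataFrame:

from sklearn.impute import KNNImputer

knn = KNNImputer().set_output(transform='pandas')  # transform() now returns a DataFrame
X_imputed = knn.fit_transform(X)                   # column names and index are preserved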
Code example #27
def preprocess_with_knn_imputer_minmax_scaler(
    train_data: np.ndarray,
    test_data: np.ndarray,
    n_neighbors: int = 5,
) -> Tuple[np.ndarray, np.ndarray]:
    imputer = KNNImputer(n_neighbors=n_neighbors)
    train_data_without_nans = imputer.fit_transform(train_data)
    test_data_without_nans = imputer.transform(test_data)

    min_max_scaler = MinMaxScaler()
    train_data_without_nans_scaled = min_max_scaler.fit_transform(
        train_data_without_nans)
    test_data_without_nans_scaled = min_max_scaler.transform(
        test_data_without_nans)

    return train_data_without_nans_scaled, test_data_without_nans_scaled
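The impute-then-scale sequence above is a natural fit for a scikit-learn Pipeline, which keeps both fitted steps together; a small equivalent sketch, with train_data and test_data as in the function above:

from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipe = Pipeline([
    ('impute', KNNImputer(n_neighbors=5)),
    ('scale', MinMaxScaler()),
])
train_scaled = pipe.fit_transform(train_data)  # fit both steps on the training split only
test_scaled = pipe.transform(test_data)        # reuse the fitted imputer and scaler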
Code example #28
def impute_values(df, imp_strategy, neighbors, numeric_vars):

    X = convert_to_numeric(df, numeric_vars)
    # NOTE: the next line immediately overwrites X by re-reading df, so
    # convert_to_numeric is presumably expected to modify df in place
    X = df[numeric_vars].to_numpy()
    other_vars = list(set(df.columns) - set(numeric_vars))
    X_strings = df[other_vars].reset_index(drop=True)
    if imp_strategy == "knn":
        imputer = KNNImputer(n_neighbors=neighbors)  # weights=weight_type
        imputed = imputer.fit_transform(X)  # this can be very costly
# alternative: fast_knn from impyute
# https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
#         imputed = fast_knn(X, k=neighbors)
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy=imp_strategy)
        imputer.fit(X)
        imputed = imputer.transform(X)
    X_imputed = pd.DataFrame.from_records(imputed, columns=numeric_vars)
    rv = X_strings.join(X_imputed)
    return rv
Code example #29
def naKNN(train_x, test_x):
    """
    Replaces the missing values in the training set and the test set with KNNImputer().
    :param train_x: training set
    :param test_x: test set
    :return: None
    """
    getNaCount(train_x)  # count the number of NaNs in the training set
    imputer = KNNImputer(n_neighbors=3)

    imputed_train = imputer.fit_transform(train_x.data)
    train_x.data = pd.DataFrame(imputed_train, columns=train_x.data.columns)
    save_object(
        imputer, 'imputer.pkl'
    )  # save the imputer to 'imputer.pkl' (needed later for the final test)

    if test_x is not None:
        imputed_test = imputer.transform(test_x.data)
        test_x.data = pd.DataFrame(imputed_test, columns=test_x.data.columns)
Code example #30
def cv_preprocessing(X_train, X_test=None, random_state=None):
    variables_path = r"variables.json"
    with open(variables_path) as f:
        variables = json.load(f)
        t1_features, cogni = variables['t1_features'], variables['cogni']
        pcl = variables['questionnaires']['PCL'][:17]

    mice = KNNImputer()  # note: despite the variable name, this is KNN imputation, not MICE
    columns = X_train.columns
    X_train = pd.DataFrame(mice.fit_transform(X_train), columns=columns)

    #X_train = stds(X_train)
    #X_train = stats(X_train)
    #X_train = removal_correlated(X_train)
    # ss = StandardScaler()
    # X_train = ss.fit_transform(X_train)
    # X_train = pd.DataFrame(ss.fit_transform(X_train), columns=columns)
    if X_test is not None:
        X_test = pd.DataFrame(mice.transform(X_test), columns=columns)
        #X_test = stds(X_test)
        #X_test = stats(X_train, X_test)
        #_, X_test = removal_correlated(X_train, X_test)
        # X_test = ss.transform(X_test)
        # X_test = pd.DataFrame(ss.transform(X_test), columns=columns)

        X_train, X_test = outliers(
            X_train,
            X_test,
            features=[f"T1q5.{i}" for i in range(1, 10)],
            name='phq9')
        #X_train, X_test = outliers(X_train, X_test, features=pcl, name='PCL')
        X_train, X_test = outliers(X_train,
                                   X_test,
                                   features=cogni,
                                   name='cogni')
        X_train, X_test = outliers(X_train,
                                   X_test,
                                   features=t1_features,
                                   name='t1')

        return X_train, X_test
    else:
        return X_train