def fill_data(train_data, test_data):
    """Impute missing values with a distance-weighted 3-NN imputer.

    The imputer is fit on the training data only and then applied to both
    sets, so no information leaks from test into the fit.
    """
    knn = KNNImputer(n_neighbors=3, weights='distance')
    knn.fit(train_data)
    return knn.transform(train_data), knn.transform(test_data)
def knn2(X, x_supp, neighbors=1):
    """KNN-impute X (and optionally x_supp) with an imputer fit on both.

    Parameters
    ----------
    X : pd.DataFrame
        Primary frame to impute.
    x_supp : pd.DataFrame or None
        Supplementary frame; its columns are relabeled to X's before fitting.
    neighbors : int
        Number of neighbors for the imputer.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame or None)
        Imputed X, and imputed x_supp (None when x_supp is None).
    """
    if x_supp is not None:
        # Align column labels so concat stacks rows instead of adding columns.
        x_supp.columns = X.columns
    imp = KNNImputer(missing_values=np.nan, weights='distance',
                     n_neighbors=neighbors)
    # pd.concat silently ignores None entries, so this also works for x_supp=None.
    imp.fit(pd.concat([X, x_supp], ignore_index=True))
    X_imputed = pd.DataFrame(imp.transform(X), columns=X.columns)
    if x_supp is None:
        # Bug fix: the original guarded only the column relabeling and then
        # unconditionally transformed x_supp, crashing when it was None.
        return X_imputed, None
    return X_imputed, pd.DataFrame(imp.transform(x_supp),
                                   columns=x_supp.columns)
def fillMissingValues(trainx_df, testx_df):
    """Fit a 2-NN imputer on the training frame and apply it to both frames.

    Returns two DataFrames carrying the original column labels; the test
    frame's index is reset to a fresh RangeIndex.
    """
    knn = KNNImputer(n_neighbors=2)
    knn.fit(trainx_df)
    train_filled = pd.DataFrame(knn.transform(trainx_df),
                                columns=trainx_df.columns)
    test_filled = pd.DataFrame(knn.transform(testx_df),
                               columns=testx_df.columns)
    test_filled.reset_index(drop=True, inplace=True)
    return train_filled, test_filled
def impute_regionidcity(train, validate, test):
    """Impute missing `regionidcity` values with a 5-NN imputer.

    The imputer is fit on the train split only, then the column is replaced
    in place on train, validate, and test (the input frames are mutated).
    Returns the fitted imputer followed by the three frames.
    """
    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(train[["regionidcity"]])
    for split in (train, validate, test):
        split["regionidcity"] = imputer.transform(split[["regionidcity"]])
    return imputer, train, validate, test
def sample_knn_prediction(matrix, test_data):
    """Returns knn prediction using sample of test_data.

    Fits one KNN imputer on a bootstrap sample of the rows (students) and a
    second on a bootstrap sample of the columns (items), averages the two
    imputed matrices, and thresholds the result at 0.5.
    """
    matrix_c = np.copy(matrix.T)
    # Generalization: derive bootstrap sizes from the matrix shape instead of
    # the hard-coded 542 students / 1774 items of one specific dataset.
    n_students = matrix.shape[0]
    idx = np.random.randint(n_students, size=n_students)
    nbsr = KNNImputer(n_neighbors=11)
    nbsr.fit(matrix[idx, :])
    mat_student = nbsr.transform(matrix)
    n_items = matrix_c.shape[0]
    idx = np.random.randint(n_items, size=n_items)
    nbsr = KNNImputer(n_neighbors=21)
    nbsr.fit(matrix_c[idx, :])
    mat_item = nbsr.transform(matrix_c).T
    mat_avg = (mat_item + mat_student) * 0.5
    return sparse_matrix_predictions(test_data, mat_avg, threshold=0.5)
class KNNReplacerIQR(KNNImputer):
    """Pipeline-compliant KNNReplacer, based on IQR.

    Per-column values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as
    missing and replaced by an internal KNNImputer fitted on the
    outlier-masked training data.
    """

    def __init__(self, n_neighbors=5):
        # Subclassing KNNImputer keeps sklearn's get_params/clone machinery
        # working; the actual imputation is delegated to the composed imputer.
        super().__init__(n_neighbors=n_neighbors)
        self.lower_bound = None  # per-column lower IQR fence, set in fit()
        self.upper_bound = None  # per-column upper IQR fence, set in fit()
        self.imputer = KNNImputer(n_neighbors=n_neighbors)

    def fit(self, x, y=None):
        """Computes IQR bound and fits the imputer on the data."""
        x = pd.DataFrame(x)
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound = q1 - (1.5 * iqr)
        self.upper_bound = q3 + (1.5 * iqr)
        # Mask outliers to NaN so the imputer learns from inliers only.
        self.imputer.fit(
            x.where(~((x < self.lower_bound) | (x > self.upper_bound)),
                    np.nan))
        return self

    def transform(self, x, y=None):
        """Detects outliers and replaces them with the imputer."""
        x = pd.DataFrame(x)
        # NOTE(review): inplace masking on a frame built from the input may
        # share memory with (and mutate) the caller's data — confirm intended.
        x.where(~((x < self.lower_bound) | (x > self.upper_bound)),
                np.nan,
                inplace=True)
        return self.imputer.transform(x)
class Data:
    """Loads the train/test CSVs, KNN-imputes and robust-scales the features,
    and one-hot encodes the `target_class` labels.

    All transformers are fit on the training split and re-applied to test.
    """

    def __init__(self):
        self.label_encoder = OneHotEncoder(sparse=False)
        self.imputer = KNNImputer()
        self.scaler = RobustScaler()
        # Training split: fit every transformer here.
        train = pd.read_csv('data/train_data.csv')
        features = self.imputer.fit_transform(train.iloc[:, :-1])
        self.X_train = self.scaler.fit_transform(features)
        labels = np.array(train['target_class']).reshape(-1, 1)
        self.Y_train = self.label_encoder.fit_transform(labels)
        # Test split: apply the already-fitted transformers only.
        test = pd.read_csv('data/test_data.csv')
        features = self.imputer.transform(test.iloc[:, :-1])
        self.X_test = self.scaler.transform(features)
        labels = np.array(test['target_class']).reshape(-1, 1)
        self.Y_test = self.label_encoder.transform(labels)

    def get_training_data(self):
        """Return (X_train, Y_train)."""
        return self.X_train, self.Y_train

    def get_test_data(self):
        """Return (X_test, Y_test)."""
        return self.X_test, self.Y_test
def imputeData(self, X, imputerModel=None):
    """KNN-impute X, creating a default KNNImputer when none is supplied.

    Note: a passed-in model is re-fit on X before transforming.
    Returns (imputed DataFrame with X's columns, fitted imputer).
    """
    model = KNNImputer() if imputerModel is None else imputerModel
    model.fit(X)
    X_imp = pd.DataFrame(model.transform(X), columns=X.columns)
    return (X_imp, model)
def experiment_setting_2(X, y, runs=5, missingness=0.1):
    """Repeated stratified-CV experiment on randomly-ablated data.

    For each of `runs` repetitions, a fresh random missingness mask is applied
    to X; within each fold a KNNImputer fit on the training fold imputes the
    test fold, a second imputer builds a partially-imputed training copy
    (only originally-missing cells filled), and an EIGDecisionTree is trained
    on the (raw, imputed) training pair. Returns per-fold accuracy scores.
    """
    results = []
    for i in range(runs):
        np.random.seed(i)  # reproducible missingness pattern per run
        X_missing = make_missing_random(X, missingness)
        ss = StratifiedKFold(shuffle=True, random_state=i)
        for train_index, test_index in ss.split(X, y):
            X_train = X_missing[train_index]
            y_train = y[train_index]
            # Test-fold imputation uses an imputer fit on the training fold only.
            imputer = KNNImputer()
            imputer.fit(X_train)
            X_test = imputer.transform(X_missing[test_index])
            y_test = y[test_index]
            knnimp = KNNImputer()
            X_knn_full_imputed = knnimp.fit_transform(X_train)
            # Keep imputed values only at the originally-missing positions;
            # observed positions remain NaN in X_train_imputed.
            X_train_imputed = np.ones(X_train.shape) * np.nan
            for idx in np.argwhere(np.isnan(X_train)):
                X_train_imputed[idx[0], idx[1]] = X_knn_full_imputed[idx[0], idx[1]]
            hdt = EIGDecisionTree(max_depth=20)
            hdt.fit(X_train, X_train_imputed, y_train)
            results.append(accuracy_score(hdt.predict(X_test), y_test))
            #print(get_depth(hdt.tree), get_size(hdt.tree))
    return results
def test_knn_imputer_removes_all_na_features(na):
    """A feature entirely missing at fit time must be dropped by transform."""
    X = np.array([
        [1, 1, na, 1, 1, 1.0],
        [2, 3, na, 2, 2, 2],
        [3, 4, na, 3, 3, na],
        [6, 4, na, na, 6, 6],
    ])
    knn = KNNImputer(missing_values=na, n_neighbors=2).fit(X)
    imputed = knn.transform(X)
    assert not np.isnan(imputed).any()
    assert imputed.shape == (4, 5)
    # The all-NaN column (index 2) must also be dropped from new data.
    X_test = np.arange(0, 12).reshape(2, 6)
    imputed = knn.transform(X_test)
    assert_allclose(X_test[:, [0, 1, 3, 4, 5]], imputed)
def mvt_knn(df):
    """Streamlit helper: report missingness, 1-NN impute float64 columns,
    mode-impute the remaining NaNs, and display summaries and charts.

    Returns the cleaned frame, or the original df if anything raises.
    """
    try:
        # NOTE(review): the denominator counts non-null cells, not total
        # cells, so the reported percentage is inflated — confirm intent.
        st.info("The Percenatge of Value Missing in Given Data is : {:.2f}%".
                format(((df.isna().sum().sum()) / (df.count().sum()) * 100)))
        num_col = list(df.select_dtypes(include='float64').columns)
        # add_indicator=True appends missing-indicator columns; only the
        # first len(num_col) columns are written back below.
        knn = KNNImputer(n_neighbors=1, add_indicator=True)
        knn.fit(df[num_col])
        knn_impute = pd.DataFrame(knn.transform(df[num_col]))
        df[num_col] = knn_impute.iloc[:, :df[num_col].shape[1]]
        clean_df = df
        # Remaining (categorical) NaNs are filled with each column's mode.
        clean_df = (df.fillna(df.mode().iloc[0]))
        st.dataframe(clean_df)
        st.write("\nEmpty rows after imputing the data: \n",
                 clean_df.isnull().sum())
        st.info("Numerical data : {}".format(list(dict(df.median()).keys())))
        st.info("Categorical data : {}".format(
            list(df.select_dtypes(include='object').mode())))
        st.write('Shape of dataframe (Rows, Columns): ', df.shape)
        st.write('Data description : ', df.describe())
        st.line_chart(clean_df)
        st.info(
            "Only Numerical Data is treated using K-NN Method , Categorical Data is trreated using Mode"
        )
        return clean_df
    except Exception as e:
        # Best-effort UI path: surface the error class and return the input.
        st.write("Oops!", e.__class__, "occurred.")
        return df
def handleNull():
    """Streamlit page: show null counts for categorical/numerical columns,
    then either KNN-impute numeric nulls or list z-score (>3) outliers.

    Relies on the module-level `df` DataFrame and streamlit (`st`).
    """
    st.write(df.head())
    col1, col2 = st.beta_columns(2)
    cat_data = df.select_dtypes(include=['object']).copy()
    col1.header("Categorical data: ")
    col1.write(cat_data.head())
    col1.write('Null values: ')
    col1.write(cat_data.isna().sum())
    num_data = df.select_dtypes(include=['int64','float64']).copy()
    col2.header("Numerical data: ")
    col2.write(num_data.head())
    action = st.sidebar.selectbox(
        label="Select the action",
        options=['Handle null values', 'Handle outliers'])
    if action == 'Handle null values':
        col2.write('Null values: ')
        col2.write(num_data.isna().sum())
        imputer = KNNImputer(n_neighbors=4)
        imputer.fit(num_data)
        Xtrans = imputer.transform(num_data)
        st.write("Imputed values: ")
        st.dataframe(Xtrans)
    elif action == 'Handle outliers':
        outliers = []
        # Fix: DataFrame.iteritems() was removed in pandas 2.0; items() is
        # the supported, behavior-identical replacement.
        for (columnName, columnData) in num_data.items():
            z = np.abs(stats.zscore(columnData.values))
            outliers.append(np.where(z > 3))
        st.write(outliers)
def impute(train_df, test_df):
    """KNN-impute the non-interview columns of train/test for one period.

    The imputer is fit on the training frame only; interview-named columns
    are copied through untouched and the original indexes are preserved.

    Parameters
    ----------
    train_df: dataframe
        feature names and interview-based names
    test_df: dataframe
        feature names and interview-based names

    Returns
    ------
    imputed dataframe train
    imputed dataframe test
    """
    knnimpute = KNNImputer(n_neighbors=ut.neighbors)
    feature_cols = [c for c in train_df.columns if not re.search('interview', c)]
    interview_cols = [c for c in train_df.columns if re.search('interview', c)]
    imputed_tr = pd.DataFrame(knnimpute.fit_transform(train_df[feature_cols]),
                              columns=feature_cols, index=train_df.index)
    imputed_ts = pd.DataFrame(knnimpute.transform(test_df[feature_cols]),
                              columns=feature_cols, index=test_df.index)
    for c in interview_cols:
        imputed_tr[c] = train_df[c]
        imputed_ts[c] = test_df[c]
    return imputed_tr, imputed_ts
class FeatureExtractor(BaseEstimator):
    """Imputes a mixed categorical/numerical frame with one-hot + KNNImputer.

    Categorical columns are dummified with a *_nan indicator that is spread
    back into NaNs across that category's dummy columns, so the KNN imputer
    treats the whole category as missing; transform() reverts the dummies to
    single categorical columns, then simplifies `pdays`/`previous` and drops
    `previous`/`loan`.
    """

    def __init__(self, imputer_neighbors: int = 5):
        self.imputer = KNNImputer(n_neighbors=imputer_neighbors)
        self.cat_cols = None
        self.num_cols = None

    def _dummify(self, X):
        """One-hot encode X; convert each *_nan indicator row into NaNs over
        that category's dummy columns so KNNImputer sees them as missing."""
        X_dummy = pd.get_dummies(X, dummy_na=True)
        for col in self.cat_cols:
            X_dummy.loc[X_dummy[col + "_nan"] == 1,
                        X_dummy.columns.str.startswith(col)] = np.nan
            del X_dummy[col + "_nan"]
        return X_dummy

    def fit(self, X, y=None):
        """Learn the categorical/numerical column split and fit the imputer."""
        self.cat_cols = [
            column_name for column_name in X.columns
            if str(X[column_name].dtype) == 'object'
        ]
        self.num_cols = [
            column_name for column_name in X.columns
            if column_name not in self.cat_cols
        ]
        X[self.cat_cols] = X[self.cat_cols].astype('category')
        self.imputer.fit(self._dummify(X))
        # Bug fix: fit() must return self to satisfy the sklearn estimator
        # contract (pipelines chain fit(X).transform(X)); original returned None.
        return self

    def transform(self, X):
        """Impute X and return a frame with the original column layout."""
        X_dummy = self._dummify(X)
        X_dummy = pd.DataFrame(self.imputer.transform(X_dummy.values),
                               columns=X_dummy.columns)
        # Revert dummification: the argmax dummy column names the category.
        for col in self.cat_cols:
            X_dummy[col] = X_dummy.loc[:, X_dummy.columns.str.
                                       startswith(col)].idxmax(
                                           axis=1).str.replace(col + "_", '')
            X_dummy = X_dummy.loc[:, ~X_dummy.columns.str.startswith(col + "_")]
        # Reset categorical column types.
        X_dummy[self.cat_cols] = X_dummy[self.cat_cols].astype('category')
        # Simplify pdays & previous into binary indicators.
        X_dummy.pdays = np.where(X_dummy.pdays != 999., 1, 0)
        X_dummy.previous = np.where(X_dummy.previous >= 1., 1, 0)
        X_dummy.drop(columns=['previous','loan'], inplace=True)
        return X_dummy
def perform_imputation(X, imputer=None):
    """Impute X with the given imputer, or a fresh 5-NN uniform imputer.

    Note: a supplied imputer is also re-fit on X before transforming.
    Returns (imputed DataFrame with X's columns, the imputer used).
    """
    feature_names = X.columns
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5,
                             weights='uniform',
                             metric='nan_euclidean')
    imputer.fit(X)
    X = pd.DataFrame(imputer.transform(X), columns=feature_names)
    return X, imputer
def missing_data_imputer(X: pd.DataFrame) -> pd.DataFrame:
    """Return X with missing values filled by a default (n=5) KNNImputer,
    keeping X's column labels."""
    knn = KNNImputer()
    knn.fit(X)
    filled = pd.DataFrame(knn.transform(X))
    filled.columns = X.columns
    return filled
def test_knn_imputer_drops_all_nan_features(na):
    """A feature that is all-missing at fit time is dropped on transform."""
    X1 = np.array([[na, 1], [na, 2]])
    knn = KNNImputer(missing_values=na, n_neighbors=1)
    assert_allclose(knn.fit_transform(X1), np.array([[1], [2]]))
    # On new data the dropped column vanishes and remaining NaNs are imputed.
    X2 = np.array([[1, 2], [3, na]])
    assert_allclose(knn.transform(X2), np.array([[2], [1.5]]))
def impute_knn():
    """Small demo: fit a 2-NN uniform imputer on X_train, impute X_test,
    and print both."""
    imp = KNNImputer(n_neighbors=2, weights="uniform")
    X_train = [[1, 2], [np.nan, 3], [7, 6]]
    imp.fit(X_train)
    X_test = [[np.nan, 2], [6, np.nan], [7, 6]]
    print("X_train")
    print(X_train)
    print("imputed X_test")
    print(imp.transform(X_test))
def KNNimpute_DF(df):
    """Fill missing values in df with a 10-NN imputer.

    Returns a DataFrame whose column labels and index match the input
    (the original rebuilt the frame without metadata, silently renaming
    every column to 0..n-1).
    """
    imputer_knn = KNNImputer(n_neighbors=10)
    imputer_knn.fit(df)
    imputed = imputer_knn.transform(df)
    # Preserve the caller's column names and index on the rebuilt frame.
    return pd.DataFrame(imputed, columns=df.columns, index=df.index)
def impute_missing(df, type='knn'):
    """Impute df with the requested strategy.

    Parameters
    ----------
    df : pd.DataFrame
    type : str
        'knn' for KNNImputer or 'iterative' for IterativeImputer.

    Returns
    -------
    pd.DataFrame with the original index and columns.

    Raises
    ------
    ValueError
        For an unrecognized `type` (the original fell through to an
        unhelpful NameError on the unbound `imputer`).
    """
    if type == 'knn':
        imputer = KNNImputer()
    elif type == 'iterative':
        imputer = IterativeImputer(random_state=0)
    else:
        raise ValueError(f"unknown imputation type: {type!r}")
    imputer.fit(df)
    imputed_df = imputer.transform(df)
    return pd.DataFrame(imputed_df, index=df.index, columns=df.columns)
def test_knn_imputer_not_enough_valid_distances(na, weights):
    """When the needed feature yields only nan distances, the imputer must
    fall back to the column mean."""
    X1 = np.array([[na, 11], [na, 1], [3, na]])
    expected1 = np.array([[3, 11], [3, 1], [3, 6]])
    knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
    assert_allclose(knn.fit_transform(X1), expected1)
    assert_allclose(knn.transform(np.array([[4, na]])), np.array([[4, 6]]))
def predict(givencity): givencity = city_day[(city_day.AQI.notnull()) & (city_day.City == givencity)] #tell_me_null(givencity) corr = givencity.corr().AQI.sort_values(ascending=False) related = list(corr[corr > 0.6].index) #print(related) inter = givencity.loc[:, related].interpolate(method='linear') givencity.loc[:, related] = inter knn_imputer = KNNImputer(n_neighbors=3) imputing_cols = [ 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'B_X_O3_NH3', 'ParticulateMatters' ] # we eliminated city, date, Year_Month and AQI_Bucket because # they either were unique or had numerical substitutes in other fields(AQI_bucket) knn_imputer.fit(givencity[imputing_cols]) imputed = knn_imputer.transform(givencity[imputing_cols]) #givencity.loc[:, imputing_cols] = imputed #tell_me_null(givencity) givencity_aqi = givencity[['Date', 'AQI']] givencity_aqi.reset_index(inplace=True, drop=True) train_df = givencity_aqi train_df.rename(mapper={'Date': 'ds', 'AQI': 'y'}, axis=1, inplace=True) train_df m = Prophet(holidays_prior_scale=0, seasonality_prior_scale=20, n_changepoints=50) m.fit(train_df) future = m.make_future_dataframe(periods=365) #future.tail() forecast = m.predict(future) forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail() #df_cv = cross_validation(m, initial='1100 days', period='121 days', horizon = '365 days') #df_p = performance_metrics(df_cv) #print('Cross Validation accuracy:', (1 - df_p['mape'].mean())*100) predictions_df = pd.DataFrame(forecast, columns=['ds', 'yhat']) return predictions_df, m
def imputation(train_data, test_data):
    '''Substitute NaN values instead of deleting them.

    A 3-NN uniform imputer is fit on the train set only and then applied to
    both sets, so the model is never informed by the test data.

    :param train_data: training set
    :param test_data: test set
    :return: (imputed_train, imputed_test) as numpy arrays
    '''
    knn = KNNImputer(n_neighbors=3, weights="uniform")
    imputed_train = knn.fit_transform(train_data)
    return imputed_train, knn.transform(test_data)
def remove_missing(df, missing_type=np.nan, nan_threshold=40, impute=False):
    """Drop columns whose percentage of `missing_type` values meets or exceeds
    nan_threshold; optionally KNN-impute the remaining missing values.

    Returns the reduced (and possibly imputed) DataFrame with its original
    index and column labels.
    """
    percentages = get_percentages(df, missing_type)
    keep = percentages[percentages['percent_missing'] < nan_threshold].index.tolist()
    df = df[keep]
    if impute:
        knn = KNNImputer()
        knn.fit(df)
        df = pd.DataFrame(knn.transform(df), index=df.index, columns=df.columns)
    return df
def test_onnxt_knnimputer(self):
    """KNNImputer exported to ONNX must reproduce sklearn's transform."""
    train = numpy.array([[1, 2, numpy.nan, 12], [3, numpy.nan, 3, 13],
                         [1, 4, numpy.nan, 1], [numpy.nan, 4, 3, 12]],
                        dtype=numpy.float32)
    test = numpy.array(
        [[1.3, 2.4, numpy.nan, 1], [-1.3, numpy.nan, 3.1, numpy.nan]],
        dtype=numpy.float32)
    imputer = KNNImputer(n_neighbors=3, metric='nan_euclidean')
    imputer.fit(train)
    model_def = to_onnx(imputer, train)
    session = OnnxInference(model_def, runtime='python')
    got = session.run({'X': test})
    self.assertEqual(list(sorted(got)), ['variable'])
    self.assertEqualArray(imputer.transform(test), got['variable'], decimal=6)
class KNNKeepDf(BaseEstimator, TransformerMixin):
    """KNN imputer, but returns DF and retains column names"""

    def __init__(self):
        self.colnames_ = []
        self.knn = KNNImputer()

    def fit(self, X, y=None):
        """Remember X's column labels and fit the wrapped imputer."""
        self.colnames_ = X.columns
        self.knn.fit(X)
        return self

    def transform(self, X, y=None, **fit_params):
        """Impute X and rewrap the result as a DataFrame."""
        return pd.DataFrame(self.knn.transform(X), columns=self.colnames_)
def preprocess_with_knn_imputer_minmax_scaler(
    train_data: np.ndarray,
    test_data: np.ndarray,
    n_neighbors: int = 5,
) -> Tuple[np.ndarray, np.ndarray]:
    """KNN-impute then min-max scale train/test arrays.

    Both transformers are fit on the training data only and applied to test.
    Returns the scaled (train, test) pair.
    """
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_train = imputer.fit_transform(train_data)
    imputed_test = imputer.transform(test_data)
    scaler = MinMaxScaler()
    scaled_train = scaler.fit_transform(imputed_train)
    scaled_test = scaler.transform(imputed_test)
    return scaled_train, scaled_test
def impute_values(df, imp_strategy, neighbors, numeric_vars):
    """Impute df's numeric columns ('knn' or any SimpleImputer strategy) and
    rejoin them with the untouched non-numeric columns.

    NOTE(review): the convert_to_numeric result is immediately overwritten by
    the .to_numpy() line below — it is dead unless convert_to_numeric mutates
    df in place; confirm the intent.
    """
    X = convert_to_numeric(df, numeric_vars)
    X = df[numeric_vars].to_numpy()
    other_vars = list(set(df.columns) - set(numeric_vars) )
    # Non-numeric columns pass through untouched (index reset for the join).
    X_strings = df[other_vars].reset_index(drop=True)
    if imp_strategy == "knn":
        imputer = KNNImputer(n_neighbors = neighbors) #weights = weight_type
        imputed = imputer.fit_transform(X)
        # This is very costly
        # from here https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
        # https://impyute.readthedocs.io/en/master/api/cross_sectional_imputation.html
        # imputed = fast_knn(X, k= neighbors)
    else:
        # Any other strategy string is passed straight to SimpleImputer.
        imputer = SimpleImputer(missing_values = np.nan, strategy = imp_strategy)
        imputer.fit(X)
        imputed = imputer.transform(X)
    X_imputed = pd.DataFrame.from_records(imputed, columns = numeric_vars)
    rv = X_strings.join(X_imputed)
    return rv
def naKNN(train_x, test_x):
    """Replace missing values in the training and test sets with KNNImputer().

    The fitted imputer is persisted to 'imputer.pkl' for the later final
    evaluation.

    :param train_x: training set wrapper (exposes a .data DataFrame)
    :param test_x: test set wrapper, or None
    :return: None
    """
    getNaCount(train_x)  # report the NaN counts for the training set
    imputer = KNNImputer(n_neighbors=3)
    filled = imputer.fit_transform(train_x.data)
    train_x.data = pd.DataFrame(filled, columns=train_x.data.columns)
    # Persist the fitted imputer for the final test run.
    save_object(
        imputer, 'imputer.pkl'
    )
    if test_x is not None:
        test_x.data = pd.DataFrame(imputer.transform(test_x.data),
                                   columns=test_x.data.columns)
def cv_preprocessing(X_train, X_test=None, random_state=None):
    """CV-fold preprocessing: KNN-impute, then clip outliers per feature group.

    Feature-group definitions are loaded from variables.json; the imputer is
    fit on X_train only and applied to X_test when provided.

    Returns (X_train, X_test) when X_test is not None, otherwise X_train.
    NOTE(review): `random_state` is currently unused.
    """
    variables_path = r"variables.json"
    with open(variables_path) as f:
        variables = json.load(f)
    t1_features, cogni = variables['t1_features'], variables['cogni']
    pcl = variables['questionnaires']['PCL'][:17]
    # Despite the name, `mice` is a KNNImputer.
    mice = KNNImputer()
    columns = X_train.columns
    X_train = pd.DataFrame(mice.fit_transform(X_train), columns=columns)
    #X_train = stds(X_train)
    #X_train = stats(X_train)
    #X_train = removal_correlated(X_train)
    # ss = StandardScaler()
    # X_train = ss.fit_transform(X_train)
    # X_train = pd.DataFrame(ss.fit_transform(X_train), columns=columns)
    if X_test is not None:
        X_test = pd.DataFrame(mice.transform(X_test), columns=columns)
        #X_test = stds(X_test)
        #X_test = stats(X_train, X_test)
        #_, X_test = removal_correlated(X_train, X_test)
        # X_test = ss.transform(X_test)
        # X_test = pd.DataFrame(ss.transform(X_test), columns=columns)
        # Outlier clipping is applied group-by-group (PHQ-9 items, cognitive
        # features, T1 features); the PCL group is currently disabled.
        X_train, X_test = outliers(
            X_train,
            X_test,
            features=[f"T1q5.{i}" for i in range(1, 10)],
            name='phq9')
        #X_train, X_test = outliers(X_train, X_test, features=pcl, name='PCL')
        X_train, X_test = outliers(X_train, X_test, features=cogni, name='cogni')
        X_train, X_test = outliers(X_train, X_test, features=t1_features, name='t1')
        return X_train, X_test
    else:
        return X_train