def test_reshape_arr_1d_type(arr):
    """
    Tests the function `cosmo_utils.utils.gen_utils.reshape_arr_1d`
    for input types.

    Parameters
    -----------
    arr : `numpy.ndarray` or array-like
        Array to be converted into a 1-dimensional array.
    """
    ## Testing input types
    with pytest.raises(TypeError):
        gen_utils.reshape_arr_1d(arr)
def test_reshape_arr_1d_shape(arr_shape, expected_shape):
    """
    Tests the function `cosmo_utils.utils.gen_utils.reshape_arr_1d`
    for output shapes.

    Parameters
    -----------
    arr_shape : `tuple` or `int`
        Shape of the array to create.

    expected_shape : `int` or `tuple`
        Expected shape of the output array.
    """
    # Creating array
    arr = np.random.random(arr_shape)
    # Checking result with function
    arr_out = gen_utils.reshape_arr_1d(arr)
    # Checking shape
    assert(arr_out.shape == expected_shape)
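# --- Illustrative sketch (not part of the original test-suite) --------------
# The arguments `arr`, `arr_shape`, and `expected_shape` of the two tests
# above are expected to be supplied via `pytest.mark.parametrize` elsewhere in
# this module. The minimal, self-contained test below is a hedged example of
# the behaviour being exercised, using an assumed (4, 1) -> (4,) reshape case
# and the same `np` / `gen_utils` imports as the tests above.
def test_reshape_arr_1d_shape_example():
    """Hedged example: a (4, 1) column vector is flattened to shape (4,)."""
    # Column vector with 4 rows (illustrative values only)
    arr = np.random.random((4, 1))
    # Reshaping into a 1-dimensional array
    arr_out = gen_utils.reshape_arr_1d(arr)
    # The output is expected to be 1-dimensional
    assert(arr_out.shape == (4,))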
def data_preprocessing(feat_arr, pre_opt='min_max', reshape=False):
    """
    Preprocesses the data in order to clean it and make it more suitable
    for the machine learning algorithms.

    Parameters
    -----------
    feat_arr : `numpy.ndarray`, `list`, `pandas.DataFrame`
        Array of feature values. This array is used for training a
        ML algorithm.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`. Options:
            - 'min_max' : Turns `feat_arr` to values between (0,1)
            - 'standard' : Uses the `~sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `~sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    reshape : `bool`, optional
        If True, it reshapes `feat_arr` into a 1d array if its shape is
        equal to (ncols, 1), where `ncols` is the number of columns.
        This variable is set to `False` by default.

    Returns
    -----------
    feat_arr_scaled : `numpy.ndarray`
        Rescaled version of `feat_arr` based on the choice of `pre_opt`.

    Notes
    -----------
    For more information on how to pre-process your data, see
    http://scikit-learn.org/stable/modules/preprocessing.html
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `feat_arr`
    feat_arr_type_valid = (list, np.ndarray, pd.DataFrame)
    if not (isinstance(feat_arr, feat_arr_type_valid)):
        msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
            file_msg, type(feat_arr))
        raise LSSUtils_Error(msg)
    # `pre_opt`
    pre_opt_valid = ['min_max', 'standard', 'normalize', 'no']
    if not (pre_opt in pre_opt_valid):
        msg = '{0} `pre_opt` ({1}) is not a valid input'.format(
            file_msg, pre_opt)
        raise LSSUtils_Error(msg)
    ##
    ## Reshaping `feat_arr`
    if reshape:
        feat_arr = gu.reshape_arr_1d(feat_arr)
    ##
    ## Scaling `feat_arr`
    if (pre_opt == 'min_max'):
        # Scaler
        scaler = skpre.MinMaxScaler(feature_range=(0, 1))
        # Rescaling
        feat_arr_scaled = scaler.fit_transform(feat_arr)
    ## Standardize Data
    if (pre_opt == 'standard'):
        # Scaler
        scaler = skpre.StandardScaler().fit(feat_arr)
        # Rescaling
        feat_arr_scaled = scaler.transform(feat_arr)
    ## Normalize Data
    if (pre_opt == 'normalize'):
        # Scaler
        scaler = skpre.Normalizer().fit(feat_arr)
        # Rescaling
        feat_arr_scaled = scaler.transform(feat_arr)
    ## No Preprocessing
    if (pre_opt == 'no'):
        feat_arr_scaled = feat_arr

    return feat_arr_scaled
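# --- Illustrative usage sketch (not part of the original module) ------------
# A hedged, minimal example of calling `data_preprocessing`, assuming the
# module-level import `numpy as np` used throughout this file. The feature
# values are made up for illustration only.
def _demo_data_preprocessing():
    """Hedged usage sketch for `data_preprocessing` (illustrative only)."""
    # Two-column feature array with arbitrary values
    feat_arr = np.array([[1.0, 200.0],
                         [2.0, 400.0],
                         [3.0, 600.0]])
    # 'min_max' rescales each column to the (0, 1) range
    feat_arr_scaled = data_preprocessing(feat_arr, pre_opt='min_max')
    # After scaling, each column spans the full [0, 1] interval
    assert (feat_arr_scaled.min() == 0.0) and (feat_arr_scaled.max() == 1.0)
    return feat_arr_scaled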
def train_test_dataset(pred_arr, feat_arr, pre_opt='min_max',
    shuffle_opt=True, random_state=0, test_size=0.25, reshape=False,
    return_idx=False):
    """
    Function to create the training and testing datasets for a given set
    of features array and predicted array.

    Parameters
    -----------
    pred_arr : `pandas.DataFrame`, `numpy.ndarray` or array-like, shape (n_samples, n_outcomes)
        Array consisting of the `predicted` values. The dimensions of
        `pred_arr` are `n_samples` by `n_outcomes`, where `n_samples` is
        the number of observations, and `n_outcomes` the number of
        predicted outcomes.

    feat_arr : `numpy.ndarray`, `pandas.DataFrame` or array-like, shape (n_samples, n_features)
        Array consisting of the `feature` values. The dimensions of
        `feat_arr` are `n_samples` by `n_features`, where `n_samples` is
        the number of observations, and `n_features` the number of
        features used.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`. Options:
            - 'min_max' : Turns `feat_arr` to values between (0,1)
            - 'standard' : Uses the `sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    shuffle_opt : `bool`, optional
        If True, the data is shuffled before splitting into the testing
        and training datasets. This variable is set to True by default.

    random_state : `int`, optional
        Random state used when splitting into the training and testing
        datasets. If set, the split will always use the same seed
        `random_state`. This variable is set to `0` by default.

    test_size : `float`, optional
        Fraction of the catalogue that goes into the `testing` dataset.
        This variable must be between (0,1). This variable is set to
        `0.25` by default.

    reshape : `bool`, optional
        If True, it reshapes `feat_arr` into a 1d array if its shape is
        equal to (ncols, 1), where `ncols` is the number of columns.
        This variable is set to `False` by default.

    return_idx : `bool`, optional
        If `True`, it returns the indices of the `training` and `testing`
        datasets. This variable is set to `False` by default.

    Returns
    -----------
    train_dict : `dict`
        Dictionary containing the `training` data from the catalogue.

    test_dict : `dict`
        Dictionary containing the `testing` data from the catalogue.

    See also
    -----------
    data_preprocessing : Function to preprocess a dataset.
""" file_msg = fd.Program_Msg(__file__) ## Checking input parameters # `pred_arr` pred_arr_type_valid = (list, np.ndarray, pd.DataFrame) if not (isinstance(pred_arr, pred_arr_type_valid)): msg = '{0} `pred_arr` ({1}) is not a valid input type'.format( file_msg, type(pred_arr)) raise LSSUtils_Error(msg) # `feat_arr` feat_arr_type_valid = (list, np.ndarray, pd.DataFrame) if not (isinstance(feat_arr, feat_arr_type_valid)): msg = '{0} `feat_arr` ({1}) is not a valid input type'.format( file_msg, type(feat_arr)) raise LSSUtils_Error(msg) # `pre_opt` pre_opt_valid = ['min_max', 'standard', 'normalize', 'no'] if not (pre_opt in pre_opt_valid): msg = '{0} `pre_opt` ({1}) is not a valid input'.format( file_msg, pre_opt) raise LSSUtils_Error(msg) # `shuffle_opt` shuffle_opt_type_valid = (bool) if not (isinstance(shuffle_opt, shuffle_opt_type_valid)): msg = '{0} `shuffle_opt` ({1}) is not a valid input type'.format( file_msg, type(shuffle_opt)) raise LSSUtils_Error(msg) # `random_state` random_state_type_valid = (int) if not (isinstance(random_state, random_state_type_valid)): msg = '{0} `random_state` ({1}) is not a valid input'.format( file_msg, random_state) raise LSSUtils_Error(msg) # `test_size` if not ((test_size > 0) and (test_size < 1.)): msg = '{0} `test_size` ({1}) must be in range (0,1)'.format( file_msg, test_size) raise LSSUtils_Error(msg) ## ## Checking indices of `pred_arr` and `feat_arr` if return_idx: # If object is a DataFrame if (isinstance(pred_arr, pd.DataFrame) and isinstance(feat_arr, pd.DataFrame)): pred_arr_idx = pred_arr.index.values feat_arr_idx = feat_arr.index.values else: pred_arr_idx = np.arange(len(pred_arr)) feat_arr_idx = np.arange(len(feat_arr)) # Reshaping if necessary if reshape: pred_arr_idx = gu.reshape_arr_1d(pred_arr_idx) feat_arr_idx = gu.reshape_arr_1d(feat_arr_idx) ## ## Checking dimensions of `pred_arr` and `feat_arr` pred_arr = np.asarray(pred_arr) feat_arr = np.asarray(feat_arr) # Dimensions if reshape: pred_arr = gu.reshape_arr_1d(pred_arr) feat_arr = gu.reshape_arr_1d(feat_arr) # Shape if (len(pred_arr) != len(feat_arr)): msg = '{0} The shape of `pred_arr` ({1}) and `feat_arr` ({2}) must ' msg += 'have the same length' msg = msg.format(file_msg, len(pred_arr), len(feat_arr)) raise LSSUtils_Error(msg) ## ## Rescaling Dataset feat_arr_scaled = data_preprocessing(feat_arr, pre_opt=pre_opt, reshape=reshape) ## ## Splitting into `Training` and `Testing` datasets. # Scaled (X_train, X_test, Y_train, Y_test) = skms.train_test_split(feat_arr_scaled, pred_arr, test_size=test_size, shuffle=shuffle_opt, random_state=random_state) # Not-scaled (X_train_ns, X_test_ns, Y_train_ns, Y_test_ns) = skms.train_test_split(feat_arr, pred_arr, test_size=test_size, shuffle=shuffle_opt, random_state=random_state) # Returning indices if necessary if return_idx: # Splitting to `training` and `testing` (X_train_idx, X_test_idx, Y_train_idx, Y_test_idx) = skms.train_test_split(feat_arr_idx, pred_arr_idx, test_size=test_size, shuffle=shuffle_opt, random_state=random_state) if not (np.array_equal(X_train_idx, Y_train_idx) and np.array_equal(X_test_idx, Y_test_idx)): msg = '{0} Index arrays are not equal to each other!' 
            raise LSSUtils_Error(msg)
    ##
    ## Assigning `training` and `testing` datasets to dictionaries
    # Saving indices if necessary
    if return_idx:
        # Adding 'indices' to dictionaries
        train_dict = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_train_ns': X_train_ns,
            'Y_train_ns': Y_train_ns,
            'train_idx': X_train_idx}
        test_dict = {
            'X_test': X_test,
            'Y_test': Y_test,
            'X_test_ns': X_test_ns,
            'Y_test_ns': Y_test_ns,
            'test_idx': X_test_idx}
    else:
        train_dict = {
            'X_train': X_train,
            'Y_train': Y_train,
            'X_train_ns': X_train_ns,
            'Y_train_ns': Y_train_ns}
        test_dict = {
            'X_test': X_test,
            'Y_test': Y_test,
            'X_test_ns': X_test_ns,
            'Y_test_ns': Y_test_ns}

    return train_dict, test_dict
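# --- Illustrative usage sketch (not part of the original module) ------------
# A hedged, minimal example of splitting a feature/outcome pair with
# `train_test_dataset`, assuming the module-level import `numpy as np` used
# throughout this file. Shapes and values are made up for illustration only.
def _demo_train_test_dataset():
    """Hedged usage sketch for `train_test_dataset` (illustrative only)."""
    rng = np.random.RandomState(0)
    # 100 observations with 3 features each, and one outcome per observation
    feat_arr = rng.random_sample((100, 3))
    pred_arr = rng.random_sample(100)
    # Standard-scaled features, 75/25 train/test split
    train_dict, test_dict = train_test_dataset(pred_arr, feat_arr,
                                               pre_opt='standard',
                                               test_size=0.25,
                                               random_state=0)
    # `X_train` holds the scaled features and `Y_train` the outcomes
    assert (len(train_dict['X_train']) == 75)
    assert (len(test_dict['X_test']) == 25)
    return train_dict, test_dict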