Example #1
0
def test_reshape_arr_1d_type(arr):
    """
    Checks that `cosmo_utils.utils.gen_utils.reshape_arr_1d` raises a
    `TypeError` for the given input.

    Parameters
    -----------
    arr : object
        Input handed to `reshape_arr_1d`. The call is expected to fail
        with a `TypeError`.
    """
    # The test fails unless the call raises `TypeError`
    pytest.raises(TypeError, gen_utils.reshape_arr_1d, arr)
Example #2
0
def test_reshape_arr_1d_shape(arr_shape, expected_shape):
    """
    Checks the shape of the array returned by
    `cosmo_utils.utils.gen_utils.reshape_arr_1d`.

    Parameters
    -----------
    arr_shape : `tuple` or `int`
        Shape of the random input array that is built for the test.

    expected_shape : `int` or `tuple`
        Value that the `shape` attribute of the output is compared to.
    """
    # Random array of the requested shape, passed straight to the function
    reshaped = gen_utils.reshape_arr_1d(np.random.random(arr_shape))
    # The output shape has to match the expectation exactly
    assert reshaped.shape == expected_shape
Example #3
0
def data_preprocessing(feat_arr, pre_opt='min_max', reshape=False):
    """
    Rescales an array of features so that it is better suited as the
    input of a machine learning algorithm.

    Parameters
    -----------
    feat_arr : `numpy.ndarray`, `list`, `pandas.DataFrame`
        Array of feature values to rescale.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Kind of preprocessing applied to `feat_arr`.

        Options:
            - 'min_max' : `~sklearn.preprocessing.MinMaxScaler` with a
                          feature range of (0, 1)
            - 'standard' : `~sklearn.preprocessing.StandardScaler`
            - 'normalize' : `~sklearn.preprocessing.Normalizer`
            - 'no' : `feat_arr` is handed back without rescaling

    reshape : `bool`, optional
        If True, `feat_arr` is first passed through `gu.reshape_arr_1d`.
        This variable is set to `False` by default.

    Returns
    -----------
    feat_arr_scaled : `numpy.ndarray`
        Rescaled version of `feat_arr`. For `pre_opt == 'no'` this is
        `feat_arr` itself (after the optional reshaping).

    Raises
    -----------
    LSSUtils_Error
        If `feat_arr` has an unsupported type or `pre_opt` is not one of
        the valid options.

    Notes
    -----------
    For more information on how to pre-process your data, see
    `http://scikit-learn.org/stable/modules/preprocessing.html`_.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Validating the inputs
    # `feat_arr` has to be one of the supported containers
    if not isinstance(feat_arr, (list, np.ndarray, pd.DataFrame)):
        raise LSSUtils_Error(
            '{0} `feat_arr` ({1}) is not a valid input type'.format(
                file_msg, type(feat_arr)))
    # `pre_opt` has to be one of the known options
    if pre_opt not in ['min_max', 'standard', 'normalize', 'no']:
        raise LSSUtils_Error(
            '{0} `pre_opt` ({1}) is not a valid input'.format(
                file_msg, pre_opt))
    ##
    ## Optional reshaping of `feat_arr`
    if reshape:
        feat_arr = gu.reshape_arr_1d(feat_arr)
    ##
    ## Rescaling `feat_arr` according to `pre_opt`
    if pre_opt == 'min_max':
        min_max_scaler = skpre.MinMaxScaler(feature_range=(0, 1))
        return min_max_scaler.fit_transform(feat_arr)
    if pre_opt == 'standard':
        return skpre.StandardScaler().fit(feat_arr).transform(feat_arr)
    if pre_opt == 'normalize':
        return skpre.Normalizer().fit(feat_arr).transform(feat_arr)
    # Only 'no' is left after the validation above: no preprocessing
    return feat_arr
Example #4
0
def train_test_dataset(pred_arr,
                       feat_arr,
                       pre_opt='min_max',
                       shuffle_opt=True,
                       random_state=0,
                       test_size=0.25,
                       reshape=False,
                       return_idx=False):
    """
    Function to create the training and testing datasets for a given set
    of features array and predicted array.

    Parameters
    -----------
    pred_arr : `pandas.DataFrame` `numpy.ndarray` or array-like, shape (n_samples, n_outcomes)
        Array consisting of the `predicted values`. The dimensions of
        `pred_arr` are `n_samples` by `n_outcomes`, where `n_samples` is the
        number of observations, and `n_outcomes` the number of predicted
        outcomes.

    feat_arr : `numpy.ndarray`, `pandas.DataFrame` or array-like, shape (n_samples, n_features)
        Array consisting of the `feature values`. The dimensions of
        `feat_arr` are `n_samples` by `n_features`, where `n_samples`
        is the number of observations, and `n_features` the number of
        features used.

    pre_opt : {'min_max', 'standard', 'normalize', 'no'} `str`, optional
        Type of preprocessing to do on `feat_arr`.

        Options:
            - 'min_max' : Turns `feat_arr` to values between (0,1)
            - 'standard' : Uses `sklearn.preprocessing.StandardScaler` method
            - 'normalize' : Uses the `sklearn.preprocessing.Normalizer` method
            - 'no' : No preprocessing on `feat_arr`

    shuffle_opt : `bool`, optional
        If True, the data is shuffled before splitting into testing and
        training datasets. This variable is set to True by default.

    random_state : int, optional
        Random state number used for when splitting into training and
        testing datasets. If set, it will always have the same seed
        `random_state`. This variable is set to `0` by default.

    test_size : float, optional
        Percentage of the catalogue that represents the `test` size of
        the testing dataset. This variable must be between (0,1).
        This variable is set to `0.25` by default.

    reshape : `bool`, optional
        If True, `pred_arr` and `feat_arr` (and their index arrays) are
        passed through `gu.reshape_arr_1d` before splitting.
        This variable is set to `False` by default.

    return_idx : `bool`, optional
        If `True`, it returns the indices of the `training` and `testing`
        datasets. This variable is set to `False` by default.

    Returns
    -----------
    train_dict : `dict`
        Dictionary containing the `training` data from the catalogue.
        Keys: 'X_train', 'Y_train' (split of the rescaled features),
        'X_train_ns', 'Y_train_ns' (split of the non-rescaled features)
        and, if `return_idx` is True, 'train_idx'.

    test_dict : `dict`
        Dictionary containing the `testing` data from the catalogue.
        Keys: 'X_test', 'Y_test', 'X_test_ns', 'Y_test_ns' and, if
        `return_idx` is True, 'test_idx'.

    Raises
    -----------
    LSSUtils_Error
        If any input fails validation, if `pred_arr` and `feat_arr` have
        different lengths, or if the index arrays of the two inputs do not
        agree after splitting.

    See also
    -----------
    data_preprocessing : Function to preprocess a dataset.
    """
    file_msg = fd.Program_Msg(__file__)
    ## Checking input parameters
    # `pred_arr`
    pred_arr_type_valid = (list, np.ndarray, pd.DataFrame)
    if not isinstance(pred_arr, pred_arr_type_valid):
        msg = '{0} `pred_arr` ({1}) is not a valid input type'.format(
            file_msg, type(pred_arr))
        raise LSSUtils_Error(msg)
    # `feat_arr`
    feat_arr_type_valid = (list, np.ndarray, pd.DataFrame)
    if not isinstance(feat_arr, feat_arr_type_valid):
        msg = '{0} `feat_arr` ({1}) is not a valid input type'.format(
            file_msg, type(feat_arr))
        raise LSSUtils_Error(msg)
    # `pre_opt`
    pre_opt_valid = ['min_max', 'standard', 'normalize', 'no']
    if pre_opt not in pre_opt_valid:
        msg = '{0} `pre_opt` ({1}) is not a valid input'.format(
            file_msg, pre_opt)
        raise LSSUtils_Error(msg)
    # `shuffle_opt`
    if not isinstance(shuffle_opt, bool):
        msg = '{0} `shuffle_opt` ({1}) is not a valid input type'.format(
            file_msg, type(shuffle_opt))
        raise LSSUtils_Error(msg)
    # `random_state`
    if not isinstance(random_state, int):
        msg = '{0} `random_state` ({1}) is not a valid input'.format(
            file_msg, random_state)
        raise LSSUtils_Error(msg)
    # `test_size`
    if not ((test_size > 0) and (test_size < 1.)):
        msg = '{0} `test_size` ({1}) must be in range (0,1)'.format(
            file_msg, test_size)
        raise LSSUtils_Error(msg)
    ##
    ## Checking indices of `pred_arr` and `feat_arr`
    # These must be extracted before the inputs are converted to plain
    # arrays below, since the conversion drops the DataFrame index.
    if return_idx:
        # If both objects are DataFrames, use their own indices
        if (isinstance(pred_arr, pd.DataFrame)
                and isinstance(feat_arr, pd.DataFrame)):
            pred_arr_idx = pred_arr.index.values
            feat_arr_idx = feat_arr.index.values
        else:
            # Otherwise fall back to positional indices
            pred_arr_idx = np.arange(len(pred_arr))
            feat_arr_idx = np.arange(len(feat_arr))
        # Reshaping if necessary
        if reshape:
            pred_arr_idx = gu.reshape_arr_1d(pred_arr_idx)
            feat_arr_idx = gu.reshape_arr_1d(feat_arr_idx)
    ##
    ## Checking dimensions of `pred_arr` and `feat_arr`
    pred_arr = np.asarray(pred_arr)
    feat_arr = np.asarray(feat_arr)
    # Dimensions
    if reshape:
        pred_arr = gu.reshape_arr_1d(pred_arr)
        feat_arr = gu.reshape_arr_1d(feat_arr)
    # Shape
    if len(pred_arr) != len(feat_arr):
        msg = '{0} The shape of `pred_arr` ({1}) and `feat_arr` ({2}) must '
        msg += 'have the same length'
        msg = msg.format(file_msg, len(pred_arr), len(feat_arr))
        raise LSSUtils_Error(msg)
    ##
    ## Rescaling Dataset
    feat_arr_scaled = data_preprocessing(feat_arr,
                                         pre_opt=pre_opt,
                                         reshape=reshape)
    ##
    ## Splitting into `Training` and `Testing` datasets.
    # Every split uses the same options (and hence the same seed), so
    # that the scaled, non-scaled and index splits line up row by row.
    split_kwargs = {
        'test_size': test_size,
        'shuffle': shuffle_opt,
        'random_state': random_state,
    }
    # Scaled
    (X_train, X_test, Y_train,
     Y_test) = skms.train_test_split(feat_arr_scaled,
                                     pred_arr,
                                     **split_kwargs)
    # Not-scaled
    (X_train_ns, X_test_ns, Y_train_ns,
     Y_test_ns) = skms.train_test_split(feat_arr,
                                        pred_arr,
                                        **split_kwargs)
    ##
    ## Assigning `training` and `testing` datasets to dictionaries
    train_dict = {
        'X_train': X_train,
        'Y_train': Y_train,
        'X_train_ns': X_train_ns,
        'Y_train_ns': Y_train_ns
    }
    test_dict = {
        'X_test': X_test,
        'Y_test': Y_test,
        'X_test_ns': X_test_ns,
        'Y_test_ns': Y_test_ns
    }
    # Returning indices if necessary
    if return_idx:
        # Splitting to `training` and `testing`
        (X_train_idx, X_test_idx, Y_train_idx,
         Y_test_idx) = skms.train_test_split(feat_arr_idx,
                                             pred_arr_idx,
                                             **split_kwargs)
        # Indices of features and predictions must agree
        if not (np.array_equal(X_train_idx, Y_train_idx)
                and np.array_equal(X_test_idx, Y_test_idx)):
            # The message is formatted so that the `{0}` placeholder is
            # replaced by the program message instead of shown verbatim.
            msg = '{0} Index arrays are not equal to each other!'.format(
                file_msg)
            raise LSSUtils_Error(msg)
        # Adding 'indices' to dictionaries
        train_dict['train_idx'] = X_train_idx
        test_dict['test_idx'] = X_test_idx

    return train_dict, test_dict