Ejemplo n.º 1
0
    def test_RSI_wrong_type(self):
        """
        Negative test

        data: plain Python list (unsupported type)

        Verifies that random_sample_imputation rejects data that is passed
        as a list by raising a TypeError.
        """
        # 1. Arrange: a list containing one missing value
        unsupported = [2, 4, np.nan, 1]
        # 2. Act & 3. Assert: the call itself must raise
        self.assertRaises(TypeError, random_sample_imputation, unsupported)
Ejemplo n.º 2
0
    def test_RSI_series_inplace(self):
        """
        Positive test

        data: Correct Series (example series)

        Verifies that, after an in-place random_sample_imputation, the
        example series no longer contains any NA value.
        """
        # 1. Arrange
        series = generate_example_series()
        # 2. Act: inplace=True, so the series itself is modified
        random_sample_imputation(series, inplace=True)
        # 3. Assert
        remaining_na = series.isna().sum()
        self.assertEqual(remaining_na, 0)
Ejemplo n.º 3
0
    def test_RSI_df_inplace(self):
        """
        Positive test

        data: Correct dataframe (divcols)

        Verifies that, after an in-place random_sample_imputation over the
        whole dataframe, exactly 10 NA values are left in it.
        """
        # 1. Arrange
        frame = generate_example_df_divcols()
        # 2. Act: inplace=True, so the dataframe itself is modified
        random_sample_imputation(frame, inplace=True)
        # 3. Assert: count the NA values across every column
        remaining_na = frame.isna().sum().sum()
        self.assertEqual(remaining_na, 10)
Ejemplo n.º 4
0
    def test_NOCB_df_inplace_wrong_column(self):
        """
        Negative test

        data: Correct dataframe (divcols)
        columns: ['f', 'g', 'z'] ('z' doesn't exist in the data)

        Verifies that random_sample_imputation raises a ValueError when one
        of the requested columns is not present in the data.

        NOTE(review): the method name says "NOCB" although the function
        under test is random_sample_imputation -- presumably a copy-paste
        leftover; confirm before renaming.
        """
        # 1. Arrange
        frame = generate_example_df_divcols()
        # 2. Act & 3. Assert: the call itself must raise
        self.assertRaises(
            ValueError,
            random_sample_imputation,
            frame,
            columns=['f', 'g', 'z'],
            inplace=True,
        )
Ejemplo n.º 5
0
    def test_RSI_col_for_series(self):
        """
        Negative test

        data: Correct series (example_series)
        columns: ['a'] (series can't have columns)

        Verifies that random_sample_imputation raises a ValueError when the
        columns argument is supplied together with a series.
        """
        # 1. Arrange
        series = generate_example_series()
        # 2. Act & 3. Assert: the call itself must raise
        self.assertRaises(
            ValueError, random_sample_imputation, series, columns=['a'])
Ejemplo n.º 6
0
    def test_RSI_df_inplace_columns(self):
        """
        Positive test

        data: Correct dataframe (divcols)
        columns: ['f', 'g']

        Verifies that, after an in-place random_sample_imputation limited to
        the specified columns, exactly 14 NA values are left in the
        dataframe.
        """
        # 1. Arrange
        frame = generate_example_df_divcols()
        selected = ['f', 'g']
        # 2. Act: only the selected columns are passed for imputation
        random_sample_imputation(frame, columns=selected, inplace=True)
        # 3. Assert: count the NA values across every column
        remaining_na = frame.isna().sum().sum()
        self.assertEqual(remaining_na, 14)
Ejemplo n.º 7
0
def mice_one_imputation(data):
    """Run a single MICE imputation pass over a copy of the data.

    The columns that contain missing values are visited in a random order.
    Each of them is first filled provisionally (mean substitution for
    numeric columns, random sample imputation otherwise). Afterwards, one
    column at a time, the originally missing cells are blanked again and
    re-imputed with linear_regression (numeric columns, using every numeric
    column as predictor) or logistic_regression (other columns).

    :param data: The data on which to perform the imputation.
    :type data: pandas.DataFrame
    :return: A new dataframe with one MICE imputation performed; the
        dataframe passed in is left untouched.
    :rtype: pandas.DataFrame
    """
    # Remember where the gaps were before anything gets filled in:
    missing = pd.isna(data)
    # All the work happens on a copy, never on the caller's dataframe:
    imputed = data.copy()
    # Columns to impute, visited in random order:
    targets = [name for name in data.columns if data[name].isna().any()]
    shuffle(targets)
    # First pass: provisional fill so that every column is complete.
    for name in targets:
        if not is_numeric_dtype(data[name]):
            random_sample_imputation(imputed, columns=[name], inplace=True)
            continue
        mean_substitution(imputed, columns=[name], inplace=True)
    # Numeric columns serve as predictors for the linear regressions:
    numeric_columns = [
        name for name in data.columns if is_numeric_dtype(data[name])
    ]
    # Second pass: restore the original gaps of one column and re-impute it.
    for name in targets:
        gaps = missing[name]
        if not is_numeric_dtype(data[name]):
            imputed.loc[gaps, name] = None
            logistic_regression(imputed, name, inplace=True)
            continue
        imputed.loc[gaps, name] = np.nan
        linear_regression(
            imputed, name, predictors=numeric_columns, inplace=True)
    return imputed
Ejemplo n.º 8
0
    def test_RSI_series_returning(self):
        """
        Positive test

        data: Correct Series (example series)

        Verifies that the original series still holds its 3 NA values after
        the call (it is not modified) and that the returned series holds
        none.
        """
        # 1. Arrange
        original = generate_example_series()
        # 2. Act: no inplace argument, so the result is returned
        imputed = random_sample_imputation(original)
        # 3. Assert
        na_before = original.isna().sum()
        self.assertEqual(na_before, 3)
        na_after = imputed.isna().sum()
        self.assertEqual(na_after, 0)
Ejemplo n.º 9
0
    def test_RSI_df_returning(self):
        """
        Positive test

        data: Correct dataframe (divcols)

        Verifies that the original dataframe still holds its 18 NA values
        after the call (it is not modified) and that the returned dataframe
        holds 10, i.e. 8 fewer.
        """
        # 1. Arrange
        original = generate_example_df_divcols()
        # 2. Act: no inplace argument, so the result is returned
        imputed = random_sample_imputation(original)
        # 3. Assert: count the NA values across every column
        na_before = original.isna().sum().sum()
        self.assertEqual(na_before, 18)
        na_after = imputed.isna().sum().sum()
        self.assertEqual(na_after, 10)