Ejemplo n.º 1
0
    def test_LR_wrong_type(self):
        """
        Negative test

        data: list (unsupported type)

        Passing a plain Python list as the data must make
        linear_regression() raise a TypeError.
        """
        # 1. Arrange
        unsupported = [2, 4, np.nan, 1]
        # 2. Act & 3. Assert (callable form of assertRaises)
        self.assertRaises(TypeError, linear_regression, unsupported)
Ejemplo n.º 2
0
    def test_LR_wrong_regressions(self):
        """
        Negative test

        data: Correct data frame (sales)
        regressions: 'z' (not a valid value)

        An invalid value for the parameter regressions must make
        linear_regression() raise a ValueError.
        """
        # 1. Arrange
        sales = generate_df_sales()
        # 2. Act & 3. Assert (callable form of assertRaises; the arguments
        # are forwarded positionally, exactly as in a direct call)
        self.assertRaises(
            ValueError,
            linear_regression, sales, 'sales', ['advertising', 'year'], 'z')
Ejemplo n.º 3
0
    def test_LR_wrong_predictor(self):
        """
        Negative test

        data: Correct data frame (sales)
        predictors: ['advertising', 'z'] ('z' is not a column of sales)

        A predictor name that is not a column of the data must make
        linear_regression() raise a ValueError.
        """
        # 1. Arrange
        sales = generate_df_sales()
        predictors = ['advertising', 'z']
        # 2. Act & 3. Assert
        with self.assertRaises(ValueError):
            linear_regression(sales, 'sales', predictors)
Ejemplo n.º 4
0
    def test_LR_wrong_dependent(self):
        """
        Negative test

        data: Correct data frame (sales)
        dependent: 'z' (not a column of sales)

        A dependent variable name that is not a column of the data must make
        linear_regression() raise a ValueError.
        """
        # 1. Arrange
        sales = generate_df_sales()
        missing_column = 'z'
        # 2. Act & 3. Assert
        with self.assertRaises(ValueError):
            linear_regression(sales, missing_column, ['advertising', 'year'])
Ejemplo n.º 5
0
    def test_LR_inplace(self):
        """
        Positive test

        data: Correct data frame (sales)

        The column 'sales' of the data frame sales starts with 4 NA values,
        of which linear_regression() is expected to impute 3.

        Runs the imputation with inplace=True and checks that exactly 1 NA
        value is left in the column 'sales' of the same data frame.
        """
        # 1. Arrange
        sales = generate_df_sales()
        # 2. Act
        linear_regression(
            sales, 'sales', ['advertising', 'year'], inplace=True)
        # 3. Assert
        remaining_na = sales['sales'].isna().sum()
        self.assertEqual(remaining_na, 1)
Ejemplo n.º 6
0
def mice_one_imputation(data):
    """Auxiliary function that runs a single MICE imputation pass, visiting
    the columns that have missing values in a random order.

    The input is never modified; all work happens on a copy.

    :param data: The data on which to perform the imputation.
    :type data: pandas.DataFrame
    :return: A copy of the data with one MICE imputation performed.
    :rtype: pandas.DataFrame
    """
    # Remember where the original gaps are before anything gets filled:
    na_mask = pd.isna(data)
    # Work on a copy; the caller's frame stays untouched:
    imputed = data.copy()
    # Columns that need imputing, visited in a random order:
    pending = [name for name in data.columns if data[name].isna().any()]
    shuffle(pending)
    # Initial fill: mean substitution for numeric columns, random sample
    # imputation for the others:
    for name in pending:
        if is_numeric_dtype(data[name]):
            initial_fill = mean_substitution
        else:
            initial_fill = random_sample_imputation
        initial_fill(imputed, columns=[name], inplace=True)
    # Every numeric column is offered as a predictor for the regressions.
    # NOTE(review): this list also contains the column currently being
    # imputed when that column is numeric -- confirm that
    # linear_regression() copes with the dependent being among predictors.
    numeric_columns = [
        name for name in data.columns if is_numeric_dtype(data[name])]
    # Re-open the original gaps of one column at a time and impute them
    # with a regression:
    for name in pending:
        originally_missing = na_mask[name]
        if not is_numeric_dtype(data[name]):
            imputed.loc[originally_missing, name] = None
            logistic_regression(imputed, name, inplace=True)
            continue
        imputed.loc[originally_missing, name] = np.nan
        linear_regression(
            imputed, name, predictors=numeric_columns, inplace=True)
    return imputed
Ejemplo n.º 7
0
    def test_LR_returning(self):
        """
        Positive test

        data: Correct data frame (sales)

        The column 'sales' of the data frame sales starts with 4 NA values,
        of which linear_regression() is expected to impute 3.

        Calls the function without inplace and checks that the original
        data frame still has its 4 NA values while the returned data frame
        has 1 NA value left in the column 'sales'.
        """
        # 1. Arrange
        original = generate_df_sales()
        # 2. Act
        imputed = linear_regression(
            original, 'sales', ['advertising', 'year'])
        # 3. Assert
        na_in_original = original['sales'].isna().sum()
        self.assertEqual(na_in_original, 4)
        na_in_result = imputed['sales'].isna().sum()
        self.assertEqual(na_in_result, 1)
Ejemplo n.º 8
0
    def test_LR_all_columns(self):
        """
        Positive test

        data: Correct data frame (sales)
        dependent: None

        The data frame sales starts with 8 NA values in total, of which
        linear_regression() is expected to impute 5.

        Calls the function with only the data and checks that the original
        data frame still has its 8 NA values while the returned data frame
        has 3 NA values left.
        """
        # 1. Arrange
        original = generate_df_sales()
        # 2. Act
        imputed = linear_regression(original)
        # 3. Assert
        na_in_original = original.isna().sum().sum()
        self.assertEqual(na_in_original, 8)
        na_in_result = imputed.isna().sum().sum()
        self.assertEqual(na_in_result, 3)