Ejemplo n.º 1
0
    def test_cleancategorical_replacemissingnewcategory_noparams(self):
        """Call ``replace_missing_new_category`` with no arguments.

        The last row of the fixture is entirely NaN. After the call the test
        expects ``-1`` in the two numeric columns and ``"Unknown"`` in the
        string column, with the populated rows left as they were.
        """
        frame = pd.DataFrame(
            [[1.0, "Green", 2], [1.0, "Other", 1], [np.nan, np.nan, np.nan]],
            columns=["col1", "col2", "col3"],
        )
        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)

        cleaner.replace_missing_new_category()

        expected_rows = [
            [1, "Green", 2],
            [1, "Other", 1],
            [-1, "Unknown", -1],
        ]
        self.assertListEqual(cleaner.x_train.values.tolist(), expected_rows)
Ejemplo n.º 2
0
    def test_cleancategorical_replacemissingnewcategory_dict(self):
        """Fill missing values using a per-column replacement mapping.

        ``col_mapping`` maps each column name to the value the test expects
        to find wherever that column originally held NaN.
        """
        frame = pd.DataFrame(
            [[1, "Green", 2], [1, np.nan, 1], [np.nan, np.nan, np.nan]],
            columns=["col1", "col2", "col3"],
        )
        replacements = {"col1": 2, "col2": "Blue", "col3": 4}
        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)

        cleaner.replace_missing_new_category(col_mapping=replacements)

        expected_rows = [
            [1.0, "Green", 2.0],
            [1.0, "Blue", 1.0],
            [2.0, "Blue", 4.0],
        ]
        self.assertListEqual(cleaner.x_train.values.tolist(), expected_rows)
Ejemplo n.º 3
0
    def test_cleancategorical_replacemissingnewcategory_list_constantnotnone(
            self):
        """Fill only the listed columns with an explicit constant.

        ``col1`` and ``col3`` are passed in ``list_of_cols`` with
        ``new_category=0``; the test expects those two to become ``0`` in the
        last row while the unlisted ``col2`` still holds ``None``.
        """
        raw_rows = np.array([
            (1, "Green", 2),
            (1, "Other", 1),
            (None, None, None),
        ])
        frame = pd.DataFrame(raw_rows, columns=["col1", "col2", "col3"])
        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)

        cleaner.replace_missing_new_category(
            list_of_cols=["col1", "col3"],
            new_category=0,
        )

        # The expectation is built through NumPy too, so both sides of the
        # comparison come out of the same array-to-list conversion.
        expected_rows = np.array([
            (1, "Green", 2),
            (1, "Other", 1),
            (0, None, 0),
        ]).tolist()
        self.assertListEqual(cleaner.x_train.values.tolist(), expected_rows)
Ejemplo n.º 4
0
    def test_cleancategorical_replacemissingnewcategory_list_constantisnone(
            self):
        """Fill only the listed columns when no constant is supplied.

        ``col1`` and ``col2`` are listed, so the test expects ``-1`` and
        ``"Unknown"`` there; ``col3`` is not listed and should still be NaN
        after the call.
        """
        frame = pd.DataFrame(
            [[1.0, "Green", 2], [1.0, "Other", 1], [np.nan, None, np.nan]],
            columns=["col1", "col2", "col3"],
        )
        cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)

        cleaner.replace_missing_new_category(list_of_cols=["col1", "col2"])

        # NaN does not compare equal to NaN, so an equality-based list
        # assertion could never match it. Swap any NaN that is still present
        # for a marker string before comparing.
        marked = cleaner.x_train.fillna("NaN was here")
        cleaner._data_properties.x_train = marked

        expected_rows = [
            [1, "Green", 2.0],
            [1, "Other", 1.0],
            [-1, "Unknown", "NaN was here"],
        ]
        self.assertListEqual(cleaner.x_train.values.tolist(), expected_rows)
Ejemplo n.º 5
0
    def test_report_cleaning_new_category(self):
        """Check that the cleaning report mentions every column in the data.

        A ``Clean`` object is created with ``report_name="test"`` and
        ``replace_missing_new_category`` is run on a frame whose last row is
        entirely NaN. The file at ``clean._data_properties.report.filename``
        is then read and must contain the name of each of the three columns.
        """
        missing_data = [[1.0, "Green", 2], [1.0, "Other", 1],
                        [np.nan, np.nan, np.nan]]

        columns = ["col1", "col2", "col3"]
        data = pd.DataFrame(missing_data, columns=columns)

        clean = Clean(x_train=data,
                      test_split_percentage=0.5,
                      split=False,
                      report_name="test")
        clean.replace_missing_new_category()

        report_path = clean._data_properties.report.filename
        try:
            with open(report_path) as f:
                content = f.read()
        finally:
            # Remove the report even when reading it fails, so a broken run
            # does not leave a stale file behind for later tests.
            if os.path.exists(report_path):
                os.remove(report_path)

        # One assertion per column so a failure names the missing column.
        for column in columns:
            self.assertIn(column, content)