def prepare(self, data):
        """Build the forest-fires ETL frame with transformed and dummy columns.

        The frame returned by the parent ``prepare`` is checked with
        ``self.input_validator``, extended with the derived columns below,
        checked again with ``self.output_validator`` and returned.

        Derived columns:
            * ``area_log``  -- ``log1p`` of the ``area`` target.
            * ``FFMC_log``, ``ISI_log``, ``rain_log`` -- ``log1p`` of the
              corresponding feature columns.
            * ``rain_cat``  -- ``uint8`` flag, 1 where ``rain`` > 0, else 0.
            * the columns produced by ``dummify`` for ``month`` and ``day``.

        Args:
            data: Raw input forwarded unchanged to the parent ``prepare``.

        Returns:
            The frame carrying the original and the derived columns.
        """
        df = super(ForestFiresPreparerETL, self).prepare(data)

        self.input_validator.validate(df)

        # Target Transformations
        df['area_log'] = np.log1p(df['area'])

        # Feature Transformations
        df['FFMC_log'] = np.log1p(df['FFMC'])
        df['ISI_log'] = np.log1p(df['ISI'])
        df['rain_log'] = np.log1p(df['rain'])
        df['rain_cat'] = (df['rain'] > 0).astype(np.uint8)

        # Categories are listed explicitly so the dummify call always receives
        # the full calendar, whatever values occur in the data.
        df = dummify(df, 'month', [
            'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep',
            'oct', 'nov', 'dec'
        ])
        df = dummify(df, 'day',
                     ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])

        self.output_validator.validate(df)

        return df
Esempio n. 2
0
    def prepare(self, data):
        """Build the abalone ETL frame: validate, add ``age``, dummify ``sex``.

        Args:
            data: Raw input forwarded unchanged to the parent ``prepare``.

        Returns:
            The result of ``dummify`` applied to the ``sex`` column of the
            validated frame, which by then also carries the ``age`` column.
        """
        frame = super(AbalonePreparerETL, self).prepare(data)
        self.input_validator.validate(frame)

        # ``age`` is the ring count shifted by a constant 1.5.
        ring_counts = frame['rings']
        frame['age'] = ring_counts + 1.5

        sex_categories = ['M', 'F', 'I']
        return dummify(frame, 'sex', sex_categories)
    def prepare(self, data):
        """Return a copy of the forest-fires model features.

        Adds ``rain_cat`` (``uint8`` flag, 1 where ``rain`` > 0) and
        ``ISI_log`` (``log1p`` of ``ISI``) to the frame returned by the parent
        ``prepare``, runs ``dummify`` on ``month`` and ``day``, then keeps
        only the selected feature columns.

        Args:
            data: Raw input forwarded unchanged to the parent ``prepare``.

        Returns:
            A copy of the frame restricted to the selected columns, in the
            order they are listed below.
        """
        frame = super(ForestFiresPreparer, self).prepare(data)

        it_rained = frame['rain'] > 0
        frame['rain_cat'] = it_rained.astype(np.uint8)
        frame['ISI_log'] = np.log1p(frame['ISI'])

        month_categories = [
            'jan', 'feb', 'mar', 'apr', 'may', 'jun',
            'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        ]
        day_categories = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
        frame = dummify(frame, 'month', month_categories)
        frame = dummify(frame, 'day', day_categories)

        # Only a subset of the month/day indicator columns is selected:
        # 'jul', 'tue' and 'wed' are deliberately absent from the lists below.
        base_columns = [
            'X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI_log', 'temp', 'RH', 'wind',
            'rain_cat',
        ]
        month_columns = [
            'apr', 'aug', 'dec', 'feb', 'jan', 'jun', 'mar', 'may', 'nov',
            'oct', 'sep',
        ]
        day_columns = ['fri', 'mon', 'sat', 'sun', 'thu']

        chosen = base_columns + month_columns + day_columns
        return frame[chosen].copy()
 def test_includes_dummy_na(self):
     """With ``dummy_na=True`` the result has a column per category and for None."""
     raw_values = ['a', 'b', 'a', None]
     frame = pd.DataFrame(raw_values, columns=['category'])

     dummified = dummify(frame,
                         'category',
                         categories=['a', 'b', 'c'],
                         dummy_na=True)

     # 'c' never occurs in the data but is listed explicitly; None stands for
     # the missing-value column requested through dummy_na.
     for expected_column in ('a', 'b', 'c', None):
         assert expected_column in dummified, "New column should be added"
Esempio n. 5
0
    def prepare(self, data):
        """Return a copy of the abalone model features.

        Runs ``dummify`` on the ``sex`` column of the frame returned by the
        parent ``prepare`` and keeps the measurement columns plus the ``M``
        and ``F`` columns.

        Args:
            data: Raw input forwarded unchanged to the parent ``prepare``.

        Returns:
            A copy of the frame restricted to the selected columns.
        """
        frame = super(AbalonePreparer, self).prepare(data)
        frame = dummify(frame, 'sex', ['M', 'F', 'I'])

        measurement_columns = [
            'length', 'diameter', 'height', 'whole_weight', 'shucked_weight',
            'viscera_weight', 'shell_weight',
        ]
        # 'I' is passed to dummify above but is not part of the selection.
        sex_columns = ['M', 'F']

        return frame[measurement_columns + sex_columns].copy()
 def test_includes_columns_explicitly(self):
     """Every explicitly listed category must show up as a column.

     ``df`` is resolved from outside this method (not visible here).
     """
     requested = ['a', 'b', 'c']
     dummified = dummify(df, 'category', categories=requested)

     for expected_column in ('a', 'b', 'c'):
         assert expected_column in dummified, "New column should be added"
 def test_dummy_values(self):
     """The dummified rows must equal the expected category/indicator rows.

     ``df`` is resolved from outside this method (not visible here).
     """
     dummified = dummify(df, 'category', categories=['a', 'b'])

     # Each row: original category value, then the 'a' and 'b' indicators.
     expected_rows = [['a', 1, 0], ['b', 0, 1], ['a', 1, 0]]
     actual_rows = dummified.values.tolist()

     assert np.array_equal(actual_rows, expected_rows)
 def test_new_number_of_columns(self):
     """Dummifying two categories must leave the result with three columns.

     ``df`` is resolved from outside this method (not visible here).
     """
     dummified = dummify(df, 'category', categories=['a', 'b'])

     column_count = len(dummified.columns)
     assert column_count == 3, "New columns should be concatenated"
 def test_raise_datatype_error(self):
     """Passing ``None`` instead of a frame must raise the documented TypeError."""
     expected_failure = pytest.raises(TypeError,
                                      match="df must be a DataFrame")
     not_a_frame = None

     with expected_failure:
         dummify(not_a_frame, 'category', categories=['a', 'b'])