def test_transform_unseen(self, X, y, handle_unseen, expected):
     """Check ``transform`` output after a category is replaced post-fit.

     The encoder is fitted on ``X``/``y`` for the single column ``'cat'``
     using the given ``handle_unseen`` policy.  The first cell of the frame
     is then overwritten with ``'foo'`` (presumably a value absent from the
     fitted data -- confirm against the test parameters) and the transformed
     output is compared with ``expected`` to three decimals.
     """
     encoder = WeightOfEvidenceEncoder(cols=['cat'], handle_unseen=handle_unseen)
     frame = pd.DataFrame(X, columns=['cat'])
     encoder.fit(frame, pd.Series(y))

     # Change the data only after fitting so that transform sees the new value.
     frame.iloc[0, 0] = 'foo'

     assert_array_almost_equal(
         encoder.transform(frame), pd.DataFrame(expected), decimal=3)
 def test_encode_nans(self, X, y, expected, columns):
     """Check ``fit_transform`` output and the stored mapping for ``'cat'``.

     Besides comparing the encoded values with ``expected`` (three decimals),
     the test inspects ``_mapping['cat']``: it must be a DataFrame whose first
     index entry is ``-99999`` (presumably the placeholder for missing values
     -- confirm), whose remaining index entries equal ``columns``, and whose
     columns are ``pos``, ``count``, ``neg`` and ``value``.
     """
     encoder = WeightOfEvidenceEncoder(cols=['cat'])
     frame = pd.DataFrame(X, columns=['cat'])
     encoded = encoder.fit_transform(frame, pd.Series(y))
     assert_array_almost_equal(encoded, pd.DataFrame(expected), decimal=3)

     ok_('cat' in encoder._mapping)
     # Look the table up only after membership has been asserted.
     table = encoder._mapping['cat']
     ok_(isinstance(table, pd.DataFrame))
     eq_(table.index[0], -99999)
     assert_array_equal(table.index[1:], columns)
     assert_array_equal(table.columns, ['pos', 'count', 'neg', 'value'])
Ejemplo n.º 3
0
 def woe_encoding(cls, X, Y=None, encoder=None):
     """Encode ``some_id`` and ``other_id`` with a WeightOfEvidenceEncoder.

     Args:
         X: Frame containing the ``some_id`` and ``other_id`` columns.
         Y: Target passed to ``encoder.fit``; only used when a new encoder
             is created.
         encoder: Encoder to reuse.  When ``None``, a new one is created
             with ``min_samples=5`` and fitted on ``X``/``Y``.

     Returns:
         A ``(frame, encoder)`` tuple.  ``frame`` is the two original id
         columns concatenated column-wise with the transformed output, in
         which the id columns are renamed to ``woe_enc_<name>``.
     """
     cols = ['some_id', 'other_id']

     if encoder is None:
         encoder = WeightOfEvidenceEncoder(cols=cols, min_samples=5)
         encoder.fit(X, Y)

     transformed = encoder.transform(X)
     renames = {name: 'woe_enc_' + name for name in cols}
     transformed = transformed.rename(columns=renames)

     combined = pd.concat([X[cols], transformed], axis=1)
     return combined, encoder
 def test_encode_multiple_cols(self, X, y, expected):
     """Check encoding of two columns and the per-column mapping tables.

     The encoded output is compared with ``expected`` to two decimals.  Each
     of ``cat1`` and ``cat2`` must then have a DataFrame entry in
     ``_mapping`` with the expected index values and the columns ``pos``,
     ``count``, ``neg`` and ``value``.
     """
     encoder = WeightOfEvidenceEncoder(cols=['cat1', 'cat2'])
     frame = pd.DataFrame(X, columns=['cat1', 'cat2'])
     encoded = encoder.fit_transform(frame, pd.Series(y))
     assert_array_almost_equal(encoded, pd.DataFrame(expected), decimal=2)

     # Checks are grouped by kind (not by column) so that they run in the
     # same order as individually written assertions would.
     expected_index = (('cat1', ['a', 'b']), ('cat2', ['bar', 'foo']))
     for name, _ in expected_index:
         ok_(name in encoder._mapping)
     for name, _ in expected_index:
         ok_(isinstance(encoder._mapping[name], pd.DataFrame))
     for name, categories in expected_index:
         assert_array_equal(encoder._mapping[name].index, categories)
     for name, _ in expected_index:
         assert_array_equal(encoder._mapping[name].columns,
                            ['pos', 'count', 'neg', 'value'])
 def test_init(self, kwargs, cols, handle_unseen, min_samples):
     """Check the attributes of an encoder built from ``kwargs``.

     ``cols``, ``handle_unseen`` and ``min_samples`` must match the given
     expectations; ``_imputed`` must be ``0`` and ``_mapping`` an empty dict.
     """
     encoder = WeightOfEvidenceEncoder(**kwargs)
     wanted = (
         ('cols', cols),
         ('handle_unseen', handle_unseen),
         ('min_samples', min_samples),
         ('_imputed', 0),
         ('_mapping', {}),
     )
     # Attributes are read one at a time, immediately before each comparison.
     for attribute, value in wanted:
         eq_(getattr(encoder, attribute), value)
 def test_transform_error(self, X, y, expected):
     """Check that ``transform`` raises ``ValueError`` with ``handle_unseen='error'``.

     The encoder is fitted first; the first cell of the frame is then
     overwritten with ``'foo'`` (presumably a value absent from the fitted
     data -- confirm against the test parameters) before transforming.
     ``expected`` is accepted but not used by this test.
     """
     encoder = WeightOfEvidenceEncoder(cols=['cat'], handle_unseen='error')
     frame = pd.DataFrame(X, columns=['cat'])
     encoder.fit(frame, pd.Series(y))

     # Change the data only after fitting so that transform sees the new value.
     frame.iloc[0, 0] = 'foo'

     assert_raises(ValueError, encoder.transform, frame)
 def test_transform_before_fit(self):
     """Check that ``transform`` on a never-fitted encoder raises ``ValueError``."""
     unfitted = WeightOfEvidenceEncoder()
     # The argument value (1) is arbitrary input for the call under test.
     assert_raises(ValueError, unfitted.transform, 1)
Ejemplo n.º 8
0
                      columns=['sub_grade',
                               'zip_code',
                               'addr_state'])

# Build the binary target: 1 when ``loan_status`` is one of the statuses
# listed below, 0 otherwise.
dumy_df = pd.DataFrame()
dumy_df['Default_Binary'] = (
    df['loan_status']
    .isin([
        'Default',
        'Charged Off',
        'Late (31-120 days)',
        'Does not meet the credit policy. Status:Charged Off',
    ])
    .astype(int)
)
y = pd.Series(dumy_df['Default_Binary'])

# Fit the weight-of-evidence encoder on the three categorical columns
# against the target ``y`` and encode ``df_woe`` in the same step.
encoder = WeightOfEvidenceEncoder(
    cols=['sub_grade', 'zip_code', 'addr_state'],
)
df_woe_1 = encoder.fit_transform(df_woe, y)

# Assemble the encoded dataset: place the two encoded frames side by side
# (column-wise), keeping the union of their row labels.
frames = [encoded_df_2, df_woe_1]
encoded_data = pd.concat(
    frames,
    axis=1,
    join='outer',
    copy=False,
    sort=False,
)

# Frames from which the used column names are extracted.
frames = [df_encoder, df_woe]