def test_leave_one_out_encoding():
    """Exercise the 'leave_one_out' method end to end.

    Three things are checked:
    1. ``Encoder.fit_transform`` (target: the ``value`` column) yields the
       expected matrix, to one decimal place.
    2. A ``LeaveOneOutEnc`` primitive built from the fitted encoder maps raw
       ``product_id`` values to the expected numbers.
    3. ``enc.get_features()`` equals the hand-built feature list, and
       recomputing those features gives the expected matrix.
    """
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()

    enc = Encoder(method='leave_one_out')
    fm_encoded = enc.fit_transform(feature_matrix, features, feature_matrix['value'])

    # Expected values are written one column per inner list and then
    # flipped so that they line up with the row-major ``.values`` array.
    expected_fit = np.swapaxes(
        [
            [7.50001, 5.00001, 2.50001, 20.00001, 15.00001, 8.33333],
            [True, True, True, True, True, True],
            [0.00001, 5.00001, 10.00001, 15.00001, 20.00001, 0.00001],
            [12.50001, 11.250001, 10.00001, 8.750001, 7.50001, 8.33333],
        ],
        0,
        1,
    )
    np.testing.assert_almost_equal(fm_encoded.values, expected_fit, decimal=1)

    # Apply the primitive directly to raw category values.
    product_primitive = LeaveOneOutEnc(fitted_encoder=enc, category='product_id')
    np.testing.assert_almost_equal(
        product_primitive(['car', 'toothpaste', 'coke zero', 'coke zero']),
        [17.5, 8.33333, 5, 5],
        decimal=4,
    )

    # The encoder should expose the same features we can build by hand.
    expected_features = [
        ft.Feature([f1], primitive=LeaveOneOutEnc(enc, 'product_id')),
        f2,
        f3,
        ft.Feature([f4], primitive=LeaveOneOutEnc(enc, 'countrycode')),
    ]
    assert expected_features == enc.get_features()

    feature_matrix_new = ft.calculate_feature_matrix(
        enc.get_features(), es, instance_ids=ids
    )
    expected_recomputed = np.swapaxes(
        [
            [5.00001, 5.00001, 5.00001, 17.50001, 17.50001, 8.33333],
            [True, True, True, True, True, True],
            [0.00001, 5.00001, 10.00001, 15.00001, 20.00001, 0.00001],
            [10.00001, 10.00001, 10.00001, 10.00001, 10.00001, 8.33333],
        ],
        0,
        1,
    )
    np.testing.assert_almost_equal(
        feature_matrix_new.values, expected_recomputed, decimal=1
    )
def test_ordinal_encoding():
    """Exercise the 'ordinal' method end to end.

    Checks the ``OrdinalEnc`` primitive on raw ``product_id`` values, that
    ``enc.get_features()`` equals the hand-built feature list, and that
    recomputing those features reproduces the ``fit_transform`` output.
    """
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()

    enc = Encoder(method='ordinal')
    fm_encoded = enc.fit_transform(feature_matrix, features)

    # Apply the primitive directly to raw category values.
    product_primitive = OrdinalEnc(fitted_encoder=enc, category='product_id')
    codes = product_primitive(['car', 'toothpaste', 'coke zero', 'coke zero'])
    assert (codes == [2, 3, 1, 1]).all()

    # Here the primitives are built with integer categories (0 and 1).
    expected_features = [
        ft.Feature([f1], primitive=OrdinalEnc(enc, 0)),
        f2,
        f3,
        ft.Feature([f4], primitive=OrdinalEnc(enc, 1)),
    ]
    assert expected_features == enc.get_features()

    feature_matrix_new = ft.calculate_feature_matrix(
        enc.get_features(), es, instance_ids=ids
    )
    matches = fm_encoded == feature_matrix_new
    assert matches.all().all()
def test_target_encoding():
    """Exercise the 'target' method end to end.

    The encoder is fitted against the ``value`` column. The test checks the
    ``TargetEnc`` primitive on raw ``product_id`` values (to five decimal
    places), that ``enc.get_features()`` equals the hand-built feature list,
    and that recomputing those features reproduces the ``fit_transform``
    output.
    """
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()

    enc = Encoder(method='target')
    fm_encoded = enc.fit_transform(feature_matrix, features, feature_matrix['value'])

    # Apply the primitive directly to raw category values.
    product_primitive = TargetEnc(fitted_encoder=enc, category='product_id')
    np.testing.assert_almost_equal(
        product_primitive(['car', 'toothpaste', 'coke zero', 'coke zero']),
        [15.034704, 8.333333, 5.397343, 5.397343],
        decimal=5,
    )

    # The encoder should expose the same features we can build by hand.
    expected_features = [
        ft.Feature([f1], primitive=TargetEnc(enc, 'product_id')),
        f2,
        f3,
        ft.Feature([f4], primitive=TargetEnc(enc, 'countrycode')),
    ]
    assert expected_features == enc.get_features()

    feature_matrix_new = ft.calculate_feature_matrix(
        enc.get_features(), es, instance_ids=ids
    )
    matches = fm_encoded == feature_matrix_new
    assert matches.all().all()
def test_hashing_encoding():
    """Exercise the 'hashing' method end to end.

    Checks the ``HashingEnc`` primitive on raw values, that the features
    returned by ``enc.get_features()`` carry the same unique names as the
    hand-built list, and that recomputing those features reproduces the
    ``fit_transform`` output.
    """
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()

    enc = Encoder(method='hashing')
    fm_encoded = enc.fit_transform(feature_matrix, features)

    # Apply the primitive directly to raw category values; the expected
    # output has one inner list per output column.
    encoder = HashingEnc(fitted_encoder=enc)
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero'])
    encoded_results = [
        [0, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
    ]
    assert (encoded == encoded_results).all()

    product_feature = ft.Feature([f1], primitive=HashingEnc(enc))
    cc_feature = ft.Feature([f4], primitive=HashingEnc(enc))
    expected_features = [product_feature, f2, f3, cc_feature]

    # Fetch the encoder's features once instead of on every loop iteration.
    encoder_features = enc.get_features()
    assert len(expected_features) == len(encoder_features)
    # NOTE(review): features are compared by unique name rather than with
    # ``==``; presumably direct equality is not usable for these
    # multi-output features - confirm.
    for expected, actual in zip(expected_features, encoder_features):
        assert expected.unique_name() == actual.unique_name()

    feature_matrix = ft.calculate_feature_matrix(
        encoder_features, es, instance_ids=ids
    )
    assert (fm_encoded == feature_matrix).all().all()
def test_one_hot_encoding():
    """Exercise the 'one_hot' method end to end, including a NaN category.

    One ``countrycode`` entry is replaced with NaN before fitting. The test
    then checks the ``OneHotEnc`` primitive on raw values (for a string
    value and for NaN), that ``enc.get_features()`` carries the same unique
    names as the hand-built list, and that computing those features for
    instances 6 and 7 yields the expected frame.
    """
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()

    # One label-based write. Chained indexing (``df[col][0] = ...``) may
    # assign into a temporary copy and is a no-op under pandas Copy-on-Write.
    # Assumes the fixture index contains the label 0 - TODO confirm.
    feature_matrix.loc[0, 'countrycode'] = np.nan

    enc = Encoder(method='one_hot')
    # Only the fitting side effect is needed; the transformed matrix is not
    # used by this test.
    enc.fit_transform(feature_matrix, features)

    encoder = OneHotEnc(value='coke zero')
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero'])
    assert (encoded == [0, 0, 1, 1]).all()

    # A NaN value must match only the NaN input.
    encoder = OneHotEnc(value=np.nan)
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero', np.nan])
    assert (encoded == [0, 0, 0, 0, 1]).all()

    f1_1 = ft.Feature([f1], primitive=OneHotEnc('coke zero'))
    f1_2 = ft.Feature([f1], primitive=OneHotEnc('car'))
    f1_3 = ft.Feature([f1], primitive=OneHotEnc('toothpaste'))
    f4_1 = ft.Feature([f4], primitive=OneHotEnc('US'))
    f4_2 = ft.Feature([f4], primitive=OneHotEnc('AL'))
    f4_3 = ft.Feature([f4], primitive=OneHotEnc(np.nan))
    expected_features = [f1_1, f1_2, f1_3, f2, f3, f4_1, f4_2, f4_3]

    # Fetch the encoder's features once instead of on every loop iteration.
    encoder_features = enc.get_features()
    assert len(expected_features) == len(encoder_features)
    for expected, actual in zip(expected_features, encoder_features):
        assert expected.unique_name() == actual.unique_name()

    feature_matrix = ft.calculate_feature_matrix(
        encoder_features, es, instance_ids=[6, 7]
    )
    data = {
        'product_id = coke zero': [0, 0],
        'product_id = car': [0, 0],
        'product_id = toothpaste': [1, 1],
        'purchased': [True, True],
        'value': [1.0, 2.0],
        'countrycode = US': [0, 0],
        'countrycode = AL': [1, 1],
        'countrycode = nan': [0, 0],
    }
    expected_matrix = pd.DataFrame(data, index=[6, 7])
    assert feature_matrix.eq(expected_matrix).all().all()
def test_binary_encoding():
    """Exercise the 'binary' method end to end.

    Checks the ``BinaryEnc`` primitive on raw ``product_id`` values, that
    the features returned by ``enc.get_features()`` carry the same unique
    names as the hand-built list, and that recomputing those features
    reproduces the ``fit_transform`` output.
    """
    feature_matrix, features, f1, f2, f3, f4, es, ids = create_feature_matrix()

    enc = Encoder(method='binary')
    fm_encoded = enc.fit_transform(feature_matrix, features)

    # Apply the primitive directly to raw category values; the expected
    # output has one inner list per output column.
    encoder = BinaryEnc(fitted_encoder=enc, category='product_id')
    encoded = encoder(['car', 'toothpaste', 'coke zero', 'coke zero'])
    encoded_results = [
        [0, 0, 0, 0],
        [1, 1, 0, 0],
        [0, 1, 1, 1],
    ]
    assert (encoded == encoded_results).all()

    # Here the primitives are built with integer categories (0 and 1).
    product_feature = ft.Feature([f1], primitive=BinaryEnc(enc, 0))
    cc_feature = ft.Feature([f4], primitive=BinaryEnc(enc, 1))
    expected_features = [product_feature, f2, f3, cc_feature]

    # Fetch the encoder's features once instead of on every loop iteration.
    encoder_features = enc.get_features()
    assert len(expected_features) == len(encoder_features)

    # __eq__ does not support multioutput columns yet
    for expected, actual in zip(expected_features, encoder_features):
        assert expected.unique_name() == actual.unique_name()

    feature_matrix = ft.calculate_feature_matrix(
        encoder_features, es, instance_ids=ids
    )
    assert (fm_encoded == feature_matrix).all().all()