def test_osvar_seed(caplog):
    """
    Tests random number generator seeding with a system environment variable.

    The ``FATF_SEED`` environment variable is temporarily overwritten with
    ``42`` for the duration of the test. Its previous state is restored in a
    ``finally`` clause, so a failing assertion cannot leak the modified
    variable into the tests that run afterwards.
    """
    seed_int = 42
    # Value that ``random.getstate()[1][0]`` is compared against below.
    seed_int_random = 2147483648
    seed_str = str(seed_int)
    message = 'Seeding RNGs with {}.'.format(seed_str)

    # Memorise the current state of the system variable
    fatf_seed = os.environ.get('FATF_SEED', None)
    os.environ['FATF_SEED'] = seed_str
    try:
        fatf.setup_random_seed()

        # Check logging
        # Check that only one message was logged
        assert len(caplog.records) == 1
        # Check this message's log level
        assert caplog.records[0].levelname == 'INFO'
        # Check that the message matches
        assert caplog.records[0].getMessage() == message

        # Pseudo-check the actual seed
        assert random.getstate()[1][0] == seed_int_random
        assert np.random.get_state()[1][0] == seed_int
    finally:
        # Restore the system variable, even if one of the checks failed
        if fatf_seed is None:
            del os.environ['FATF_SEED']  # pragma: nocover
        else:
            os.environ['FATF_SEED'] = fatf_seed  # pragma: nocover
def test_linear_regressors(self):
    """
    Tests ``SKLearnLinearModelExplainer`` with linear regressors.

    Every model class in ``LINEAR_REGRESSORS`` is fitted on ``DATA`` and
    ``LABELS``, wrapped in the explainer, and its attributes and feature
    importance are compared with the expected values.
    """
    fatf.setup_random_seed()

    for index, regressor_class in enumerate(LINEAR_REGRESSORS):
        regressor_name = regressor_class.__name__
        regressor = regressor_class(**get_kwargs(regressor_name))
        regressor.fit(DATA, LABELS)

        explainer = ftsl.SKLearnLinearModelExplainer(
            regressor, feature_names=self.feature_names)

        # assert explainer.clf == regressor
        assert explainer.is_classifier is False
        assert explainer.feature_names == self.feature_names
        assert explainer.class_names is None
        assert explainer.features_number == 4
        assert explainer.classes_array is None

        importance = explainer.feature_importance()
        # The SGDRegressor importance is divided by 1e+10 before comparison
        if regressor_name == 'SGDRegressor':
            importance = importance / 1e+10
        assert np.allclose(importance, LINEAR_REG_COEF[index], atol=1e-3)
def test_linear_classifiers(self):
    """
    Tests ``SKLearnLinearModelExplainer`` with linear classifiers.

    Every model class in ``LINEAR_CLASSIFIERS`` is fitted on ``DATA`` and
    ``LABELS``, wrapped in the explainer, and its attributes and feature
    importance are compared with the expected values.
    """
    fatf.setup_random_seed()

    for index, classifier_class in enumerate(LINEAR_CLASSIFIERS):
        classifier = classifier_class(**get_kwargs(classifier_class.__name__))
        classifier.fit(DATA, LABELS)

        explainer = ftsl.SKLearnLinearModelExplainer(
            classifier, self.feature_names, self.class_names)

        # assert explainer.clf == classifier
        assert explainer.is_classifier is True
        assert explainer.feature_names == self.feature_names
        assert explainer.class_names == self.class_names
        assert explainer.features_number == 4
        assert np.array_equal(explainer.classes_array, [0, 1])

        importance = explainer.feature_importance()
        assert np.allclose(importance, LINEAR_CLF_COEF[index], atol=1e-3)
def test_random_seed(caplog):
    """
    Tests random number generator seeding when the seed is random.

    The ``FATF_SEED`` environment variable is removed for the duration of the
    test so that ``fatf.setup_random_seed`` is called without it. If it was
    set, it is put back in a ``finally`` clause, so a failing assertion cannot
    leave the environment altered for the tests that run afterwards.
    """
    # Memorise and clear the system variable in one step
    fatf_seed = os.environ.pop('FATF_SEED', None)
    try:
        assert 'FATF_SEED' not in os.environ

        fatf.setup_random_seed()
        seed = np.random.get_state()[1][0]
        message = 'Seeding RNGs with {}.'.format(seed)

        # Check logging
        # Check that only one message was logged
        assert len(caplog.records) == 1
        # Check this message's log level
        assert caplog.records[0].levelname == 'INFO'
        # Check that the message matches
        assert caplog.records[0].getMessage() == message

        # Check Python random state: re-seeding Python's RNG with the seed
        # read from numpy must reproduce the state left by the set-up call
        python_random_seed = random.getstate()
        random.seed(seed)
        assert random.getstate() == python_random_seed
        assert id(random.getstate()) != id(python_random_seed)
    finally:
        # Restore the system variable, even if one of the checks failed
        if fatf_seed is not None:
            os.environ['FATF_SEED'] = fatf_seed  # pragma: nocover
            assert 'FATF_SEED' in os.environ  # pragma: nocover
def test_random_binary_sampler():
    """
    Tests :func:`fatf.utils.data.instance_augmentation.random_binary_sampler`.

    First the input validation errors are checked, then a seeded sample of
    10 rows with 4 elements each is compared with the expected array.
    """
    elements_type_msg = 'The number of elements must be an integer.'
    elements_value_msg = 'The number of elements must be greater than 0.'
    samples_type_msg = 'The number of samples must be an integer.'
    samples_value_msg = 'The number of samples must be greater than 0.'

    # (expected exception, positional arguments, expected message)
    invalid_calls = [
        (TypeError, ('int', ), elements_type_msg),
        (TypeError, (1.0, ), elements_type_msg),
        (ValueError, (0, ), elements_value_msg),
        (ValueError, (-42, ), elements_value_msg),
        (TypeError, (4, 'int'), samples_type_msg),
        (TypeError, (4, 4.2), samples_type_msg),
        (ValueError, (4, 0), samples_value_msg),
        (ValueError, (4, -42), samples_value_msg),
    ]
    for error_type, arguments, expected_message in invalid_calls:
        with pytest.raises(error_type) as exin:
            fudi.random_binary_sampler(*arguments)
        assert str(exin.value) == expected_message

    fatf.setup_random_seed()

    expected_sample = np.array([
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 1, 1, 0],
        [1, 0, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 0, 0],
        [1, 1, 1, 0],
        [1, 0, 0, 0],
    ])
    drawn_sample = fudi.random_binary_sampler(4, 10)
    assert np.array_equal(drawn_sample, expected_sample)
def test_random_seed(caplog):
    """
    Tests random number generator seeding when the seed is random.

    The seed is read back from numpy's global state and compared with the
    logged message and with the state of Python's ``random`` module.
    """
    fatf.setup_random_seed()
    numpy_seed = np.random.get_state()[1][0]
    expected_message = 'Seeding RNGs with {}.'.format(numpy_seed)

    # Exactly one INFO record with the seeding message must be logged
    assert len(caplog.records) == 1
    record = caplog.records[0]
    assert record.levelname == 'INFO'
    assert record.getMessage() == expected_message

    # Re-seeding Python's RNG with the same seed must give an equal state...
    state_after_setup = random.getstate()
    random.seed(numpy_seed)
    state_after_reseed = random.getstate()
    assert state_after_reseed == state_after_setup
    # ...which nevertheless is a distinct object
    assert id(random.getstate()) != id(state_after_setup)
def test_randomise_patch(self):
    """
    Tests :func:`fatf.utils.data.occlusion.Occlusion._randomise_patch`.

    For a colour, a grayscale and a black-and-white image the randomised
    patch is compared with the expected values, and the default colouring
    strategy is compared with the named strategy expected for that image.
    """
    fatf.setup_random_seed()
    diagonal_mask = np.array([[1, 0], [0, 1]], dtype=bool)

    # (image, expected randomised patch, name of the default strategy)
    cases = [
        # Colour
        (ARRAY_IMAGE_3D,
         np.array([[125, 114, 71], [52, 44, 216]], dtype=np.uint8),
         'mean'),
        # Grayscale
        (ARRAY_IMAGE_2D,
         np.array([119, 13], dtype=np.uint8),
         'mean'),
        # Black-and-white
        (np.array([[0, 255], [255, 0]], dtype=np.uint8),
         np.array([0, 255], dtype=np.uint8),
         'black'),
    ]  # yapf: disable

    for image, expected_patch, default_strategy in cases:
        occluder = fudo.Occlusion(image, SEGMENTS)

        patch = occluder._randomise_patch(diagonal_mask)
        assert np.array_equal(patch, expected_patch)

        # ..check the default
        assert np.array_equal(
            occluder._colouring_strategy(ONES),
            occluder._generate_colouring_strategy(default_strategy)(ONES))
import matplotlib.pyplot as plt
import numpy as np

import fatf
import fatf.utils.models as fatf_models
import fatf.transparency.predictions.surrogate_image_explainers as fatf_exp
import fatf.vis.lime as fatf_vis_lime

print(__doc__)

# Fix random seed
fatf.setup_random_seed(42)

# Create a simple data set
# ..pixel colours given as RGB triplets: red, green, blue and black
r = [255, 0, 0]
g = [0, 255, 0]
b = [0, 0, 255]
k = [0, 0, 0]
# ..six 2x2 images; the first three have the red pixel in the top-left
# corner and the last three have it in the bottom-right corner
X = np.array(
    [
        [[r, g], [b, k]],
        [[r, b], [g, k]],
        [[r, k], [b, g]],
        [[k, g], [b, r]],
        [[k, b], [g, r]],
        [[g, k], [b, r]],
    ],
    dtype=np.uint8)  # yapf: disable
# ..one label per image
y = np.array([0, 0, 0, 1, 1, 1])

feature_names = {
    'Segment #1': 'top-left',
    'Segment #2': 'top-right',
    'Segment #3': 'bottom-left',
    'Segment #4': 'bottom-right'
}
class_names = {
    0: 'top-left-red',
    1: 'bottom-right-red'
}
def test_linear_classifier_coefficients():
    """
    Tests linear scikit-learn classifier coefficient extraction.

    Tests :func:`fatf.transparency.sklearn.linear_model.\
linear_classifier_coefficients` function.
    """
    fatf.setup_random_seed()

    type_error = ('This functionality is designated for linear-like '
                  'scikit-learn predictor instances only. Instead got: {}.')
    unfit_error = ("This {} instance is not fitted yet. Call 'fit' with "
                   'appropriate arguments before using this method.')

    def coefficients_after_fit(model, targets):
        """
        Checks the not-fitted error, fits the model and gets its coefficients.
        """
        with pytest.raises(sklearn.exceptions.NotFittedError) as excinfo:
            ftsl.linear_classifier_coefficients(model)
        expected = unfit_error.format(model.__class__.__name__)
        assert str(excinfo.value) == expected

        model.fit(DATA, targets)
        return ftsl.linear_classifier_coefficients(model)

    # Models that are not linear must be rejected with a type error
    for model_class in NON_LINEAR_MODELS:
        model = model_class()
        model.fit(DATA, LABELS)

        with pytest.raises(TypeError) as excinfo:
            ftsl.linear_classifier_coefficients(model)
        # Strip the "<class '...'>" wrapping to get the name used in the error
        class_path = str(model_class).strip("<>' ")[7:]
        assert str(excinfo.value) == type_error.format(class_path)

    # Linear regressors
    for index, model_class in enumerate(LINEAR_REGRESSORS):
        model_name = model_class.__name__
        model = model_class(**get_kwargs(model_name))

        coefficients = coefficients_after_fit(model, LABELS)
        # The SGDRegressor coefficients are divided by 1e+10 before comparison
        if model_name == 'SGDRegressor':
            coefficients = coefficients / 1e+10
        assert np.allclose(coefficients, LINEAR_REG_COEF[index], atol=1e-3)

    # Linear classifiers
    for index, model_class in enumerate(LINEAR_CLASSIFIERS):
        model = model_class(**get_kwargs(model_class.__name__))

        coefficients = coefficients_after_fit(model, LABELS)
        assert np.allclose(coefficients, LINEAR_CLF_COEF[index], atol=1e-3)

    # Linear multi-task regressors
    for index, model_class in enumerate(LINEAR_MULTITASK_REGRESSORS):
        model = model_class(**get_kwargs(model_class.__name__))

        coefficients = coefficients_after_fit(model, LABELS_MULTITASK)
        assert np.allclose(
            coefficients, LINEAR_MUL_REG_COEF[index], atol=1e-3)
def test_local_fidelity_score():
    """
    Tests the ``local_fidelity_score`` function.

    This function tests the
    :func:`fatf.utils.transparency.surrogate_evaluation.local_fidelity_score`
    function.
    """
    accuracy_warning = ('Some of the given labels are not present in either '
                        'of the input arrays: {}.')
    fatf.setup_random_seed()

    def accuracy(global_predictions, local_predictions):
        # Both arrays are thresholded at 0.5 in place before being compared
        for predictions in (global_predictions, local_predictions):
            predictions[predictions >= 0.5] = 1
            predictions[predictions < 0.5] = 0

        confusion = fumt.get_confusion_matrix(
            global_predictions, local_predictions, labels=[0, 1])
        return fummet.accuracy(confusion)

    def accuracy_prob(global_predictions,
                      local_predictions,
                      global_proba=True,
                      local_proba=True):
        # Probabilistic predictions are reduced with argmax along axis 1
        if global_proba:
            global_predictions = np.argmax(global_predictions, axis=1)
        if local_proba:
            local_predictions = np.argmax(local_predictions, axis=1)

        confusion = fumt.get_confusion_matrix(
            global_predictions, local_predictions, labels=[0, 1, 2])
        return fummet.accuracy(confusion)

    def accuracy_proba_np(global_predictions, local_predictions):
        return accuracy_prob(
            global_predictions,
            local_predictions,
            global_proba=False,
            local_proba=True)

    def accuracy_proba_nn(global_predictions, local_predictions):
        return accuracy_prob(
            global_predictions,
            local_predictions,
            global_proba=False,
            local_proba=False)

    def reg_dist(global_predictions, local_predictions):
        return (global_predictions - local_predictions).sum()

    predictor = fumm.KNN(k=3)
    predictor.fit(NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET)

    regressor = fumm.KNN(k=3, mode='regressor')
    regressor.fit(NUMERICAL_NP_ARRAY_LOCAL, NUMERICAL_NP_ARRAY_LOCAL_TARGET)

    regressor_23 = fumm.KNN(k=3, mode='regressor')
    regressor_23.fit(NUMERICAL_NP_ARRAY_LOCAL[:, [2, 3]],
                     NUMERICAL_NP_ARRAY_LOCAL_TARGET)

    # Structured array
    predictor_struct = fumm.KNN(k=3)
    predictor_struct.fit(NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET)
    #
    regressor_struct_cd = fumm.KNN(k=3, mode='regressor')
    regressor_struct_cd.fit(NUMERICAL_STRUCT_ARRAY_LOCAL[['c', 'd']],
                            NUMERICAL_NP_ARRAY_LOCAL_TARGET)

    instance = NUMERICAL_NP_ARRAY[0]

    # Global: probabilistic...
    # ...local: regressor
    score = futs.local_fidelity_score(
        NUMERICAL_NP_ARRAY, instance, predictor.predict_proba,
        regressor.predict, accuracy, 2)
    assert np.isclose(score, 0.26)
    # ...local: classifier
    score = futs.local_fidelity_score(
        NUMERICAL_NP_ARRAY, instance, predictor.predict_proba,
        predictor.predict, accuracy, 2)
    assert np.isclose(score, 1.0)

    # The remaining classifier combinations all warn about the missing label
    # (global predictor, local predictor, metric), in order:
    #   global probabilistic -- local probabilistic,
    #   global classifier -- local probabilistic,
    #   global classifier -- local classifier.
    warning_cases = [
        (predictor.predict_proba, predictor.predict_proba, accuracy_prob),
        (predictor.predict, predictor.predict_proba, accuracy_proba_np),
        (predictor.predict, predictor.predict, accuracy_proba_nn),
    ]
    for global_fn, local_fn, metric in warning_cases:
        with pytest.warns(UserWarning) as caught:
            score = futs.local_fidelity_score(
                NUMERICAL_NP_ARRAY, instance, global_fn, local_fn, metric)
        assert len(caught) == 1
        assert str(caught[0].message) == accuracy_warning.format({1})
        assert np.isclose(score, 1.0)

    # Global: regressor...
    # ...local: regressor
    score = futs.local_fidelity_score(
        NUMERICAL_NP_ARRAY,
        instance,
        regressor.predict,
        regressor_23.predict,
        reg_dist,
        explained_feature_indices=[2, 3])
    assert np.isclose(score, 0)

    # Structured array
    # Global: probabilistic...
    # ...local: regressor
    score = futs.local_fidelity_score(
        NUMERICAL_STRUCT_ARRAY,
        NUMERICAL_STRUCT_ARRAY[0],
        predictor_struct.predict_proba,
        regressor_struct_cd.predict,
        accuracy,
        0,
        explained_feature_indices=['c', 'd'])
    assert np.isclose(score, 0.94)
def test_submodular_pick():
    """
    Tests :func:`fatf.transparency.models.submodular_pick`.

    The calls below are made in a fixed order after a single seeding of the
    random number generators; the expected picks depend on that order.
    """
    size_msg = ('sample_size is larger than the number of samples in the '
                'data set. The whole dataset will be used.')
    number_msg = ('The number of explanations cannot be larger than '
                  'the number of instances (rows) in the data set.')

    def explainers_at(*indices):
        """Collects the reference explanations stored under ``indices``."""
        return [EXPLAINERS[index] for index in indices]

    fatf.setup_random_seed()

    picked, picked_ind = ftms.submodular_pick(
        NUMERICAL_NP_ARRAY, explain_instance_a, explanations_number=2)
    assert picked_ind == [0, 2]
    assert picked == explainers_at(0, 2)

    picked, picked_ind = ftms.submodular_pick(
        NUMERICAL_NP_ARRAY, explain_instance_b, explanations_number=2)
    assert picked_ind == [0, 1]
    assert picked == explainers_at(3, 2)

    # Sample size larger than the data set -- a warning is expected
    with pytest.warns(UserWarning) as caught:
        picked, picked_ind = ftms.submodular_pick(
            NUMERICAL_NP_ARRAY,
            explain_instance_a,
            sample_size=100,
            explanations_number=1)
    assert len(caught) == 1
    assert str(caught[0].message) == size_msg
    assert picked_ind == [0]
    assert picked == explainers_at(0)

    picked, picked_ind = ftms.submodular_pick(
        NUMERICAL_NP_ARRAY,
        explain_instance_a,
        sample_size=1,
        explanations_number=1)
    assert picked_ind == [1]
    assert picked == explainers_at(1)

    picked, picked_ind = ftms.submodular_pick(
        NUMERICAL_NP_ARRAY,
        explain_instance_a,
        sample_size=0,
        explanations_number=0)
    assert picked_ind == [0, 2, 1, 3]
    assert picked == explainers_at(0, 2, 1, 3)

    picked, picked_ind = ftms.submodular_pick(
        NUMERICAL_NP_ARRAY,
        explain_instance_a,
        sample_size=2,
        explanations_number=0)
    assert picked_ind == [3, 1]
    assert picked == explainers_at(3, 1)

    # More explanations requested than there are rows -- a warning is expected
    with pytest.warns(UserWarning) as caught:
        picked, picked_ind = ftms.submodular_pick(
            NUMERICAL_NP_ARRAY, explain_instance_a, 0, 222)
    assert len(caught) == 1
    assert str(caught[0].message) == number_msg
    assert picked_ind == [0, 2, 1, 3]
    assert picked == explainers_at(0, 2, 1, 3)
def test_sample(self):
    """
    Tests :func:`~fatf.utils.data.augmentation.Mixup.sample` method.

    The samples are drawn in a fixed order after a single seeding of the
    random number generators; the expected values depend on that order.
    """
    user_warning_gt = (
        'This Mixup class has not been initialised with a ground truth '
        'vector. The value of the data_row_target parameter will be '
        'ignored, therefore target values samples will not be returned.')
    user_warning_strat = (
        'Since the ground truth vector was not provided while '
        'initialising the Mixup class it is not possible to get a '
        'stratified sample of data points. Instead, Mixup will choose '
        'data points at random, which is equivalent to assuming that the '
        'class distribution is balanced.')

    def assert_columns_match(sampled, expected, numerical, categorical=()):
        """Compares structured arrays column by column."""
        for column in numerical:
            assert np.allclose(sampled[column], expected[column], atol=1e-3)
        for column in categorical:
            assert np.array_equal(sampled[column], expected[column])

    fatf.setup_random_seed()

    # Mixed array with ground truth and probabilities
    drawn = self.mixed_augmentor_i2f.sample(
        MIXED_ARRAY[0], 0, 5, return_probabilities=True)
    assert len(drawn) == 2
    expected_data = np.array(
        [(0.000, 'a', 0.332, 'a'),
         (0.000, 'a', 0.080, 'a'),
         (0.780, 'a', 0.587, 'a'),
         (0.992, 'a', 0.725, 'a'),
         (0.734, 'a', 0.073, 'a')],
        dtype=[('a', '<f4'), ('b', '<U1'), ('c', '<f4'), ('d', '<U2')])  # yapf: disable
    expected_target = np.array(
        [[1, 0], [1, 0], [1, 0], [1, 0], [0.266, 0.734]])
    assert np.allclose(drawn[1], expected_target, atol=1e-3)
    assert_columns_match(drawn[0], expected_data, ['a', 'c'], ['b', 'd'])

    # Mixed array with ground truth and probabilities
    drawn = self.mixed_augmentor.sample(
        MIXED_ARRAY[0], 1, 5, return_probabilities=True)
    assert len(drawn) == 2
    expected_data = np.array(
        [(0, 'a', 0.829, 'a'),
         (0, 'a', 0.601, 'a'),
         (0, 'a', 0.255, 'a'),
         (0, 'a', 0.377, 'a'),
         (0, 'a', 0.071, 'a')],
        dtype=[('a', '<i4'), ('b', '<U1'), ('c', '<f4'), ('d', '<U2')])  # yapf: disable
    expected_target = np.array([[0.823, 0.177],
                                [0.802, 0.198],
                                [0.624, 0.376],
                                [0.457, 0.543],
                                [0, 1]])  # yapf: disable
    assert np.allclose(drawn[1], expected_target, atol=1e-3)
    assert_columns_match(drawn[0], expected_data, ['a', 'c'], ['b', 'd'])

    # Numpy array without ground truth -- categorical
    with pytest.warns(UserWarning) as caught:
        drawn = self.categorical_np_augmentor.sample(
            CATEGORICAL_NP_ARRAY[0], samples_number=5)
    assert len(caught) == 1
    assert str(caught[0].message) == user_warning_strat
    #
    expected_data = np.array([['a', 'b', 'c'],
                              ['a', 'b', 'c'],
                              ['a', 'b', 'c'],
                              ['a', 'b', 'c'],
                              ['a', 'b', 'c']])  # yapf: disable
    assert np.array_equal(drawn, expected_data)

    # Numpy array without ground truth -- numerical -- test for warning
    with pytest.warns(UserWarning) as caught:
        drawn = self.numerical_np_augmentor.sample(
            NUMERICAL_NP_ARRAY[0], data_row_target=1, samples_number=5)
    assert len(caught) == 2
    assert str(caught[0].message) == user_warning_gt
    assert str(caught[1].message) == user_warning_strat
    #
    expected_data = np.array([[0.792, 0.000, 0.040, 0.373],
                              [0.000, 0.000, 0.080, 0.690],
                              [1.220, 0.610, 0.476, 0.562],
                              [0.000, 0.000, 0.080, 0.690],
                              [1.389, 0.694, 0.531, 0.544]])  # yapf: disable
    assert np.allclose(drawn, expected_data, atol=1e-3)

    # Structured array with ground truth -- numerical -- no probabilities
    drawn = self.numerical_struct_augmentor.sample(
        NUMERICAL_STRUCT_ARRAY[0], samples_number=5, data_row_target='b')
    assert len(drawn) == 2
    expected_data = np.array(
        [(0, 0, 0.039, 0.358),
         (1, 0, 0.544, 0.540),
         (1, 0, 0.419, 0.580),
         (0, 0, 0.080, 0.690),
         (0, 0, 0.080, 0.690)],
        dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<f4'), ('d', '<f4')])  # yapf: disable
    expected_target = np.array(['a', 'a', 'a', 'b', 'b'])
    assert np.array_equal(drawn[1], expected_target)
    assert_columns_match(drawn[0], expected_data, ['a', 'b', 'c', 'd'])
def test_highest_weights(caplog):
    """
    Tests :func:`fatf.utils.data.feature_choice.sklearn.highest_weights`.

    Besides the selected features, the number of captured log records is
    tracked throughout to make sure that only the expected calls log.
    """

    def assert_selected(selection, expected):
        """Checks that ``selection`` is a 1-D array equal to ``expected``."""
        assert fuav.is_1d_array(selection)
        assert np.array_equal(selection, np.array(expected))

    assert len(caplog.records) == 0
    fatf.setup_random_seed()
    assert len(caplog.records) == 2
    first_record, second_record = caplog.records[0], caplog.records[1]
    assert first_record.levelname == 'INFO'
    assert first_record.getMessage().startswith('Seeding RNGs ')
    assert second_record.levelname == 'INFO'
    assert second_record.getMessage() == 'Seeding RNGs with 42.'

    # Weights and no-weights
    weights = np.ones((NUMERICAL_NP_ARRAY.shape[0], ))
    # Classic array -- weights
    selection = fudfs.highest_weights(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert_selected(selection, [1, 2])
    # Structured array -- no-weights
    selection = fudfs.highest_weights(
        NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_number=2)
    assert_selected(selection, ['b', 'c'])

    # Selecting exactly 4 features -- no need for Lasso
    selection = fudfs.highest_weights(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 4)
    assert_selected(selection, [0, 1, 2, 3])
    # Selecting more than 4 features
    with pytest.warns(UserWarning) as caught:
        selection = fudfs.highest_weights(
            NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 5)
    assert len(caught) == 1
    assert str(caught[0].message) == FEATURE_INDICES_WARNING
    assert_selected(selection, ['a', 'b', 'c', 'd'])

    # No features number -- just percentage
    selection = fudfs.highest_weights(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_percentage=50)
    assert_selected(selection, [1, 2])
    # No features number -- just percentage -- too small no features selected
    assert len(caplog.records) == 2
    selection = fudfs.highest_weights(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_percentage=24)
    assert_selected(selection, [2])
    assert len(caplog.records) == 3
    assert caplog.records[2].levelname == 'WARNING'
    assert caplog.records[2].getMessage() == FEATURE_PERCENTAGE_LOG

    # Small weights
    weights = np.array([1, 1, 100, 1, 1, 1]) * 1e-20
    selection = fudfs.highest_weights(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert_selected(selection, [0, 1])

    # Another selection
    weights = np.array([100, 1, 1, 1, 1, 1])
    selection = fudfs.highest_weights(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert_selected(selection, [2, 3])
    selection = fudfs.highest_weights(
        NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert_selected(selection, ['c', 'd'])

    # Custom data
    selection = fudfs.highest_weights(
        np.array([[1, 2, 3], [2, 2, 3], [3, 2, 3], [4, 2, 3]]),
        np.array([1, 2, 3, 4]),
        features_number=2)
    assert_selected(selection, [0, 2])
    assert len(caplog.records) == 3
def test_lasso_path(caplog):
    """
    Tests :func:`fatf.utils.data.feature_choice.sklearn.lasso_path` function.

    Besides the selected features, the number of captured log records is
    tracked throughout to make sure that only the expected calls log.
    """
    no_lasso_log = ('The lasso path feature selection could not pick any '
                    'feature subset. All of the features were selected.')
    less_lasso_log = ('The lasso path feature selection could not pick {} '
                      'features. Only {} were selected.')

    def assert_selected(selection, expected):
        """Checks that ``selection`` is a 1-D array equal to ``expected``."""
        assert fuav.is_1d_array(selection)
        assert np.array_equal(selection, np.array(expected))

    def assert_last_warning(records_number, expected_message):
        """Checks the log size and that the newest record is this warning."""
        assert len(caplog.records) == records_number
        newest = caplog.records[records_number - 1]
        assert newest.levelname == 'WARNING'
        assert newest.getMessage() == expected_message

    assert len(caplog.records) == 0
    fatf.setup_random_seed()
    assert len(caplog.records) == 2
    first_record, second_record = caplog.records[0], caplog.records[1]
    assert first_record.levelname == 'INFO'
    assert first_record.getMessage().startswith('Seeding RNGs ')
    assert second_record.levelname == 'INFO'
    assert second_record.getMessage() == 'Seeding RNGs with 42.'

    # Weights and no-weights
    weights = np.ones((NUMERICAL_NP_ARRAY.shape[0], ))
    # Classic array -- weights
    selection = fudfs.lasso_path(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert_selected(selection, [0, 1])
    # Structured array -- no-weights
    selection = fudfs.lasso_path(
        NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_number=2)
    assert_selected(selection, ['a', 'b'])

    # Selecting exactly 4 features -- no need for Lasso
    selection = fudfs.lasso_path(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 4)
    assert_selected(selection, [0, 1, 2, 3])
    # Selecting more than 4 features
    with pytest.warns(UserWarning) as caught:
        selection = fudfs.lasso_path(
            NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 5)
    assert len(caught) == 1
    assert str(caught[0].message) == FEATURE_INDICES_WARNING
    assert_selected(selection, ['a', 'b', 'c', 'd'])

    # No features number -- just percentage
    selection = fudfs.lasso_path(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_percentage=50)
    assert_selected(selection, [0, 1])
    # No features number -- just percentage -- too small no features selected
    assert len(caplog.records) == 2
    selection = fudfs.lasso_path(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, features_percentage=24)
    assert_selected(selection, [0])
    assert_last_warning(3, FEATURE_PERCENTAGE_LOG)

    # Weights too small so no path is found -- returns all features
    weights = np.array([1, 1, 100, 1, 1, 1]) * 1e-20
    assert len(caplog.records) == 3
    selection = fudfs.lasso_path(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert_selected(selection, [0, 1, 2, 3])
    assert_last_warning(4, no_lasso_log)

    # Another selection
    weights = np.array([1, 1, 100, 1, 1, 1])
    selection = fudfs.lasso_path(
        NUMERICAL_NP_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert_selected(selection, [0, 2])
    selection = fudfs.lasso_path(
        NUMERICAL_STRUCT_ARRAY, NUMERICAL_NP_ARRAY_TARGET, weights, 2)
    assert_selected(selection, ['a', 'c'])

    # Lasso with no possibility of reducing the number of features
    assert len(caplog.records) == 4
    selection = fudfs.lasso_path(
        np.array([[1, 2, 3], [2, 2, 3], [3, 2, 3], [4, 2, 3]]),
        np.array([1, 2, 3, 4]),
        features_number=2)
    assert_selected(selection, [0])
    assert_last_warning(5, less_lasso_log.format(2, 1))
def test_generate_colouring_strategy(self):
    """
    Tests :func:`fatf.utils.data.occlusion.Occlusion.\
_generate_colouring_strategy`.

    The test covers colour, grayscale and black-and-white images in turn.
    The random number generators are re-seeded before the random strategies
    of each image type, so the order of the calls below must be preserved.
    """
    colour_type_msg = ('The colour can either be a string specifier; or '
                       'an RGB thriplet for RGB images and an integer '
                       'for or grayscale and black-and-white images.')

    # Errors
    occlusion = fudo.Occlusion(ARRAY_IMAGE_3D, SEGMENTS)
    # ..a list and an int are rejected for a colour image
    for wrong_colour in (['list'], 33):
        with pytest.raises(TypeError) as exin:
            occlusion._generate_colouring_strategy(wrong_colour)
        assert str(exin.value) == colour_type_msg
    # ..a tuple and a float are rejected for grayscale/black-and-white
    occlusion = fudo.Occlusion(ARRAY_IMAGE_2D, SEGMENTS)
    for wrong_colour in ((4, 2, 0), 2.0):
        with pytest.raises(TypeError) as exin:
            occlusion._generate_colouring_strategy(wrong_colour)
        assert str(exin.value) == colour_type_msg

    # Colour
    occlusion = fudo.Occlusion(ARRAY_IMAGE_3D, SEGMENTS)
    # string
    name_msg = ('Unknown colouring strategy name: colour.\n'
                "Choose one of the following: ['black', 'blue', 'green', "
                "'mean', 'pink', 'random', 'random-patch', 'randomise', "
                "'randomise-patch', 'red', 'white'].")
    with pytest.raises(ValueError) as exin:
        occlusion._generate_colouring_strategy('colour')
    assert str(exin.value) == name_msg
    # functional -- mean (also the default selected with None)
    for mean_specifier in (None, 'mean'):
        colour = occlusion._generate_colouring_strategy(mean_specifier)(ONES)
        assert np.array_equal(
            colour, np.ones(shape=(2, 2, 2, 3), dtype=np.uint8))

    # Mask with only the [1, 1] element switched on
    one_ = np.zeros(shape=(2, 2), dtype=bool)
    one_[1, 1] = True

    fatf.setup_random_seed()
    # functional -- random
    colour = occlusion._generate_colouring_strategy('random')(ONES)
    assert np.array_equal(colour, (57, 12, 140))
    # functional -- random-patch
    colour = occlusion._generate_colouring_strategy('random-patch')(one_)
    assert np.array_equal(colour, np.array([[16, 15, 47]], dtype=np.uint8))
    # functional -- randomise
    colour = occlusion._generate_colouring_strategy('randomise')(one_)
    assert np.array_equal(colour, (101, 214, 112))
    # functional -- randomise-patch
    colour = occlusion._generate_colouring_strategy('randomise-patch')(one_)
    assert np.array_equal(colour, np.array([[81, 216, 174]], dtype=np.uint8))
    # functional -- named fixed colours
    named_colours = [
        ('black', (0, 0, 0)),
        ('white', (255, 255, 255)),
        ('red', (255, 0, 0)),
        ('green', (0, 255, 0)),
        ('blue', (0, 0, 255)),
        ('pink', (255, 192, 203)),
    ]
    for colour_name, rgb in named_colours:
        colour = occlusion._generate_colouring_strategy(colour_name)(one_)
        assert np.array_equal(colour, rgb)
    # tuple
    colour = occlusion._generate_colouring_strategy((42, 24, 242))(one_)
    assert np.array_equal(colour, (42, 24, 242))

    # Grayscale
    occlusion = fudo.Occlusion(ARRAY_IMAGE_2D, SEGMENTS)
    # int
    name_msg = ('Unknown colouring strategy name: colour.\n'
                "Choose one of the following: ['black', 'mean', 'random', "
                "'random-patch', 'randomise', 'randomise-patch', 'white'].")
    with pytest.raises(ValueError) as exin:
        occlusion._generate_colouring_strategy('colour')
    assert str(exin.value) == name_msg
    range_msg = ('The colour should be an integer between '
                 '0 and 255 for grayscale images.')
    for out_of_range in (-1, 256):
        with pytest.raises(ValueError) as exin:
            occlusion._generate_colouring_strategy(out_of_range)
        assert str(exin.value) == range_msg
    colour = occlusion._generate_colouring_strategy(42)(one_)
    assert colour == 42
    # string
    for mean_specifier in (None, 'mean'):
        colour = occlusion._generate_colouring_strategy(mean_specifier)(ONES)
        assert np.array_equal(
            colour,
            np.array([[[85, 2], [85, 2]], [[85, 2], [85, 2]]],
                     dtype=np.uint8))

    fatf.setup_random_seed()
    # functional -- random
    colour = occlusion._generate_colouring_strategy('random')(ONES)
    assert colour == 57
    # functional -- random-patch
    colour = occlusion._generate_colouring_strategy('random-patch')(one_)
    assert np.array_equal(colour, np.array([125], dtype=np.uint8))
    # functional -- randomise
    colour = occlusion._generate_colouring_strategy('randomise')(one_)
    assert colour == 71
    # functional -- randomise-patch
    colour = occlusion._generate_colouring_strategy('randomise-patch')(one_)
    assert np.array_equal(colour, np.array([44], dtype=np.uint8))
    # functional -- black and white
    for colour_name, intensity in (('black', 0), ('white', 255)):
        colour = occlusion._generate_colouring_strategy(colour_name)(one_)
        assert colour == intensity

    # Black-and-white
    occlusion = fudo.Occlusion(
        np.array([[0, 255], [0, 255]], dtype=np.uint8), SEGMENTS)
    # int
    bw_msg = ('The colour should be 0 for black, or 1 or 255 for '
              'white for black-and-white images.')
    with pytest.raises(ValueError) as exin:
        occlusion._generate_colouring_strategy(42)
    assert str(exin.value) == bw_msg
    for colour_code, intensity in ((0, 0), (1, 255), (255, 255)):
        colour = occlusion._generate_colouring_strategy(colour_code)(one_)
        assert colour == intensity
    # string
    mean_msg = 'Mean occlusion is not supported for black-and-white images.'
    for mean_specifier in (None, 'mean'):
        with pytest.raises(RuntimeError) as exin:
            occlusion._generate_colouring_strategy(mean_specifier)
        assert str(exin.value) == mean_msg

    fatf.setup_random_seed()
    # functional -- random
    colour = occlusion._generate_colouring_strategy('random')(ONES)
    assert colour == 0
    # functional -- random-patch
    colour = occlusion._generate_colouring_strategy('random-patch')(one_)
    assert np.array_equal(colour, np.array([0], dtype=np.uint8))
    # functional -- randomise
    colour = occlusion._generate_colouring_strategy('randomise')(one_)
    assert colour == 0
    # functional -- randomise-patch
    colour = occlusion._generate_colouring_strategy('randomise-patch')(one_)
    assert np.array_equal(colour, np.array([0], dtype=np.uint8))
    # functional -- black and white
    for colour_name, intensity in (('black', 0), ('white', 255)):
        colour = occlusion._generate_colouring_strategy(colour_name)(one_)
        assert colour == intensity
def test_lasso_path(caplog):
    """
    Tests :func:`fatf.utils.data.feature_choice.sklearn.lasso_path` function.

    The selections are checked for classic and structured arrays, with and
    without sample weights, together with the warning and the log records
    that are expected to be emitted along the way.
    """
    too_many_features_msg = ('The selected number of features is larger '
                             'than the total number of features in the '
                             'dataset array. All of the features are being '
                             'selected.')
    tiny_percentage_msg = (
        'Since the number of features to be extracted was not given 24% of '
        'features will be used. This percentage translates to 0 features, '
        'therefore the number of features to be used is overwritten to 1. To '
        'prevent this from happening, you should either explicitly set the '
        'number of features via the features_number parameter or increase the '
        'value of the features_percentage parameter.')
    no_subset_msg = ('The lasso path feature selection could not pick any '
                     'feature subset. All of the features were selected.')
    fewer_features_msg = ('The lasso path feature selection could not pick {} '
                          'features. Only {} were selected.')

    def latest_record(expected_count):
        """Checks how many records were captured and returns the last one."""
        assert len(caplog.records) == expected_count
        return caplog.records[expected_count - 1]

    # Seeding the random number generators logs two INFO records
    assert not caplog.records
    fatf.setup_random_seed()
    assert len(caplog.records) == 2
    first_record, second_record = caplog.records
    assert first_record.levelname == 'INFO'
    assert first_record.getMessage().startswith('Seeding RNGs ')
    assert second_record.levelname == 'INFO'
    assert second_record.getMessage() == 'Seeding RNGs with 42.'

    data = NUMERICAL_NP_ARRAY
    data_struct = NUMERICAL_STRUCT_ARRAY
    target = NUMERICAL_NP_ARRAY_TARGET

    # With and without weights
    uniform_weights = np.ones((data.shape[0], ))
    # ...classic array, weights given
    selected = fudfs.lasso_path(data, target, uniform_weights, 2)
    assert np.array_equal(selected, np.array([0, 1]))
    # ...structured array, weights omitted
    selected = fudfs.lasso_path(data_struct, target, features_number=2)
    assert np.array_equal(selected, np.array(['a', 'b']))

    # Requesting exactly 4 features -- no need for Lasso
    selected = fudfs.lasso_path(data, target, uniform_weights, 4)
    assert np.array_equal(selected, np.array([0, 1, 2, 3]))
    # Requesting more than 4 features
    with pytest.warns(UserWarning) as caught:
        selected = fudfs.lasso_path(data_struct, target, uniform_weights, 5)
    assert len(caught) == 1
    assert str(caught[0].message) == too_many_features_msg
    assert np.array_equal(selected, np.array(['a', 'b', 'c', 'd']))

    # Only the percentage of features is given
    selected = fudfs.lasso_path(data, target, features_percentage=50)
    assert np.array_equal(selected, np.array([0, 1]))
    # ...a percentage that is too small to select any feature
    assert len(caplog.records) == 2
    selected = fudfs.lasso_path(data, target, features_percentage=24)
    assert np.array_equal(selected, np.array([0]))
    record = latest_record(3)
    assert record.levelname == 'WARNING'
    assert record.getMessage() == tiny_percentage_msg

    # Weights too small so no path is found -- returns all features
    skewed_weights = np.array([1, 1, 100, 1, 1, 1])
    assert len(caplog.records) == 3
    selected = fudfs.lasso_path(data, target, skewed_weights * 1e-20, 2)
    assert np.array_equal(selected, np.array([0, 1, 2, 3]))
    record = latest_record(4)
    assert record.levelname == 'WARNING'
    assert record.getMessage() == no_subset_msg

    # The same weights without the scaling give another selection
    selected = fudfs.lasso_path(data, target, skewed_weights, 2)
    assert np.array_equal(selected, np.array([0, 2]))
    selected = fudfs.lasso_path(data_struct, target, skewed_weights, 2)
    assert np.array_equal(selected, np.array(['a', 'c']))

    # Lasso with no possibility of reducing the number of features
    assert len(caplog.records) == 4
    constant_columns = np.array([[1, 2, 3], [2, 2, 3], [3, 2, 3], [4, 2, 3]])
    selected = fudfs.lasso_path(
        constant_columns, np.array([1, 2, 3, 4]), features_number=2)
    record = latest_record(5)
    assert record.levelname == 'WARNING'
    assert record.getMessage() == fewer_features_msg.format(2, 1)
def test_binary_sampler():
    """
    Tests :func:`fatf.utils.data.instance_augmentation.binary_sampler`.

    Non-binary rows must be rejected with a ``ValueError``. Small draws are
    compared with stored samples, large draws with the expected proportion
    of ones per feature.
    """
    fatf.setup_random_seed()

    not_binary_msg = 'The data_row is not binary.'
    expected_ratios = [0.5, 0., 0.5, 0.5]

    binary_row = np.array([1, 0, 1, 1])
    binary_row_samples = np.array([
        [0, 0, 0, 0],
        [1, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 1, 0]
    ])  # yapf: disable

    struct_dtype = [('a', 'i'), ('b', 'i'), ('c', 'f'), ('d', bool)]
    # Take the only element to get a single structured row
    struct_row = np.array([(1, 0, 1., True)], dtype=struct_dtype)[0]
    struct_row_samples = np.array(
        [(1, 0, 0., False),
         (0, 0, 0., True),
         (1, 0, 0., True),
         (1, 0, 1., True),
         (1, 0, 0., False)],
        dtype=struct_dtype)  # yapf: disable

    # Rows that are not binary are refused
    non_binary_rows = (
        np.array([0, 1, 2, 3]),
        np.array([0., 0.5, 0.5, 0.2]),
        CATEGORICAL_STRUCT_ARRAY[0],
        MIXED_ARRAY[0],
    )
    for bad_row in non_binary_rows:
        with pytest.raises(ValueError) as exin:
            fudi.binary_sampler(bad_row)
        assert str(exin.value) == not_binary_msg

    # Classic array -- exact samples
    drawn = fudi.binary_sampler(binary_row, samples_number=5)
    assert np.array_equal(drawn, binary_row_samples)

    # Classic array -- proportion of ones per feature
    drawn = fudi.binary_sampler(binary_row, samples_number=1000)
    observed_ratios = drawn.sum(axis=0) / drawn.shape[0]
    assert np.allclose(observed_ratios, expected_ratios, atol=1e-1)

    # Structured row -- exact samples and matching dtype
    drawn = fudi.binary_sampler(struct_row, samples_number=5)
    assert np.array_equal(drawn, struct_row_samples)
    assert fuav.are_similar_dtype_arrays(np.asarray(struct_row), drawn, True)

    # Structured row -- proportion of ones per field and matching dtype
    drawn = fudi.binary_sampler(struct_row, samples_number=1000)
    for field, ratio in zip(struct_row.dtype.names, expected_ratios):
        column = drawn[field]
        assert np.allclose(column.sum() / column.shape[0], ratio, atol=1e-1)
    assert fuav.are_similar_dtype_arrays(np.asarray(struct_row), drawn, True)
def test_sample(self):
    """
    Tests :func:`~fatf.utils.data.augmentation.NormalSampling.sample`.

    The augmentors are sampled in a fixed sequence after the random number
    generators are seeded: small draws are compared with stored results,
    large draws only with the expected means, spreads and frequencies.
    """
    fatf.setup_random_seed()

    def fields_close(sampled, expected, fields):
        """Compares the selected fields of two structured arrays."""
        for field in fields:
            assert np.allclose(sampled[field], expected[field], atol=1e-3)

    def moments_close(sampled, data, fields, row=None):
        """
        Compares the mean and the spread of the selected sampled fields.

        The mean is compared with ``row`` when it is given and with the
        mean of ``data`` otherwise; the standard deviation is always
        compared with the one of ``data``.
        """
        for field in fields:
            centre = np.mean(data[field]) if row is None else row[field]
            assert np.allclose(np.mean(sampled[field]), centre, atol=1e-1)
            assert np.allclose(
                np.std(sampled[field]), np.std(data[field]), atol=1e-1)

    def frequencies_close(column, values, frequencies, atol=1e-1):
        """Compares unique values of a column and their frequencies."""
        found, counts = np.unique(column, return_counts=True)
        observed = counts / counts.sum()
        assert np.array_equal(found, values)
        assert np.allclose(observed, frequencies, atol=atol)

    np_row = NUMERICAL_NP_ARRAY[0, :]
    struct_row = NUMERICAL_STRUCT_ARRAY[0]

    # Pure numerical sampling around a data point
    # ...classic array -- exact results
    draws = self.numerical_np_augmentor.sample(np_row, samples_number=3)
    assert np.allclose(draws, NUMERICAL_NP_RESULTS, atol=1e-3)
    # ...structured array -- exact results
    draws_struct = self.numerical_struct_augmentor.sample(
        struct_row, samples_number=3)
    fields_close(
        draws_struct, NUMERICAL_STRUCT_RESULTS, draws_struct.dtype.names)
    # ...classic array -- mean and spread
    draws = self.numerical_np_augmentor.sample(np_row, samples_number=1000)
    assert np.allclose(draws.mean(axis=0), np_row, atol=1e-1)
    assert np.allclose(
        draws.std(axis=0), NUMERICAL_NP_ARRAY.std(axis=0), atol=1e-1)
    # ...structured array -- mean and spread
    draws_struct = self.numerical_struct_augmentor.sample(
        struct_row, samples_number=1000)
    moments_close(draws_struct, NUMERICAL_STRUCT_ARRAY,
                  draws_struct.dtype.names, struct_row)

    # Pure numerical sampling around the mean of the data
    # ...classic array
    draws = self.numerical_np_augmentor.sample(samples_number=1000)
    assert np.allclose(
        draws.mean(axis=0), NUMERICAL_NP_ARRAY.mean(axis=0), atol=1e-1)
    assert np.allclose(
        draws.std(axis=0), NUMERICAL_NP_ARRAY.std(axis=0), atol=1e-1)
    # ...structured array
    draws_struct = self.numerical_struct_augmentor.sample(
        samples_number=1000)
    moments_close(
        draws_struct, NUMERICAL_STRUCT_ARRAY, draws_struct.dtype.names)

    #######################################################################

    # Numerical sampling with one categorical index defined
    # ...classic array -- exact results
    draws = self.numerical_np_0_augmentor.sample(np_row, samples_number=3)
    assert np.allclose(draws, NUMERICAL_NP_CAT_RESULTS, atol=1e-3)
    # ...structured array -- exact results
    draws_struct = self.numerical_struct_a_augmentor.sample(
        struct_row, samples_number=3)
    fields_close(draws_struct, NUMERICAL_STRUCT_CAT_RESULTS,
                 draws_struct.dtype.names)

    # ...classic array around a data point
    draws = self.numerical_np_0_augmentor.sample(
        np_row, samples_number=100)
    # ......numerical
    assert np.allclose(
        draws.mean(axis=0)[1:], NUMERICAL_NP_ARRAY[0, 1:], atol=1e-1)
    assert np.allclose(
        draws.std(axis=0)[1:],
        NUMERICAL_NP_ARRAY.std(axis=0)[1:],
        atol=1e-1)
    # ......categorical
    frequencies_close(
        draws[:, 0], NUMERICAL_NP_0_CAT_VAL, NUMERICAL_NP_0_CAT_FREQ)

    # ...structured array around a data point
    draws_struct = self.numerical_struct_a_augmentor.sample(
        struct_row, samples_number=100)
    # ......numerical
    moments_close(draws_struct, NUMERICAL_STRUCT_ARRAY,
                  draws_struct.dtype.names[1:], struct_row)
    # ......categorical
    frequencies_close(
        draws_struct['a'], NUMERICAL_NP_0_CAT_VAL, NUMERICAL_NP_0_CAT_FREQ)

    # ...classic array around the mean of the data
    draws = self.numerical_np_0_augmentor.sample(samples_number=1000)
    # ......numerical
    assert np.allclose(
        draws.mean(axis=0)[1:],
        NUMERICAL_NP_ARRAY.mean(axis=0)[1:],
        atol=1e-1)
    # ......categorical
    frequencies_close(
        draws[:, 0], NUMERICAL_NP_0_CAT_VAL, NUMERICAL_NP_0_CAT_FREQ)

    # ...structured array around the mean of the data
    draws_struct = self.numerical_struct_a_augmentor.sample(
        samples_number=1000)
    # ......numerical
    moments_close(
        draws_struct, NUMERICAL_STRUCT_ARRAY, draws_struct.dtype.names[1:])
    # ......categorical
    frequencies_close(
        draws_struct['a'], NUMERICAL_NP_0_CAT_VAL, NUMERICAL_NP_0_CAT_FREQ)

    #######################################################################
    #######################################################################

    # Pure categorical sampling
    # ...classic array -- exact results
    draws = self.categorical_np_012_augmentor.sample(
        CATEGORICAL_NP_ARRAY[0], samples_number=3)
    assert np.array_equal(draws, CATEGORICAL_NP_RESULTS)
    # ...structured array -- exact results
    draws_struct = self.categorical_struct_abc_augmentor.sample(
        CATEGORICAL_STRUCT_ARRAY[0], samples_number=3)
    assert np.array_equal(draws_struct, CATEGORICAL_STRUCT_RESULTS)

    categories = [['a', 'b'], ['b', 'c', 'f'], ['c', 'g']]
    # ...classic array -- values and their proportions
    draws = self.categorical_np_012_augmentor.sample(
        CATEGORICAL_NP_ARRAY[0], samples_number=100)
    shares = [
        np.array([0.62, 0.38]),
        np.array([0.31, 0.17, 0.52]),
        np.array([0.63, 0.37])
    ]
    for position, column in enumerate([0, 1, 2]):
        frequencies_close(draws[:, column], categories[position],
                          shares[position], atol=1e-2)
    # ...structured array -- values and their proportions
    draws_struct = self.categorical_struct_abc_augmentor.sample(
        CATEGORICAL_STRUCT_ARRAY[0], samples_number=100)
    shares = [
        np.array([0.74, 0.26]),
        np.array([0.38, 0.12, 0.50]),
        np.array([0.63, 0.37])
    ]
    for position, field in enumerate(['a', 'b', 'c']):
        frequencies_close(draws_struct[field], categories[position],
                          shares[position], atol=1e-2)

    # No need to check for mean of dataset since categorical features are
    # sampled from the distribution of the entire dataset and not centered
    # on the data_row.

    #######################################################################
    #######################################################################

    # Mixed array with categorical indices auto-discovered
    numerical_fields = ['a', 'c']
    categorical_fields = ['b', 'd']
    categories = [['a', 'c', 'f'], ['a', 'aa', 'b', 'bb']]
    shares = [
        np.array([0.33, 0.33, 0.33]),
        np.array([0.33, 0.16, 0.16, 0.33])
    ]

    # Around a data point -- exact results
    draws = self.mixed_augmentor.sample(MIXED_ARRAY[0], samples_number=3)
    # ...categorical
    assert np.array_equal(draws[['b', 'd']], MIXED_RESULTS[['b', 'd']])
    # ...numerical
    fields_close(draws, MIXED_RESULTS, numerical_fields)

    # Around a data point -- statistics
    draws = self.mixed_augmentor.sample(
        MIXED_ARRAY[0], samples_number=1000)
    # ...numerical
    moments_close(draws, MIXED_ARRAY, numerical_fields, MIXED_ARRAY[0])
    # ...categorical
    for position, field in enumerate(categorical_fields):
        frequencies_close(
            draws[field], categories[position], shares[position])

    # Around the mean of the data -- statistics
    draws = self.mixed_augmentor.sample(samples_number=1000)
    # ...numerical
    moments_close(draws, MIXED_ARRAY, numerical_fields)
    # ...categorical
    for position, field in enumerate(categorical_fields):
        frequencies_close(
            draws[field], categories[position], shares[position])

    #######################################################################

    # Two consecutive draws from the augmentor without the float cast. The
    # original note on the second set of expected values says that this
    # output was generated with self.numerical_struct_augmentor and that
    # the cast to float is done in the tests to compare.
    expected_rows_per_draw = (
        [(-1, 0, 0.172, 0.624),
         (1, 1, 0.343, 0.480),
         (0, 0, 0.649, 0.374),
         (0, 0, 0.256, 0.429),
         (0, 0, 0.457, 0.743)],
        [(1.250, 0.264, 0.381, 0.479),
         (-0.181, 1.600, 0.602, 0.345),
         (0.472, 0.609, -0.001, 1.026),
         (0.105, 1.091, 0.384, 0.263),
         (1.263, -0.007, 0.762, 0.603)],
    )  # yapf: disable
    for expected_rows in expected_rows_per_draw:
        draws = self.numerical_struct_augmentor_f.sample(samples_number=5)
        expected = np.array(
            expected_rows, dtype=NUMERICAL_STRUCT_ARRAY.dtype)
        fields_close(draws, expected, ['a', 'b', 'c', 'd'])