def _estimate_entropy(x: np.ndarray, epsilon: np.ndarray) -> float:
    """Estimate the entropy of a dataset whose columns may be of mixed kind.

    Columns are classified with ``_get_disc_columns``. A dataset made only
    of discrete columns goes to ``_estimate_disc_entropy``; one made only of
    continuous columns goes to ``_estimate_cont_entropy``. Otherwise the
    result is assembled from the chain rule ``H(c, d) = H(d) + H(c | d)``.

    Args:
        x: Dataset; coerced to two dimensions with ``asarray2d``.
        epsilon: Forwarded unchanged to the continuous and conditional
            estimators.

    Returns:
        The estimated entropy, or 0 when there is at most one row or there
        are no columns.
    """
    x = asarray2d(x)
    n_rows, n_cols = x.shape

    # Degenerate dataset: there is nothing to estimate.
    if n_rows <= 1 or n_cols == 0:
        return 0

    is_disc = _get_disc_columns(x)
    is_cont = ~is_disc

    # Homogeneous datasets are handled by the specialised estimators.
    if np.all(is_disc):
        return _estimate_disc_entropy(x)
    if np.all(is_cont):
        return _estimate_cont_entropy(x, epsilon)

    # Mixed dataset: split the columns by kind.
    disc_part = asarray2d(x[:, is_disc])
    cont_part = asarray2d(x[:, is_cont])

    # H(c | d) first, then H(d); their sum is H(c, d).
    entropy_cont_given_disc = _estimate_conditional_entropy(
        cont_part, disc_part, epsilon)
    entropy_disc = _estimate_disc_entropy(disc_part)
    return entropy_disc + entropy_cont_given_disc
def test_asarray2d_series():
    """asarray2d on a pd.Series matches asarray2d on the underlying array."""
    values = np.zeros((3, ))
    series = pd.Series(values)

    converted = asarray2d(series)

    # a second dimension must exist after conversion
    assert converted.shape[1] >= 1
    assert_array_equal(converted, asarray2d(values))
def _estimate_disc_entropy(x: np.ndarray) -> float:
    r"""Estimate the Shannon entropy of a discrete dataset.

    For a discrete random variable :math:`Z` with support
    :math:`\mathbb{Z}` and probability mass function :math:`P_Z`, the
    Shannon entropy is

    .. math::
        H(Z) = -\sum_{z \in \mathbb{Z}} P_Z(z) \log(P_Z(z))

    :math:`P_Z` is unknown, so it is replaced by the empirical probability
    :math:`\hat{P}_Z` obtained from ``_compute_empirical_probability``,
    which is then handed to ``scipy.stats.entropy``.

    If the columns of x logically represent continuous features, prefer
    ``_estimate_cont_entropy``. If unsure, ``estimate_entropy`` accepts
    datasets of mixed discrete and continuous features.

    Args:
        x: Dataset with shape (n_samples, n_features) or (n_samples, )

    Returns:
        the dataset entropy.
    """
    data = asarray2d(x)
    probabilities, _events = _compute_empirical_probability(data)
    return scipy.stats.entropy(probabilities)
def test_asarray2d_shape_n():
    """A 1d input of length 3 is converted to shape (3, 1)."""
    flat = np.zeros((3, ))

    converted = asarray2d(flat)

    assert converted.shape == (3, 1)
    # flattening the result gives back the original values
    assert_array_equal(np.ravel(converted), flat)
def judge(self):
    """Decide whether the candidate feature is informative about the target.

    The candidate feature's pipeline is fit on ``X_df``/``y_df`` and applied
    to ``X_df_val``. The estimated mutual information between the resulting
    values and ``y_val`` is then compared against ``self.threshold``.

    Returns:
        False if ``_handle_nans`` returns None for both arrays; otherwise
        whether the mutual information strictly exceeds the threshold.
    """
    logger.info(f'Judging feature using {self}')

    pipeline = self.candidate_feature.as_feature_engineering_pipeline()
    fitted = pipeline.fit(self.X_df, y=self.y_df)
    z_raw = fitted.transform(self.X_df_val)
    y_raw = self.y_val

    z = asarray2d(z_raw)
    y = asarray2d(y_raw)
    z, y = self._handle_nans(z, y)
    if z is None and y is None:
        # nans were found and handle_nan_targets == 'fail'
        return False

    mi = estimate_mutual_information(z, y)
    delta = mi - self.threshold
    logger.info(f'Mutual information with target I(Z;Y) is {mi} vs. '
                f'threshold {self.threshold} ({delta} above threshold)')
    return delta > 0
def test_asarray2d_df():
    """asarray2d on a pd.DataFrame keeps its shape and its values."""
    values = np.zeros((3, 2))
    frame = pd.DataFrame(values)

    converted = asarray2d(frame)

    assert converted.shape == frame.shape
    assert converted.shape[1] >= 1
    assert_array_equal(converted, values)
def _test_robust_transformer(self, input_types, bad_input_checks, catches,
                             transformer_maker=FragileTransformer):
    """Check that wrapping a fragile transformer makes it fail-safe.

    For every input type, the bare transformer must raise ``catches`` while
    the ``DelegatingRobustTransformer`` wrapper must return output equal to
    its input once both are passed through ``asarray2d``.
    """
    fragile = transformer_maker(bad_input_checks, catches)
    wrapped = transformer_maker(bad_input_checks, catches)
    robust = DelegatingRobustTransformer(wrapped)

    for input_type in input_types:
        X, y = self.d[input_type]

        # the unwrapped transformer is expected to blow up
        with self.assertRaises(catches):
            fragile.fit_transform(X, y)

        # the wrapped one must not raise and must hand back the input
        X_out = robust.fit_transform(X, y)
        unchanged = np.array_equal(asarray2d(X), asarray2d(X_out))
        self.assertTrue(unchanged)
def _concat_datasets(dfs_by_src, n_samples=0, omit=None): if omit is None: omit = [] filtered_dfs = [ np.array(dfs_by_src[x]) for x in dfs_by_src if x not in omit ] if len(filtered_dfs) == 0: return np.zeros((n_samples, 1)) return asarray2d(np.concatenate(filtered_dfs, axis=1))
def __init__(self, *args, lmbda_1=0., lmbda_2=0.):
    """Initialize and resolve the two lambda parameters.

    Args:
        *args: Forwarded to the parent initializer.
        lmbda_1: If not strictly positive, replaced by the estimated
            entropy of ``self.y`` divided by ``LAMBDA_1_ADJUSTMENT``.
        lmbda_2: If not strictly positive, replaced by the estimated
            entropy of ``self.y`` divided by ``LAMBDA_2_ADJUSTMENT``.
    """
    super().__init__(*args)
    self.y = asarray2d(self.y)

    if lmbda_1 <= 0 or lmbda_2 <= 0:
        # Estimate the target entropy once and share it between both
        # defaults instead of repeating the estimate for each of them.
        entropy = estimate_entropy(self.y)
        if lmbda_1 <= 0:
            lmbda_1 = entropy / LAMBDA_1_ADJUSTMENT
        if lmbda_2 <= 0:
            lmbda_2 = entropy / LAMBDA_2_ADJUSTMENT

    self.lmbda_1 = lmbda_1
    self.lmbda_2 = lmbda_2
def test_cont_disc_entropy_differs_cont():
    """The continuous and discrete estimators disagree on a float column."""
    column = asarray2d(np.arange(50)) + 0.5
    eps = _compute_epsilon(column)

    entropy_as_cont = _estimate_cont_entropy(column, eps)
    entropy_as_disc = _estimate_disc_entropy(column)

    assert entropy_as_cont != entropy_as_disc
def test_robust_transformer(
    input_types,
    bad_input_checks,
    catches,
    transformer_maker,
    sample_data,
):
    """Check that wrapping a fragile transformer makes it fail-safe.

    For every input type, the bare transformer must raise ``catches`` while
    the ``DelegatingRobustTransformer`` wrapper must return output equal to
    its input once both are passed through ``asarray2d``.
    """
    fragile = transformer_maker(bad_input_checks, catches)
    wrapped = transformer_maker(bad_input_checks, catches)
    robust = DelegatingRobustTransformer(wrapped)

    for input_type in input_types:
        X, y = sample_data[input_type]

        # the unwrapped transformer is expected to blow up
        with pytest.raises(catches):
            fragile.fit_transform(X, y)

        # the wrapped one must not raise and must hand back the input
        X_out = robust.fit_transform(X, y)
        assert np.array_equal(asarray2d(X), asarray2d(X_out))
def test_entropy_multiple_disc():
    """Appending a continuous column to constant columns raises entropy."""
    zeros_col = np.zeros((50, 1))
    ones_col = np.ones((50, 1))
    # adding 0.5 makes the values non-integer floats, so the column is
    # classified as continuous
    cont_col = asarray2d(np.arange(50) + 0.5)

    disc_only = np.concatenate((ones_col, zeros_col), axis=1)
    mixed = np.concatenate((ones_col, zeros_col, cont_col), axis=1)

    disc_only_h = estimate_entropy(disc_only)
    mixed_h = estimate_entropy(mixed)

    assert mixed_h > disc_only_h, \
        'Expected adding continuous column increases entropy'
def test_cont_disc_entropy_differs_disc(get_disc_columns):
    """The continuous and discrete estimators disagree on an int column."""
    column = asarray2d(np.arange(50))

    # Because the column really is discrete, epsilon would normally not be
    # calculated (it would be set to a dummy value of -inf). The
    # get_disc_columns fixture patches the column detection so that epsilon
    # is "forced" to be calculated.
    eps = _compute_epsilon(column)

    entropy_as_cont = _estimate_cont_entropy(column, eps)
    entropy_as_disc = _estimate_disc_entropy(column)

    assert entropy_as_cont != entropy_as_disc
def __init__(self,
             *args,
             lmbda_1: float = 0.0,
             lmbda_2: float = 0.0,
             lambda_1_adjustment: float = LAMBDA_1_ADJUSTMENT,
             lambda_2_adjustment: float = LAMBDA_2_ADJUSTMENT):
    """Initialize and resolve the two lambda parameters.

    Args:
        *args: Forwarded to the parent initializer.
        lmbda_1: If not strictly positive, replaced by the estimated
            entropy of ``self.y_val`` divided by ``lambda_1_adjustment``.
        lmbda_2: If not strictly positive, replaced by the estimated
            entropy of ``self.y_val`` divided by ``lambda_2_adjustment``.
        lambda_1_adjustment: Divisor used to derive the default lmbda_1.
        lambda_2_adjustment: Divisor used to derive the default lmbda_2.
    """
    super().__init__(*args)
    self.y_val = asarray2d(self.y_val)

    if lmbda_1 <= 0 or lmbda_2 <= 0:
        # Estimate the target entropy once and share it between both
        # defaults instead of repeating the estimate for each of them.
        entropy = estimate_entropy(self.y_val)
        if lmbda_1 <= 0:
            lmbda_1 = entropy / lambda_1_adjustment
        if lmbda_2 <= 0:
            lmbda_2 = entropy / lambda_2_adjustment

    self.lmbda_1 = lmbda_1
    self.lmbda_2 = lmbda_2
def _concat_datasets(feature_df_map: Dict[Feature, pd.DataFrame],
                     n_samples: int = 0,
                     omit: Optional[List[Feature]] = None) -> np.ndarray:
    """Concatenate the per-feature datasets column-wise into one 2d array.

    Args:
        feature_df_map: Mapping from a feature to its dataset. Every value
            is converted with ``np.array`` before concatenation.
        n_samples: Number of rows of the placeholder that is returned when
            no dataset is left to concatenate.
        omit: Features to leave out of the result. None means nothing is
            omitted.

    Returns:
        The remaining datasets concatenated along axis 1 and passed through
        ``asarray2d``, or an all-zero array of shape (n_samples, 1) if no
        dataset remains.
    """
    if omit is None:
        omit = []

    kept = []
    for feature in feature_df_map:
        if feature in omit:
            continue
        kept.append(np.array(feature_df_map[feature]))

    if kept:
        return asarray2d(np.concatenate(kept, axis=1))

    # nothing left: single zero column with the requested number of rows
    return np.zeros((n_samples, 1))
def _compute_empirical_probability(x):
    """Compute the empirical probability of each distinct row of x.

    Args:
        x: array-like; coerced to two dimensions with ``asarray2d``

    Returns:
        pk: array-like of shape (K,) where pk[k] is the relative frequency
            of event k among the rows of x
        events: array-like of shape (K, m) holding the K unique rows
            (events) of x, each a vector of length m
    """
    data = asarray2d(x)
    n_rows = data.shape[0]

    # unique rows together with how often each one occurs
    events, counts = np.unique(data, axis=0, return_counts=True)
    pk = counts / n_rows
    return pk, events
def estimate_entropy(x):
    r"""Estimate dataset entropy.

    Accepts datasets of mixed discrete and continuous features and relies
    on a set of heuristics to decide which estimator applies to which
    columns. Discrete (Shannon) entropy is estimated from the empirical
    probability mass function. Continuous (differential) entropy is
    estimated with the KSG estimator [1].

    Let x consist of continuous features c and discrete features d. Both
    kinds are handled through the following decomposition of the entropy:

    .. math::
       :nowrap:

       \begin{align}
       H(x) &= H(c,d) \\
            &= H(d) + H(c | d) \\
            &= \sum_{x \in d} p(x) H(c(x)) + H(d),
       \end{align}

    where :math:`c(x)` is the dataset made of the rows of the continuous
    dataset that share a row with the discrete value x in the original
    dataset.

    The epsilon required by the KSG estimator is not a parameter of this
    function; it is computed internally with ``_compute_epsilon``.

    Args:
        x (array-like): Dataset with shape (n_samples, n_features) or
            (n_samples, )

    Returns:
        float: Dataset entropy of x.

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    """
    data = asarray2d(x)
    return _estimate_entropy(data, _compute_epsilon(data))
def _compute_epsilon(x: np.ndarray) -> np.ndarray:
    """Calculate epsilon from the KSG Estimator

    Epsilon is twice the distance of each element to its k-th nearest
    neighbor, measured over the continuous columns of x only.

    Args:
        x: An array with shape (n_samples, n_features)

    Returns:
        An array with shape (n_samples, 1) representing epsilon as described
        above. If every column is discrete, the array is filled with -inf.

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    """
    n_samples = x.shape[0]
    is_disc = _get_disc_columns(x)

    if np.all(is_disc):
        # no continuous columns, so there is no point computing epsilon
        return np.full((n_samples, 1), -np.inf)

    n_neighbors = N_NEIGHBORS
    cont = x[:, ~is_disc]
    nn = _make_neighbors(n_neighbors=n_neighbors)
    nn.fit(cont)

    # A k-th neighbor at distance 0 is a problem. As long as the old value
    # of k is not used again later, we can keep increasing k until every
    # distance is nonzero (or k reaches the number of samples).
    #
    # This is meant to be made safer by treating columns with many repeated
    # values as discrete rather than continuous (see _is_disc_column).
    # Other options would be adding a small amount of noise to the whole
    # column, or something else entirely.
    kth_distance = np.zeros(n_samples)
    while n_neighbors < n_samples and not np.all(kth_distance):
        all_distances, _ = nn.kneighbors(n_neighbors=n_neighbors)
        # last column holds the distance to the k-th nearest neighbor
        kth_distance = all_distances[:, -1]
        n_neighbors += 1

    return asarray2d(2. * kth_distance)
def _estimate_cont_entropy(x: np.ndarray, epsilon: np.ndarray) -> float:
    """Estimate the differential entropy of a continuous dataset.

    Based on the KSG Estimator [1], a nonparametric method that estimates
    entropy from k-nearest neighbor distances. Because epsilon is supplied
    by the caller, this is a partial estimation of the KSG entropy
    estimator; the bias is cancelled out when computing mutual information.

    If the columns of x logically represent discrete features, prefer
    ``_estimate_disc_entropy``. If unsure, ``_estimate_entropy`` accepts
    datasets of mixed discrete and continuous features.

    Note that differential entropy is *not* the "extension" of the Shannon
    entropy, so it lacks some of its properties such as non-negativity
    (i.e. values below zero are possible).

    Args:
        x: Dataset with shape (n_samples, n_features) or (n_samples, )
        epsilon: An array with shape (n_samples, 1) that is the epsilon used
            in the KSG Estimator. Represents the Chebyshev distance from an
            element to its k-th nearest neighbor in the full dataset.

    Returns:
        differential entropy of the dataset

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    """
    x = asarray2d(x)
    n_samples, n_dims = x.shape

    # neighbor counts are taken within a radius of epsilon / 2
    neighbor_counts = _compute_n_points_within_radius(x, epsilon / 2.0)
    unit_ball_volume = _compute_volume_unit_ball(n_dims)

    neighbor_term = -np.mean(digamma(neighbor_counts + 1))
    sample_term = digamma(n_samples)
    volume_term = np.log(unit_ball_volume)
    distance_term = n_dims * np.mean(np.log(epsilon))

    return neighbor_term + sample_term + volume_term + distance_term
def _compute_epsilon(x):
    """Calculate epsilon from the KSG Estimator

    Epsilon is twice the distance of each element to its k-th nearest
    neighbor, measured over the continuous columns of x only.

    Args:
        x (array-like): An array with shape (n_samples, n_features)

    Returns:
        array-like: An array with shape (n_samples, 1) representing epsilon
            as described above. If every column is discrete, the array is
            filled with -inf.

    References:

    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    """
    k = N_NEIGHBORS
    n = x.shape[0]

    disc_mask = _get_disc_columns(x)
    if np.all(disc_mask):
        # No continuous columns, so there is no point computing epsilon.
        # Return the dummy value as an array so that the result has the
        # documented (n_samples, 1) shape on every path.
        return np.full((n, 1), -np.inf)

    c = x[:, ~disc_mask]
    nn = _make_neighbors(n_neighbors=k)
    nn.fit(c)

    # A k-th neighbor at distance 0 is a problem. As long as the old value
    # of k is not used again later, we can keep increasing k until every
    # distance is nonzero (or k reaches the number of samples).
    distances = np.zeros(n)
    while not np.all(distances) and k < n:
        # last column holds the distance to the k-th nearest neighbor
        distances = nn.kneighbors(n_neighbors=k)[0][:, -1]
        k += 1

    return asarray2d(2. * distances)
def get_feature_values(feature):
    """Fit and apply the feature's pipeline on X_df/y_df; return a 2d array."""
    pipeline = feature.as_feature_engineering_pipeline()
    transformed = pipeline.fit_transform(X_df, y_df)
    return asarray2d(transformed)
def discover(
    features: List['ballet.feature.Feature'],
    X_df: Optional[pd.DataFrame],
    y_df: Optional[pd.DataFrame],
    y: Optional[np.ndarray],
    input: Optional[str] = None,
    primitive: Optional[str] = None,
    expensive_stats: bool = False,
) -> pd.DataFrame:
    """Discover existing features

    Show information about existing features, including summary statistics
    computed on the development dataset. When a feature extracts several
    feature values, the summary statistics (e.g. mean, std, nunique) are
    computed per feature value and then averaged. When the development
    dataset cannot be loaded (any of ``X_df``, ``y_df``, ``y`` is None), the
    computation of summary statistics is skipped.

    The following information is shown:

    - name: the name of the feature
    - description: the description of the feature
    - input: the variables that are used as input to the feature
    - transformer: the transformer/transformer pipeline
    - output: the output columns of the feature (not usually specified)
    - author: the GitHub username of the feature's author
    - source: the fully-qualified name of the Python module that contains
        the feature
    - mutual_information: estimated mutual information between the feature
        (or averaged over feature values) and the target on the development
        dataset split
    - conditional_mutual_information: estimated conditional mutual
        information between the feature (or averaged over feature values)
        and the target conditional on all other features on the development
        dataset split
    - ninputs: the number of input columns to the feature
    - nvalues: the number of feature values this feature extracts (i.e. 1
        for a scalar-valued feature and >1 for a vector-valued feature)
    - ncontinuous: the number of feature values this feature extracts that
        are continuous-valued
    - ndiscrete: the number of feature values this feature extracts that
        are discrete-valued
    - mean: mean of the feature on the development dataset split
    - std: standard deviation of the feature (or averaged over feature
        values) on the development dataset split
    - var: variance of the feature (or averaged over feature values) on the
        development dataset split
    - min: minimum of the feature on the development dataset split
    - median: median of the feature (or median over feature values) on the
        development dataset split
    - max: maximum of the feature on the development dataset split
    - nunique: number of unique values of the feature (or averaged over
        feature values) on the development dataset split

    The following query operators are supported:

    - input (str): filter to only features that have ``input`` in their
        input/list of inputs
    - primitive (str): filter to only features that use primitive
        ``primitive`` (i.e. a class with name ``primitive``) in the
        transformer/transformer pipeline

    For other queries, you should just use normal DataFrame indexing::

       >>> features_df[features_df['author'] == 'jane']
       >>> features_df[features_df['name'].str.contains('married')]
       >>> features_df[features_df['mutual_information'] > 0.05]
       >>> features_df[features_df['input'].apply(
               lambda input: 'A' in input and 'B' in input)]

    Returns:
        data frame with features on the row index and columns as described
        above
    """
    have_data = X_df is not None and y_df is not None and y is not None

    if have_data:

        # a feature whose pipeline raises is ignored rather than aborting
        # the whole discovery
        @fy.ignore(Exception)
        def get_feature_values(feature):
            pipeline = feature.as_feature_engineering_pipeline()
            return asarray2d(pipeline.fit_transform(X_df, y_df))

        values = {
            feature: get_feature_values(feature)
            for feature in features
        }
        y = asarray2d(y)
        summarize = fy.rpartial(
            _summarize_feature, values, y, expensive_stats)
    else:
        summarize = fy.rpartial(
            _summarize_feature, None, None, expensive_stats)

    def excluded_by_input(feature):
        # only container inputs are inspected, which avoids callables
        return (input
                and isinstance(feature.input, Container)
                and input not in feature.input
                and input != feature.input)

    def excluded_by_primitive(feature):
        return (primitive
                and primitive not in get_transformer_primitives(
                    feature.transformer))

    records = [
        summarize(feature)
        for feature in tqdm(features)
        if not (excluded_by_input(feature) or excluded_by_primitive(feature))
    ]
    return pd.DataFrame.from_records(records)
def H(a):  # noqa
    """Shorthand for the estimated entropy of a, coerced to 2d first."""
    as_2d = asarray2d(a)
    return estimate_entropy(as_2d)
def test_is_column_cont(x, expected):
    """_is_column_cont on the 2d form of x gives the expected verdict."""
    column = asarray2d(x)
    assert _is_column_cont(column) == expected
def add_noise(X):
    """Return X (as a 2d array) plus normal noise with mean 0, scale 0.5."""
    data = asarray2d(X)
    noise = np.random.normal(0, 0.5, data.shape)
    return data + noise
def test_is_column_disc(self):
    """A column holding the integers 0..49 is detected as discrete."""
    column = asarray2d(np.arange(50))
    self.assertTrue(_is_column_disc(column))
def test_is_column_cont(self):
    """A column of 50 draws from np.random.rand is detected as continuous."""
    column = asarray2d(np.random.rand(50))
    self.assertTrue(_is_column_cont(column))
def test_asarray2d_shape_n_x_1():
    """An input that already has shape (3, 1) comes back equal to itself."""
    column = np.zeros((3, 1))

    converted = asarray2d(column)

    assert_array_equal(converted, column)