def test_mi_uninformative():
    """A constant column shares (essentially) no information with x."""
    x = np.arange(1, 101).reshape(-1, 1)
    y = np.ones((100, 1))
    mi = estimate_mutual_information(x, y)
    h_z = estimate_entropy(x)
    # MI against a constant should fall well below the entropy of x
    assert mi < h_z / 4, \
        'uninformative column should have no information'
def test_mi_informative():
    """An exact copy of a column carries high mutual information."""
    x = np.arange(1, 101).reshape(-1, 1)
    y = np.arange(1, 101).reshape(-1, 1)
    mi = estimate_mutual_information(x, y)
    h_y = estimate_entropy(y)
    # identical columns: MI should be a large fraction of H(y)
    assert mi > h_y / 4, \
        'exact copy columns should have high information'
def test_mi_informative(self):
    """An exact copy of a column carries high mutual information."""
    x = np.arange(1, 101).reshape(-1, 1)
    y = np.arange(1, 101).reshape(-1, 1)
    mi = estimate_mutual_information(x, y)
    h_y = estimate_entropy(y)
    # identical columns: MI should be a large fraction of H(y)
    self.assertGreater(
        mi, h_y / 4,
        'exact copy columns should have high information')
def test_mi_uninformative(self):
    """A constant column shares (essentially) no information with x."""
    x = np.arange(1, 101).reshape(-1, 1)
    y = np.ones((100, 1))
    mi = estimate_mutual_information(x, y)
    h_z = estimate_entropy(x)
    # MI against a constant should fall well below the entropy of x
    self.assertGreater(
        h_z / 4, mi,
        'uninformative column should have no information')
def test_cmi_high_info_uninformative_z():
    """Conditioning on a constant z must not change I(x;y)."""
    # redundant copies have little information
    x = np.arange(1, 101).reshape(-1, 1)
    y = np.arange(1, 101).reshape(-1, 1)
    # exact copies of y should have lots of information
    useless_z = np.ones((100, 1))
    cmi = estimate_conditional_information(x, y, useless_z)
    mi = estimate_mutual_information(x, y)
    # CMI and MI should agree to within rounding
    assert round(abs(cmi - mi)) == 0, \
        'uninformative z should not affect mutual information score'
def test_cmi_high_info_uninformative_z(self):
    """Conditioning on a constant z must not change I(x;y)."""
    # redundant copies have little information
    x = np.reshape(np.arange(1, 101), (-1, 1))
    y = np.reshape(np.arange(1, 101), (-1, 1))
    # exact copies of y should have lots of information
    useless_z = np.ones((100, 1))
    cmi = estimate_conditional_information(x, y, useless_z)
    mi = estimate_mutual_information(x, y)
    # BUG FIX: the message must be passed as msg=. The third positional
    # argument of assertAlmostEqual is `places`, and passing a str there
    # raises TypeError inside round() whenever cmi != mi.
    self.assertAlmostEqual(
        cmi, mi,
        msg='uninformative z should not affect mutual information score')
def judge(self):
    """Accept the candidate feature iff its mutual information with the
    target exceeds ``self.threshold``.

    Returns False early if NaN handling discards the data entirely.
    """
    logger.info(f'Judging feature using {self}')
    pipeline = self.candidate_feature.as_feature_engineering_pipeline()
    z = pipeline.fit(self.X_df, y=self.y_df).transform(self.X_df_val)
    y = self.y_val
    # normalize both to 2-d arrays before scoring
    z, y = asarray2d(z), asarray2d(y)
    z, y = self._handle_nans(z, y)
    if z is None and y is None:
        # nans were found and handle_nan_targets == 'fail'
        return False
    mi = estimate_mutual_information(z, y)
    delta = mi - self.threshold
    logger.info(f'Mutual information with target I(Z;Y) is {mi} vs. '
                f'threshold {self.threshold} ({delta} above threshold)')
    return delta > 0
def _summarize_feature(
    feature: 'ballet.feature.Feature',
    values: Optional[Dict['ballet.feature.Feature', Optional[np.ndarray]]],
    y: Optional[np.ndarray],
    expensive_stats: bool,
) -> dict:
    """Summarize a single feature.

    Builds a flat dict of metadata (name, description, transformer, ...)
    plus numeric statistics of the feature's extracted values. All numeric
    entries default to ``np.nan`` and are only filled in when both
    ``values`` and ``y`` are available and this feature's values are not
    ``None``. Conditional mutual information is computed only when
    ``expensive_stats`` is set or the "other features" matrix is narrow
    enough (fewer than ``EXPENSIVE_STATS_CMI_MAX_COLS_X`` columns).
    """
    result = {
        'name': feature.name,
        'description': feature.description,
        # normalize feature.input to a list of column names; a callable
        # input (dynamic selection) is recorded as an empty list
        'input':
            [feature.input]
            if isinstance(feature.input, str)
            else feature.input
            if not callable(feature.input)
            else [],
        'transformer': repr(feature.transformer),
        'primitives': get_transformer_primitives(feature.transformer),
        'output': feature.output,
        'author': feature.author,
        'source': feature.source,
        # numeric stats default to nan; overwritten below when computable
        'mutual_information': np.nan,
        'conditional_mutual_information': np.nan,
        'ninputs': np.nan,
        'nvalues': np.nan,
        'ncontinuous': np.nan,
        'ndiscrete': np.nan,
        'mean': np.nan,
        'std': np.nan,
        'variance': np.nan,
        'min': np.nan,
        'median': np.nan,
        'max': np.nan,
        'nunique': np.nan,
    }

    # if feature values are missing here, the values are left at nans from
    # above
    if values is not None and y is not None:
        z = values[feature]
        if z is not None:
            # values of every *other* feature, used as the conditioning
            # matrix x for conditional mutual information
            feature_values_list = [
                feature_values
                for other_feature, feature_values in values.items()
                if other_feature is not feature
                and feature_values is not None
            ]
            if feature_values_list:
                x = np.concatenate(feature_values_list, axis=1)
            else:
                # no other features: empty matrix with matching row count
                x = np.empty((z.shape[0], 0))

            # drop rows with nans, keyed on the left argument (y)
            # — presumably skipna aligns all arrays on surviving rows;
            # confirm against its definition
            _y, _z = skipna(y, z, how='left')
            result['mutual_information'] = \
                estimate_mutual_information(_z, _y)

            if not callable(feature.input):
                if isinstance(feature.input, str):
                    result['ninputs'] = 1
                else:
                    result['ninputs'] = len(feature.input)
            result['nvalues'] = z.shape[1]
            result['ncontinuous'] = np.sum(_get_cont_columns(z))
            result['ndiscrete'] = np.sum(_get_disc_columns(z))
            result['mean'] = np.mean(np.mean(z, axis=0))  # same thing anyway
            result['std'] = np.mean(np.std(z, axis=0))
            result['variance'] = np.mean(np.var(z, axis=0))
            result['min'] = np.min(z)
            result['median'] = np.median(np.median(z, axis=0))
            result['max'] = np.max(z)
            result['nunique'] = np.mean(countunique(z, axis=0))

            # CMI is expensive in the width of x; skip when x is wide
            # unless expensive stats were explicitly requested
            if expensive_stats or \
                    x.shape[1] < EXPENSIVE_STATS_CMI_MAX_COLS_X:
                _y, _z, _x = skipna(y, z, x, how='left')
                result['conditional_mutual_information'] = \
                    estimate_conditional_information(_z, _y, _x)

    return result
def I(a, b, c=None):  # noqa
    """Shorthand estimator: I(a; b), or I(a; b | c) when c is given."""
    if c is not None:
        return estimate_conditional_information(a, b, c)
    return estimate_mutual_information(a, b)