Beispiel #1
0
def test_mi_uninformative():
    """MI between x and a constant y must stay below a quarter of H(x)."""
    n_rows = 100
    x = np.arange(1, n_rows + 1).reshape(-1, 1)
    # y is the same value in every row
    y = np.ones((n_rows, 1))
    mi = estimate_mutual_information(x, y)
    h_z = estimate_entropy(x)
    message = 'uninformative column should have no information'
    assert mi < h_z / 4, message
Beispiel #2
0
def test_mi_informative():
    """MI between two identical columns must exceed a quarter of H(y)."""
    n_rows = 100
    x = np.arange(1, n_rows + 1).reshape(-1, 1)
    # y is built independently but holds exactly the same values as x
    y = np.arange(1, n_rows + 1).reshape(-1, 1)
    mi = estimate_mutual_information(x, y)
    h_y = estimate_entropy(y)
    message = 'exact copy columns should have high information'
    assert h_y / 4 < mi, message
Beispiel #3
0
 def test_mi_informative(self):
     """MI between two identical columns must exceed a quarter of H(y)."""
     n_rows = 100
     x = np.arange(1, n_rows + 1).reshape(-1, 1)
     # y is built independently but holds exactly the same values as x
     y = np.arange(1, n_rows + 1).reshape(-1, 1)
     mi = estimate_mutual_information(x, y)
     h_y = estimate_entropy(y)
     lower_bound = h_y / 4
     self.assertGreater(
         mi, lower_bound,
         msg='exact copy columns should have high information')
Beispiel #4
0
 def test_mi_uninformative(self):
     """MI between x and a constant y must stay below a quarter of H(x)."""
     n_rows = 100
     x = np.arange(1, n_rows + 1).reshape(-1, 1)
     # y is the same value in every row
     y = np.ones((n_rows, 1))
     mi = estimate_mutual_information(x, y)
     h_z = estimate_entropy(x)
     upper_bound = h_z / 4
     self.assertGreater(
         upper_bound, mi,
         msg='uninformative column should have no information')
Beispiel #5
0
def test_cmi_high_info_uninformative_z():
    """Conditioning on a constant z should not change the MI estimate."""
    n_rows = 100
    # x and y hold exactly the same values
    x = np.arange(1, n_rows + 1).reshape(-1, 1)
    y = np.arange(1, n_rows + 1).reshape(-1, 1)

    # z is the same value in every row
    useless_z = np.ones((n_rows, 1))
    cmi = estimate_conditional_information(x, y, useless_z)
    mi = estimate_mutual_information(x, y)

    # round() is called without ndigits, so the two estimates only have to
    # agree to the nearest integer
    difference = abs(cmi - mi)
    message = 'uninformative z should not affect mutual information score'
    assert round(difference) == 0, message
Beispiel #6
0
    def test_cmi_high_info_uninformative_z(self):
        """Conditioning on a constant z should not change the MI estimate."""
        # x and y hold exactly the same values
        x = np.reshape(np.arange(1, 101), (-1, 1))
        y = np.reshape(np.arange(1, 101), (-1, 1))

        # z is the same value in every row
        useless_z = np.ones((100, 1))
        cmi = estimate_conditional_information(x, y, useless_z)
        mi = estimate_mutual_information(x, y)
        # The failure message must be passed by keyword: the third positional
        # parameter of assertAlmostEqual is ``places``, so a positional string
        # would raise TypeError instead of reporting a failed assertion.
        self.assertAlmostEqual(
            cmi, mi,
            msg='uninformative z should not affect mutual information score')
Beispiel #7
0
 def judge(self):
     """Decide whether the candidate feature clears the MI threshold.

     The candidate feature's pipeline is fit on ``X_df``/``y_df`` and
     applied to ``X_df_val``; the mutual information between the result
     and ``y_val`` is then compared against ``self.threshold``.

     Returns:
         False if ``_handle_nans`` returns ``(None, None)``, otherwise
         whether the estimated mutual information is strictly greater
         than the threshold.
     """
     logger.info(f'Judging feature using {self}')

     pipeline = self.candidate_feature.as_feature_engineering_pipeline()
     fitted_pipeline = pipeline.fit(self.X_df, y=self.y_df)
     z = asarray2d(fitted_pipeline.transform(self.X_df_val))
     y = asarray2d(self.y_val)

     z, y = self._handle_nans(z, y)
     if z is None and y is None:
         # nans were found and handle_nan_targets == 'fail'
         return False

     mi = estimate_mutual_information(z, y)
     delta = mi - self.threshold
     logger.info(f'Mutual information with target I(Z;Y) is {mi} vs. '
                 f'threshold {self.threshold} ({delta} above threshold)')
     return delta > 0
Beispiel #8
0
def _summarize_feature(
    feature: 'ballet.feature.Feature',
    values: Optional[Dict['ballet.feature.Feature', Optional[np.ndarray]]],
    y: Optional[np.ndarray],
    expensive_stats: bool,
) -> dict:
    """Build a summary record for a single feature.

    The record always carries the feature's metadata. The statistics
    entries start out as NaN and are only filled in when ``values`` and
    ``y`` are given and ``values`` holds non-None values for ``feature``.

    Args:
        feature: the feature to summarize
        values: mapping from feature to its computed values (None where
            values are unavailable); may itself be None
        y: target values; may be None
        expensive_stats: if true, always estimate the conditional mutual
            information; otherwise it is only estimated when the other
            features' values have fewer than
            ``EXPENSIVE_STATS_CMI_MAX_COLS_X`` columns in total

    Returns:
        dict with the feature's metadata and summary statistics
    """
    # Normalize the declared input: a single name becomes a one-element
    # list, a callable input is reported as an empty list, and anything
    # else is passed through unchanged.
    declared_input = feature.input
    if isinstance(declared_input, str):
        input_columns = [declared_input]
    elif callable(declared_input):
        input_columns = []
    else:
        input_columns = declared_input

    result = {
        'name': feature.name,
        'description': feature.description,
        'input': input_columns,
        'transformer': repr(feature.transformer),
        'primitives': get_transformer_primitives(feature.transformer),
        'output': feature.output,
        'author': feature.author,
        'source': feature.source,
    }

    # Statistics default to NaN and stay that way if the feature values
    # or the target are missing.
    result.update(dict.fromkeys(
        (
            'mutual_information',
            'conditional_mutual_information',
            'ninputs',
            'nvalues',
            'ncontinuous',
            'ndiscrete',
            'mean',
            'std',
            'variance',
            'min',
            'median',
            'max',
            'nunique',
        ),
        np.nan,
    ))

    if values is None or y is None:
        return result

    z = values[feature]
    if z is None:
        return result

    # x gathers the available values of every *other* feature, column-wise
    other_values = [
        candidate_values
        for candidate, candidate_values in values.items()
        if candidate_values is not None and candidate is not feature
    ]
    if other_values:
        x = np.concatenate(other_values, axis=1)
    else:
        # no other features: keep the row count, but with zero columns
        x = np.empty((z.shape[0], 0))

    y_clean, z_clean = skipna(y, z, how='left')
    result['mutual_information'] = estimate_mutual_information(
        z_clean, y_clean)

    if not callable(declared_input):
        # one input for a single name, otherwise the number of names
        result['ninputs'] = len(input_columns)

    result.update({
        'nvalues': z.shape[1],
        'ncontinuous': np.sum(_get_cont_columns(z)),
        'ndiscrete': np.sum(_get_disc_columns(z)),
        # per-column statistics are averaged over the columns
        'mean': np.mean(np.mean(z, axis=0)),
        'std': np.mean(np.std(z, axis=0)),
        'variance': np.mean(np.var(z, axis=0)),
        'min': np.min(z),
        'median': np.median(np.median(z, axis=0)),
        'max': np.max(z),
        'nunique': np.mean(countunique(z, axis=0)),
    })

    few_other_columns = x.shape[1] < EXPENSIVE_STATS_CMI_MAX_COLS_X
    if expensive_stats or few_other_columns:
        y_clean, z_clean, x_clean = skipna(y, z, x, how='left')
        result['conditional_mutual_information'] = \
            estimate_conditional_information(z_clean, y_clean, x_clean)

    return result
Beispiel #9
0
def I(a, b, c=None):  # noqa
    """Estimate I(a; b), or I(a; b | c) when a conditioning ``c`` is given."""
    if c is not None:
        return estimate_conditional_information(a, b, c)
    return estimate_mutual_information(a, b)