Example #1
0
def impute_conf_ax(means, stds, ns, ax=None):
    """Plot simulated component densities with the imputed value and its
    confidence.

    For every ``(mean, std, n)`` triple, ``n`` normal samples are drawn, a
    deep copy of ``MODEL_SCHEMA`` is filled with the sample's sufficient
    statistics (``n``, ``sum_x``, ``sum_x_sq``) and the sample's density is
    drawn. The density of all samples pooled together is drawn in crimson.
    ``mu.impute`` is then called on the simulated models; the value it returns
    is marked with a dashed vertical line and the confidence it returns is
    written into the axes title.

    Parameters
    ----------
    means : iterable(float)
        Mean of the normal sample generated for each model.
    stds : iterable(float)
        Standard deviation of the normal sample generated for each model.
    ns : iterable(int)
        Number of samples generated for each model.
    ax : matplotlib axes, optional
        Axes to draw on. If None (default), the axes returned by seaborn's
        plotting call are used for the line and the title.
    """
    models = []
    xs = []
    for mean, std, n in zip(means, stds, ns):
        x = np.random.randn(n)*std + mean
        model = copy.deepcopy(MODEL_SCHEMA)
        suffstats = model['col_suffstats'][0][0]
        suffstats['n'] = n
        suffstats['sum_x'] = np.sum(x)
        suffstats['sum_x_sq'] = np.sum(x**2)

        models.append(model)
        xs.append(x)

        # distplot returns the axes it drew on. Rebinding `ax` keeps every
        # later call on that same axes and makes the `ax=None` default usable
        # (previously `ax.plot` below failed on None).
        ax = sns.distplot(x, hist=False, ax=ax, norm_hist=False,
                          label='model')

    ax = sns.distplot(np.hstack(xs), hist=False, ax=ax, norm_hist=False,
                      kde_kws={'color': 'crimson', 'lw': 3}, label='combined')

    # NOTE(review): (-20, 10) is presumably the search interval handed to the
    # imputation routine -- confirm against mu.impute.
    x, conf = mu.impute(0, 0, ROW2IDX*len(models), models, (-20, 10))

    # Dashed vertical line from the baseline to the current top of the y-axis.
    ax.plot([x, x], [0, ax.get_ylim()[1]], lw=2, ls='--', c='black')
    ax.set_title('Confidence = %f' % conf)
Example #2
0
    def impute(self, col, rows=None):
        """ Infer the most likely value and return a measure of the
        confidence.

        The imputed value is simply the most likely value.

        Confidence is a measure of the agreement between models; it is not an
        interval. For example, the continuous confidence of row `r` in column
        `c` is calculated like so:

            * M is the set of all component models to which `r` in `c` is
              assigned.
            * mu is the set of the means of the models in M
            * a -> min(mu), b -> max(mu)
            * conf is one minus the integral of the probability of observing
              a value between `a` and `b`.

        The confidence is higher the narrower the imputation region and is 1
        if all of `x`'s component models have the same mean.

        Notes
        -----
        It is not clear that categorical confidence behaves intuitively when
        the predictive PMF has several maximum-probability values, for
        example if :code:`PMF = [.4, .4, .2]`.

        Parameters
        ----------
        col : column name
            The column name to impute
        rows : list(row name), optional
            A list of the rows to impute. If None (default), all missing values
            will be imputed.

        Returns
        -------
        impdata : pandas.DataFrame
            Row-indexed DataFrame with a column for the imputed values and a
            column for the confidence (`conf`) in those values.

        Raises
        ------
        ValueError
            If the dtype of `col` is neither 'continuous' nor 'categorical'.
        """

        if rows is None:
            # Default to every row whose entry in `col` is null.
            is_missing = pd.isnull(self._df[col])
            rows = self._df[is_missing].index

        col_idx = self._converters['col2idx'][col]
        row2idx = self._converters['row2idx_sf']

        # FIXME: In the future we'll want a better way to determine
        # optimization bounds for different dtypes. If statements are gross.
        dtype = self._dtypes[col_idx]
        if dtype == 'categorical':
            # Categorical columns are searched over their integer codes.
            bounds = self._converters['valmaps'][col]['val2idx'].values()
        elif dtype == 'continuous':
            # Continuous columns are searched over the observed, non-NaN range.
            observed = self._data[:, col_idx]
            bounds = (np.nanmin(observed), np.nanmax(observed))
        else:
            raise ValueError('Unsupported dtype: {}'.format(dtype))

        records = []
        for row in rows:
            value, conf = mu.impute(
                row, col_idx, row2idx, self._models, bounds)
            if dtype == 'categorical':
                # Translate the integer code back into the original value.
                value = self._converters['valmaps'][col]['idx2val'][value]
            records.append({col: value, 'conf': conf})

        return pd.DataFrame(records, index=rows)
Example #3
0
    def impute(self, col, rows=None):
        """ Impute the most likely value for entries of a column and return
        it together with a measure of confidence.

        The imputed value is simply the most likely value.

        Confidence measures how well the models agree with each other; it is
        not an interval. For example, the continuous confidence of row `r` in
        column `c` is calculated like so:

            * M is the set of all component models to which `r` in `c` is
              assigned.
            * mu is the set of the means of the models in M
            * a -> min(mu), b -> max(mu)
            * conf is one minus the integral of the probability of observing
              a value between `a` and `b`.

        The narrower the imputation region, the higher the confidence; it is
        1 if all of `x`'s component models have the same mean.

        Notes
        -----
        Categorical confidence may not behave intuitively when the predictive
        PMF has multiple maximum-probability values, for example if
        :code:`PMF = [.4, .4, .2]`.

        Parameters
        ----------
        col : column name
            The column name to impute
        rows : list(row name), optional
            A list of the rows to impute. If None (default), all missing values
            will be imputed.

        Returns
        -------
        impdata : pandas.DataFrame
            Row-indexed DataFrame with a column for the imputed values and a
            column for the confidence (`conf`) in those values.

        Raises
        ------
        ValueError
            If the dtype of `col` is neither 'continuous' nor 'categorical'.
        """

        if rows is None:
            # No rows requested: use the index of the rows where col is null
            null_mask = pd.isnull(self._df[col])
            rows = self._df[null_mask].index

        col_idx = self._converters['col2idx'][col]
        row2idx = self._converters['row2idx_sf']

        # FIXME: In the future we'll want a better way to determine
        # optimization bounds for different dtypes. If statements are gross.
        dtype = self._dtypes[col_idx]
        if dtype not in ('continuous', 'categorical'):
            raise ValueError('Unsupported dtype: {}'.format(dtype))

        categorical = dtype == 'categorical'
        if categorical:
            bounds = self._converters['valmaps'][col]['val2idx'].values()
        else:
            # Smallest and largest non-NaN values in this column of the data
            lower = np.nanmin(self._data[:, col_idx])
            upper = np.nanmax(self._data[:, col_idx])
            bounds = (lower, upper)

        def impute_row(row):
            # Build the output record (imputed value + confidence) for one row
            x, conf = mu.impute(row, col_idx, row2idx, self._models, bounds)
            if categorical:
                # Map the imputed index back to the original value
                x = self._converters['valmaps'][col]['idx2val'][x]
            return {col: x, 'conf': conf}

        impdata = [impute_row(row) for row in rows]

        return pd.DataFrame(impdata, index=rows)