import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# `mu` (the imputation utilities), MODEL_SCHEMA, and ROW2IDX are assumed to
# be defined elsewhere in this module.


def impute_conf_ax(means, stds, ns, ax=None):
    """Plot synthetic component models and the imputed value/confidence."""
    if ax is None:
        ax = plt.gca()

    models = []
    xs = []
    for mean, std, n in zip(means, stds, ns):
        # Draw synthetic data for one component model.
        x = np.random.randn(n)*std + mean

        # Fill in the sufficient statistics for a single-column model.
        model = copy.deepcopy(MODEL_SCHEMA)
        model['col_suffstats'][0][0]['n'] = n
        model['col_suffstats'][0][0]['sum_x'] = np.sum(x)
        model['col_suffstats'][0][0]['sum_x_sq'] = np.sum(x**2)

        models.append(model)
        xs.append(x)

        sns.distplot(x, hist=False, ax=ax, norm_hist=False, label='model')

    sns.distplot(np.hstack(xs), hist=False, ax=ax, norm_hist=False,
                 kde_kws={'color': 'crimson', 'lw': 3}, label='combined')

    # Impute row 0 of column 0 and mark the imputed value with a dashed line.
    x, conf = mu.impute(0, 0, ROW2IDX*len(models), models, (-20, 10))
    ax.plot([x, x], [0, ax.get_ylim()[1]], lw=2, ls='--', c='black')
    ax.set_title('Confidence = %f' % conf)
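# A minimal usage sketch for `impute_conf_ax`. The argument values and the
# `demo_impute_conf` name below are illustrative assumptions, not from the
# original source: three synthetic component models with increasingly
# separated means, so the reported confidence should drop as they spread.
def demo_impute_conf():
    fig, ax = plt.subplots()
    impute_conf_ax(means=[0.0, 1.0, 4.0], stds=[1.0, 1.0, 1.0],
                   ns=[100, 100, 100], ax=ax)
    plt.show()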
def impute(self, col, rows=None):
    """
    Infer the most likely value and return a measure of the confidence.

    The imputed value is simply the most likely value. Confidence is a
    measure of the agreement between models; it is not an interval. For
    example, the continuous confidence of row `r` in column `c` is
    calculated like so:

    * M is the set of all component models to which `r` in `c` is
      assigned.
    * mu is the set of the means of the models in M.
    * a -> min(mu), b -> max(mu)
    * conf is one minus the integral of the probability of observing a
      value between `a` and `b`.

    The narrower the imputation region, the higher the confidence; the
    confidence is 1 if all of `x`'s component models have the same mean.

    Notes
    -----
    I'm not sure that categorical confidence behaves intuitively in the
    event that the predictive PMF has multiple maximum-probability
    values, e.g. if :code:`PMF = [.4, .4, .2]`.

    Parameters
    ----------
    col : column name
        The column name to impute.
    rows : list(row name), optional
        A list of the rows to impute. If None (default), all missing
        values will be imputed.

    Returns
    -------
    impdata : pandas.DataFrame
        Row-indexed DataFrame with columns for the imputed values and
        the confidence (`conf`) in those values.
    """
    if rows is None:
        # Get row indices where col is null.
        rows = self._df[pd.isnull(self._df[col])].index

    col_idx = self._converters['col2idx'][col]
    row2idx = self._converters['row2idx_sf']

    # FIXME: In the future we'll want a better way to determine
    # optimization bounds for different dtypes. If statements are gross.
    dtype = self._dtypes[col_idx]
    if dtype == 'continuous':
        lower = np.nanmin(self._data[:, col_idx])
        upper = np.nanmax(self._data[:, col_idx])
        bounds = (lower, upper)
    elif dtype == 'categorical':
        bounds = self._converters['valmaps'][col]['val2idx'].values()
    else:
        raise ValueError('Unsupported dtype: {}'.format(dtype))

    impdata = []
    for row in rows:
        x, conf = mu.impute(row, col_idx, row2idx, self._models, bounds)
        if dtype == 'categorical':
            x = self._converters['valmaps'][col]['idx2val'][x]
        impdata.append({col: x, 'conf': conf})

    return pd.DataFrame(impdata, index=rows)
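# The continuous confidence rule described in the docstring above can be
# sketched as follows. This is an illustrative stand-in, not the actual
# `mu.impute` internals: it assumes each component model is summarized by a
# normal predictive with a known mean, standard deviation, and weight.
from scipy import stats


def _continuous_confidence_sketch(means, stds, weights):
    """Sketch: one minus the mixture probability mass between the smallest
    and largest component means (`a` and `b` in the docstring above)."""
    a, b = min(means), max(means)
    weights = np.asarray(weights, dtype=float)
    weights /= weights.sum()
    # P(a < X < b) under the weighted mixture of component predictives.
    mass = sum(w * (stats.norm.cdf(b, m, s) - stats.norm.cdf(a, m, s))
               for w, m, s in zip(weights, means, stds))
    return 1.0 - mass

# When all component means coincide, a == b, the mass between them is zero,
# and the confidence is exactly 1, matching the docstring's edge case.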
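# A hedged usage example for `impute`, assuming an engine object that
# exposes the method above over a DataFrame with a continuous column named
# 'x' (both `engine` and 'x' are hypothetical names):
#
#     imputed = engine.impute('x')    # impute every missing cell in 'x';
#     print(imputed)                  # imputed values plus a 'conf' column
#
#     imputed = engine.impute('x', rows=[3, 17])  # impute specific rows only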