def mixed_data_features(df, add_nans=False):
    """Extracts the numerical/categorical feature parameters from a dataset.

    Parameters
    ----------
    df : pandas DataFrame
        The dataset with all the features to analyze (both numerical and
        categorical).
    add_nans : bool
        If True the sampler adds a "NaNs" category for the categorical
        features that have any null values and assigns it the appropriate
        fraction.

    Returns
    -------
    dict of dicts
        A dictionary with an entry per dataset feature (dictionary keys are
        the column names), where each numerical feature entry contains a
        nested dictionary with the values of the minimum and maximum values
        of the dynamic range of the dataset, as well as the mean and sigma
        of the distribution, and each categorical feature entry contains a
        nested dictionary with its categories and the fraction of each
        category present in the analyzed dataset (nested dictionary keys are
        "min", "max", "mean", "sigma", and "categories", which is also a
        dictionary with one entry per category).
    """

    def _feature_parameters(name):
        # Dispatch on feature type: categorical columns get category
        # fractions, numerical ones get their dynamic-range statistics.
        single_column = df[[name]]
        if is_discrete(df[name]):
            return find_categories(single_column, add_nans=add_nans)[name]
        return dynamical_range(single_column)[name]

    return {name: _feature_parameters(name) for name in df}
def compute_conditional_metric(
    grouping_col,
    true_labs,
    pred_labs,
    metric,
    as_categorical=False,
    num_bins=10,
    quantile=False,
):
    """Compute metric values conditional on the grouping column.

    The metric is computed within unique values of the grouping column
    (categorical) or within bins partitioning its range (continuous).

    Parameters
    ----------
    grouping_col : Series
        Series defining a grouping for the metric computation.
    true_labs : Series
        Series of true labels for a test dataset.
    pred_labs : Series
        Series of labels predicted by a model for a test dataset.
    metric : function
        The evaluation metric to compute across the groupings. This should be
        a function f(y_true, y_pred) which accepts Series of true and
        predicted labels.
    as_categorical : bool
        Should the grouping column be treated as categorical, ie. binned
        on its unique values? If it is not numeric, this param is ignored.
    num_bins : int
        Number of bins to use for grouping a numeric column.
    quantile : bool
        Should the bin widths correspond to quantiles of a numerical
        column's distribution (`True`) or be equally-spaced over its range
        (`False`).

    Returns
    -------
    ConditionalMetricResult
    """
    labels = DataFrame({"y_true": true_labs, "y_pred": pred_labs})
    # Non-numeric grouping columns are always treated as categorical,
    # regardless of what the caller requested.
    column_is_discrete = is_discrete(grouping_col)
    as_categorical = as_categorical or column_is_discrete
    if as_categorical:
        groups = grouping_col
        group_bins = groups.unique()
    else:
        groups, group_bins = get_bins(grouping_col, num_bins, quantile)

    def _metric_for_group(group):
        return metric(group["y_true"], group["y_pred"])

    metric_by_group = labels.groupby(groups).apply(_metric_for_group)
    return ConditionalMetricResult(
        vals=metric_by_group,
        bins=Series(group_bins),
        categorical=as_categorical,
        num_bins=num_bins,
        quantile=quantile,
    )
def find_categories(df, add_nans=False):
    """Returns the categories of the dataset features.

    Parameters
    ----------
    df : pandas DataFrame
        The dataset with all the categorical features to analyze.
    add_nans : bool
        If True the sampler adds a "NaNs" category for the features that
        have any null values and assigns it the appropriate fraction.

    Returns
    -------
    dict of dicts
        A dictionary with an entry per dataset feature (dictionary keys are
        the column names), where each feature entry contains a nested
        dictionary with its categories and the fraction of each category
        present in the analyzed dataset (the nested dictionary key for this
        information is "categories", which is also a dictionary with one
        entry per category).
    """
    categories_dict = {}
    for feature in df:
        # Non-categorical columns are skipped entirely.
        if not is_discrete(df[feature]):
            continue
        # Remove NaN values from selection
        df_no_nans = df[df[feature].notnull()]
        # Log fraction of NaN values if required. With add_nans the
        # fractions are taken over the full column length so that the
        # "NaNs" entry and the category entries sum to 1; otherwise they
        # are normalized over the non-null values only.
        if add_nans:
            nan_fraction = df[feature].isnull().sum() / len(df)
            total_length = len(df)
        else:
            nan_fraction = 0
            total_length = len(df_no_nans)
        # Compute the counts once per feature (previously value_counts()
        # was recomputed inside the loop for every category, O(k*n)).
        counts = df_no_nans[feature].value_counts()
        categories_dict[feature] = {
            "categories": {
                category: counts[category] / total_length
                for category in df_no_nans[feature].unique().tolist()
            }
        }
        if add_nans and nan_fraction != 0:
            categories_dict[feature]["categories"]["NaNs"] = nan_fraction
    return categories_dict
def compute(self, **kwargs):
    """Compute the evaluation for the given datasets.

    Parameters
    ----------
    kwargs:
        On-the-fly overrides to the config option values for the computation.

    Returns
    -------
    SpatialDistributionResult
    """
    eval_config = PrescConfig(self._config)["evaluations"]["spatial_distribution"]
    if kwargs:
        eval_config.set(kwargs)

    # Feature columns to include in the distance computation.
    feats = include_exclude_list(
        self._test_dataset.feature_names,
        included=eval_config["features_include"].get(),
        excluded=eval_config["features_exclude"].get(),
    )

    # Partition the selected features by type.
    num_feats = []
    categ_feats = []
    for col in feats:
        target = categ_feats if is_discrete(self._test_dataset.features[col]) else num_feats
        target.append(col)

    # Figure the metric to use for each feature.
    dist_metrics_num, dist_metrics_categ = _get_distance_metrics_by_column(
        num_feats, categ_feats, eval_config
    )

    return compute_spatial_distribution(
        test_features=self._test_dataset.features,
        test_labs_true=self._test_dataset.labels,
        test_labs_pred=self._test_pred,
        base_features=self._train_dataset.features,
        base_labs=self._train_dataset.labels,
        numerical_dist_metric=dist_metrics_num,
        categorical_dist_metric=dist_metrics_categ,
        summary=eval_config["summary_agg"].get(),
    )
def compute_conditional_distribution(
    data_col, true_labs, pred_labs, as_categorical=False, binning="fd", common_bins=True
):
    """Compute a distributional summary.

    The metric is computed within unique values of the grouping column
    (categorical) or within bins partitioning its range (continuous).

    Parameters
    ----------
    data_col :
        A column of data from a test dataset.
    true_labs : Series
        A series of true labels for the test dataset.
    pred_labs : Series
        A series of labels predicted by a model for the test dataset.
    as_categorical : bool
        Should the data column be treated as categorical, ie. binned
        on its unique values? If it is not numeric, this param is ignored.
    binning : str
        Binning scheme to use for a numerical column, passed to
        `numpy.histogram`. Can be a fixed number of bins or a string
        indicating a binning scheme
    common_bins : bool
        Should the bins be computed over the entire column and shared
        across groups (`True`) or computed within each group (`False`)

    Returns
    -------
    ConditionalDistributionResult
    """
    # Groupings are (true label, predicted label), optionally extended
    # with the data values themselves in the categorical case.
    grouping = [true_labs, pred_labs]
    # Non-numeric columns are always handled as categorical.
    if is_discrete(data_col):
        as_categorical = True
    if as_categorical:
        # Categorical case: counts per (y_true, y_pred, value) triple.
        grouping.append(data_col)
        distribs = data_col.groupby(grouping).size()
        if common_bins:
            # Extend the index in each label group to include all data values
            data_vals = distribs.index.get_level_values(-1).unique()
            y_vals = distribs.index.droplevel(-1).unique()
            # Cartesian product of label pairs and data values, so every
            # group reports a count (0 if absent) for every value.
            full_ind = MultiIndex.from_tuples(
                [(yt, yp, x) for yt, yp in y_vals.values for x in data_vals],
                names=distribs.index.names,
            )
            distribs = distribs.reindex(index=full_ind, fill_value=0)
            bin_edges = Series(data_vals)
        else:
            # Convert the innermost index level to a Series of bin edges.
            bin_edges = distribs.rename(None).reset_index(level=-1).iloc[:, 0]
    else:
        # Numerical case: histogram the column within each label group.
        if common_bins:
            # Precompute shared edges over the full column so every group
            # is histogrammed on the same bins.
            bins = histogram_bin_edges(data_col, bins=binning)
        else:
            # Let numpy.histogram derive edges per group from the scheme.
            bins = binning
        # distribs will be a series with values (<hist_values>, <bin_edges>)
        distribs = data_col.groupby(grouping).apply(lambda x: histogram(x, bins=bins))
        bin_edges = distribs.map(lambda x: x[1])
        # Rebuild a flat index of (y_true, y_pred, bin_label) tuples, one
        # entry per histogram bin in each group.
        bin_ind_tuples = []
        for y in distribs.index:
            bin_ind_tuples.extend(
                [(y[0], y[1], x) for x in _histogram_bin_labels(bin_edges.loc[y])]
            )
        index_with_bins = MultiIndex.from_tuples(
            bin_ind_tuples, names=distribs.index.names + [None]
        )
        # Explode the per-group count arrays into one long Series aligned
        # with the bin-labelled index.
        distribs = Series(
            distribs.map(lambda x: x[0]).explode().values, index=index_with_bins
        )
        if common_bins:
            # Retain the unique bin edges as an array
            bin_edges = Series(bin_edges.iloc[0])
    return ConditionalDistributionResult(
        vals=distribs,
        bins=Series(bin_edges),
        categorical=as_categorical,
        binning=binning,
        common_bins=common_bins,
    )