class ParentChildDetectionMetric(
        DetectionMetric, metaclass=NestedAttrsMeta('single_table_metric')):
    """Base class for Multi-table Detection metrics based on parent-child relationships.

    These metrics denormalize the parent-child relationships from the dataset and then
    apply a Single Table Detection metric on the resulting tables.

    The output of the metric is one minus the average ROC AUC score obtained.

    Apart from the real and synthetic data, these metrics need to be passed a list
    with the foreign key relationships that exist between the tables.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        single_table_metric (sdmetrics.single_table.detection.base.DetectionMetric):
            The single table detection metric to use.
    """

    single_table_metric = None

    @staticmethod
    def _extract_foreign_keys(metadata):
        """Extract the foreign key relationships from a multi-table metadata.

        Args:
            metadata (Union[dict, Metadata]):
                Multi-table metadata dict, or an object exposing a ``to_dict`` method
                that produces one.

        Returns:
            list[tuple[str, str, str, str]]:
                One ``(parent_table, parent_key, child_table, child_key)`` tuple per
                field that declares a ``ref`` entry pointing at a parent table.
        """
        if not isinstance(metadata, dict):
            metadata = metadata.to_dict()

        foreign_keys = []
        for child_table, child_meta in metadata['tables'].items():
            for child_key, field_meta in child_meta['fields'].items():
                ref = field_meta.get('ref')
                if ref:
                    foreign_keys.append(
                        (ref['table'], ref['field'], child_table, child_key))

        return foreign_keys

    @staticmethod
    def _denormalize(data, foreign_key):
        """Denormalize the child table over the parent.

        The parent and child tables are outer-joined on the foreign key and the key
        columns are then dropped, since they only encode the relationship itself.

        Args:
            data (dict[str, pandas.DataFrame]):
                Tables of the dataset, keyed by table name.
            foreign_key (tuple[str, str, str, str]):
                ``(parent_table, parent_key, child_table, child_key)`` relationship.

        Returns:
            pandas.DataFrame:
                The flattened table, without the key columns.
        """
        parent_table, parent_key, child_table, child_key = foreign_key
        flat = data[parent_table].merge(
            data[child_table], how='outer', left_on=parent_key, right_on=child_key)

        del flat[parent_key]
        if child_key != parent_key:
            del flat[child_key]

        return flat

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, foreign_keys=None):
        """Compute this metric.

        This denormalizes the parent-child relationships from the dataset and then
        applies a Single Table Detection metric on the resulting tables.

        The output of the metric is one minus the average ROC AUC score obtained.

        Apart from the real and synthetic data, either a ``foreign_keys`` list
        containing the relationships between the tables or a ``metadata`` that can
        be used to create such a list must be passed.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict. If not passed, foreign keys must be passed.
            foreign_keys (list[tuple[str, str, str, str]]):
                List of foreign key relationships specified as tuples
                that contain (parent_table, parent_key, child_table, child_key).
                Ignored if ``metadata`` is given.

        Returns:
            float:
                Average of the scores obtained by the single table metric.

        Raises:
            ValueError:
                If neither ``metadata`` nor ``foreign_keys`` is given, or if no
                foreign keys could be extracted from the given metadata.
        """
        if metadata:
            foreign_keys = cls._extract_foreign_keys(metadata)
        if not foreign_keys:
            raise ValueError('No foreign keys given')

        scores = []
        for foreign_key in foreign_keys:
            real = cls._denormalize(real_data, foreign_key)
            synth = cls._denormalize(synthetic_data, foreign_key)
            scores.append(cls.single_table_metric.compute(real, synth))

        return np.mean(scores)
class MultiColumnPairsMetric(SingleTableMetric,
                             metaclass=NestedAttrsMeta('column_pairs_metric')):
    """SingleTableMetric subclass that applies a ColumnPairsMetric on each possible column pair.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairsMetric to apply.
        column_pairs_metric_kwargs (dict):
            Keyword arguments to pass to the ColumnPairsMetric on every call.
        field_types (dict):
            Field types to which the SingleColumn metric will be applied.
    """

    column_pairs_metric = None
    column_pairs_metric_kwargs = None
    field_types = None

    def __init__(self, column_pairs_metric, **column_pairs_metric_kwargs):
        self.column_pairs_metric = column_pairs_metric
        self.column_pairs_metric_kwargs = column_pairs_metric_kwargs
        # Shadow the classmethod so instances dispatch to the bound implementation.
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This is done by grouping all the columns that are compatible with the
        underlying ColumnPairs metric in groups of 2 and then evaluating them
        using the ColumnPairs metric.

        The output is the average of the scores obtained.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        metadata = self._validate_inputs(real_data, synthetic_data, metadata)

        fields = self._select_fields(metadata, self.field_types)

        values = []
        for columns in combinations(fields, r=2):
            real = real_data[list(columns)]
            synthetic = synthetic_data[list(columns)]
            # Forward both the stored metric kwargs and any per-call kwargs;
            # previously they were accepted but silently dropped.
            values.append(self.column_pairs_metric.compute(
                real, synthetic,
                **(self.column_pairs_metric_kwargs or {}), **kwargs))

        return np.nanmean(values)

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)

    @classmethod
    def normalize(cls, raw_score):
        """Return the `raw_score` as is, since it is already normalized.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric.
        """
        assert cls.min_value == 0.0
        return super().normalize(raw_score)
class MultiSingleColumnMetric(
        SingleTableMetric, metaclass=NestedAttrsMeta('single_column_metric')):
    """SingleTableMetric subclass that applies a SingleColumnMetric on each column.

    This class can either be used by creating a subclass that inherits from it and
    sets the SingleColumn Metric as the ``single_column_metric`` attribute,
    or by creating an instance of this class passing the underlying
    SingleColumn metric as an argument.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        single_column_metric (sdmetrics.single_column.base.SingleColumnMetric):
            SingleColumn metric to apply.
        field_types (dict):
            Field types to which the SingleColumn metric will be applied.
    """

    single_column_metric = None
    single_column_metric_kwargs = None
    field_types = None

    def __init__(self, single_column_metric=None, **single_column_metric_kwargs):
        self.single_column_metric = single_column_metric
        self.single_column_metric_kwargs = single_column_metric_kwargs
        # Bind the instance-level implementation over the classmethod of the
        # same name so calls on an instance use the configured metric.
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        The underlying SingleColumn metric is evaluated on every column that is
        compatible with it, and the resulting scores are averaged.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single column metric.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        metadata = self._validate_inputs(real_data, synthetic_data, metadata)
        compatible_fields = self._select_fields(metadata, self.field_types)
        stored_kwargs = self.single_column_metric_kwargs or {}

        scores = []
        for column_name, real_column in real_data.items():
            # Skip columns whose field type is not supported by the metric.
            if column_name not in compatible_fields:
                continue

            scores.append(self.single_column_metric.compute(
                real_column.values,
                synthetic_data[column_name].values,
                **stored_kwargs,
                **kwargs,
            ))

        return np.nanmean(scores)

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        The underlying SingleColumn metric is evaluated on every column that is
        compatible with it, and the resulting scores are averaged.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single column metric.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)
class MultiSingleTableMetric(
        MultiTableMetric, metaclass=NestedAttrsMeta('single_table_metric')):
    """MultiTableMetric subclass that applies a SingleTableMetric on each table.

    This class can either be used by creating a subclass that inherits from it and
    sets the SingleTable Metric as the ``single_table_metric`` attribute,
    or by creating an instance of this class passing the underlying
    SingleTable metric as an argument.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        single_table_metric (sdmetrics.single_table.base.SingleTableMetric):
            SingleTableMetric to apply.
    """

    single_table_metric = None

    def __init__(self, single_table_metric):
        self.single_table_metric = single_table_metric
        # Shadow the classmethod so instances dispatch to the bound implementation.
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This applies the underlying single table metric to all the tables
        found in the dataset and then returns the average score obtained.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict. If not passed, it is built based on the
                real_data fields and dtypes.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single table metric.

        Returns:
            Union[float, tuple[float]]:
                Metric output.

        Raises:
            ValueError:
                If the real and synthetic datasets do not contain the same tables.
        """
        if set(real_data.keys()) != set(synthetic_data.keys()):
            raise ValueError('`real_data` and `synthetic_data` must have the same tables')

        if metadata is None:
            # defaultdict(type(None)) yields None for any table name, letting the
            # single table metric build its own metadata per table.
            metadata = {'tables': defaultdict(type(None))}
        elif not isinstance(metadata, dict):
            metadata = metadata.to_dict()

        values = []
        for table_name, real_table in real_data.items():
            synthetic_table = synthetic_data[table_name]
            table_meta = metadata['tables'][table_name]

            # Forward per-call kwargs; previously the classmethod forwarded them
            # to a signature without **kwargs, raising TypeError.
            score = self.single_table_metric.compute(
                real_table, synthetic_table, table_meta, **kwargs)
            values.append(score)

        return np.nanmean(values)

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This applies the underlying single table metric to all the tables
        found in the dataset and then returns the average score obtained.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict. If not passed, it is built based on the
                real_data fields and dtypes.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single table metric.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)