Esempio n. 1
0
class ParentChildDetectionMetric(
        DetectionMetric, metaclass=NestedAttrsMeta('single_table_metric')):
    """Base class for multi-table detection metrics built on parent-child relationships.

    Every parent-child relationship of the dataset is denormalized into one flat
    table, and a Single Table Detection metric is then applied to the real and
    synthetic versions of that flat table.

    The output of the metric is one minus the average ROC AUC score obtained.

    Apart from the real and synthetic data, these metrics need the list of
    foreign key relationships that exist between the tables.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        single_table_metric (sdmetrics.single_table.detection.base.DetectionMetric):
            The single table detection metric to use.
    """

    single_table_metric = None

    @staticmethod
    def _extract_foreign_keys(metadata):
        """Build the list of foreign key relationships described by ``metadata``.

        Args:
            metadata:
                Either a metadata dict or an object exposing ``to_dict()``.
                Fields holding a truthy ``'ref'`` entry are treated as foreign keys.

        Returns:
            list[tuple]:
                One ``(parent_table, parent_key, child_table, child_key)`` tuple
                per referencing field, in metadata iteration order.
        """
        as_dict = metadata if isinstance(metadata, dict) else metadata.to_dict()

        # Lazily walk every (table, field) pair together with its optional reference.
        candidates = (
            (field_meta.get('ref'), table_name, field_name)
            for table_name, table_meta in as_dict['tables'].items()
            for field_name, field_meta in table_meta['fields'].items()
        )

        return [
            (reference['table'], reference['field'], table_name, field_name)
            for reference, table_name, field_name in candidates
            if reference
        ]

    @staticmethod
    def _denormalize(data, foreign_key):
        """Outer-merge a child table onto its parent and drop the key columns.

        Args:
            data (dict[str, pandas.DataFrame]):
                Tables indexed by name.
            foreign_key (tuple):
                ``(parent_table, parent_key, child_table, child_key)``.

        Returns:
            pandas.DataFrame:
                The merged table without the parent and child key columns.
        """
        parent_table, parent_key, child_table, child_key = foreign_key

        parent = data[parent_table]
        child = data[child_table]
        merged = parent.merge(child, how='outer', left_on=parent_key, right_on=child_key)

        # When both keys share a name there is only one column to remove.
        if child_key == parent_key:
            key_columns = [parent_key]
        else:
            key_columns = [parent_key, child_key]

        for key_column in key_columns:
            del merged[key_column]

        return merged

    @classmethod
    def compute(cls,
                real_data,
                synthetic_data,
                metadata=None,
                foreign_keys=None):
        """Compute this metric.

        Each parent-child relationship is denormalized for both datasets and
        the Single Table Detection metric is applied on the resulting tables.

        The output of the metric is one minus the average ROC AUC score obtained.

        Apart from the real and synthetic data, either a ``foreign_keys`` list
        containing the relationships between the tables or a ``metadata`` that can
        be used to create such a list must be passed.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict. If not passed, foreign keys must be
                passed.
            foreign_keys (list[tuple[str, str, str, str]]):
                List of foreign key relationships specified as tuples
                that contain (parent_table, parent_key, child_table, child_key).
                Ignored if metadata is given.

        Returns:
            float:
                Average of the scores obtained by the single table metric.

        Raises:
            ValueError:
                If no foreign keys are given nor found in the metadata.
        """
        if metadata:
            foreign_keys = cls._extract_foreign_keys(metadata)
        if not foreign_keys:
            raise ValueError('No foreign keys given')

        # Denormalize lazily so each pair is built right before it is scored.
        flat_pairs = (
            (cls._denormalize(real_data, relation), cls._denormalize(synthetic_data, relation))
            for relation in foreign_keys
        )
        scores = [
            cls.single_table_metric.compute(flat_real, flat_synth)
            for flat_real, flat_synth in flat_pairs
        ]

        return np.mean(scores)
Esempio n. 2
0
class MultiColumnPairsMetric(SingleTableMetric,
                             metaclass=NestedAttrsMeta('column_pairs_metric')):
    """SingleTableMetric subclass that applies a ColumnPairsMetric on each possible column pair.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        column_pairs_metric (sdmetrics.column_pairs.base.ColumnPairsMetric):
            ColumnPairsMetric to apply.
        column_pairs_metric_kwargs (dict):
            Keyword arguments forwarded to the ColumnPairs metric on every call.
            ``None`` when the metric is used at class level.
        field_types (dict):
            Field types to which the ColumnPairs metric will be applied.
    """

    column_pairs_metric = None
    column_pairs_metric_kwargs = None
    field_types = None

    def __init__(self, column_pairs_metric, **column_pairs_metric_kwargs):
        """Create an instance bound to the given ColumnPairs metric.

        Args:
            column_pairs_metric:
                ColumnPairs metric to apply on every column pair.
            **column_pairs_metric_kwargs:
                Keyword arguments forwarded to the ColumnPairs metric
                each time it is computed.
        """
        self.column_pairs_metric = column_pairs_metric
        self.column_pairs_metric_kwargs = column_pairs_metric_kwargs
        # Shadow the classmethod so instances compute with their own settings.
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This is done by grouping all the columns that are compatible with the
        underlying ColumnPairs metric in groups of 2 and then evaluating them
        using the ColumnPairs metric.

        The output is the average of the scores obtained, ignoring NaN scores.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric. They take precedence over the
                keyword arguments stored at construction time.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        metadata = self._validate_inputs(real_data, synthetic_data, metadata)

        fields = self._select_fields(metadata, self.field_types)

        # ``column_pairs_metric_kwargs`` is None when called through the
        # classmethod; merging into one dict avoids duplicate-keyword errors.
        metric_kwargs = {**(self.column_pairs_metric_kwargs or {}), **kwargs}

        values = []
        for columns in combinations(fields, r=2):
            pair = list(columns)
            real = real_data[pair]
            synthetic = synthetic_data[pair]
            values.append(
                self.column_pairs_metric.compute(real, synthetic, **metric_kwargs))

        return np.nanmean(values)

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        Args:
            real_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the real dataset.
            synthetic_data (Union[numpy.ndarray, pandas.DataFrame]):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the column pairs metric

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        # The class itself plays the role of ``self`` for class-level usage.
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)

    @classmethod
    def normalize(cls, raw_score):
        """Normalize the `raw_score` by delegating to the parent class.

        The metric is expected to have a lower bound of 0; this is checked
        before delegating.

        Args:
            raw_score (float):
                The value of the metric from `compute`.

        Returns:
            float:
                The normalized value of the metric
        """
        # Internal invariant check, kept as an assert so that the raised
        # exception type stays the same for existing callers.
        assert cls.min_value == 0.0
        return super().normalize(raw_score)
class MultiSingleColumnMetric(SingleTableMetric,
                              metaclass=NestedAttrsMeta('single_column_metric')
                              ):
    """SingleTableMetric subclass that applies a SingleColumnMetric on each column.

    This class can either be used by creating a subclass that inherits from it and
    sets the SingleColumn Metric as the ``single_column_metric`` attribute,
    or by creating an instance of this class passing the underlying SingleColumn
    metric as an argument.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        single_column_metric (sdmetrics.single_column.base.SingleColumnMetric):
            SingleColumn metric to apply.
        single_column_metric_kwargs (dict):
            Keyword arguments forwarded to the SingleColumn metric on every call.
            ``None`` when the metric is used at class level.
        field_types (dict):
            Field types to which the SingleColumn metric will be applied.
    """

    single_column_metric = None
    single_column_metric_kwargs = None
    field_types = None

    def __init__(self,
                 single_column_metric=None,
                 **single_column_metric_kwargs):
        """Create an instance bound to the given SingleColumn metric.

        Args:
            single_column_metric:
                SingleColumn metric to apply on every compatible column.
            **single_column_metric_kwargs:
                Keyword arguments forwarded to the SingleColumn metric
                each time it is computed.
        """
        self.single_column_metric = single_column_metric
        self.single_column_metric_kwargs = single_column_metric_kwargs
        # Shadow the classmethod so instances compute with their own settings.
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This is done by computing the underlying SingleColumn metric to all the
        columns that are compatible with it.

        The output is the average of the scores obtained, ignoring NaN scores.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single column metric. They take precedence over the
                keyword arguments stored at construction time.

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        metadata = self._validate_inputs(real_data, synthetic_data, metadata)

        fields = self._select_fields(metadata, self.field_types)

        # Merge once, outside the loop. Splatting both dicts separately would
        # raise ``TypeError`` whenever the same keyword appears in both.
        metric_kwargs = {**(self.single_column_metric_kwargs or {}), **kwargs}

        scores = []
        for column_name, real_column in real_data.items():
            if column_name not in fields:
                continue

            real_values = real_column.values
            synthetic_values = synthetic_data[column_name].values

            score = self.single_column_metric.compute(
                real_values, synthetic_values, **metric_kwargs)
            scores.append(score)

        return np.nanmean(scores)

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This is done by computing the underlying SingleColumn metric to all the
        columns that are compatible with it.

        The output is the average of the scores obtained.

        Args:
            real_data (pandas.DataFrame):
                The values from the real dataset.
            synthetic_data (pandas.DataFrame):
                The values from the synthetic dataset.
            metadata (dict):
                Table metadata dict.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single column metric

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        # The class itself plays the role of ``self`` for class-level usage.
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)
Esempio n. 4
0
class MultiSingleTableMetric(MultiTableMetric, metaclass=NestedAttrsMeta('single_table_metric')):
    """MultiTableMetric subclass that applies a SingleTableMetric on each table.

    This class can either be used by creating a subclass that inherits from it and
    sets the SingleTable Metric as the ``single_table_metric`` attribute,
    or by creating an instance of this class passing the underlying SingleTable
    metric as an argument.

    Attributes:
        name (str):
            Name to use when reports about this metric are printed.
        goal (sdmetrics.goal.Goal):
            The goal of this metric.
        min_value (Union[float, tuple[float]]):
            Minimum value or values that this metric can take.
        max_value (Union[float, tuple[float]]):
            Maximum value or values that this metric can take.
        single_table_metric (sdmetrics.single_table.base.SingleTableMetric):
            SingleTableMetric to apply.
    """

    single_table_metric = None

    def __init__(self, single_table_metric):
        """Create an instance bound to the given SingleTable metric.

        Args:
            single_table_metric:
                SingleTable metric to apply on every table.
        """
        self.single_table_metric = single_table_metric
        # Shadow the classmethod so instances compute with their own metric.
        self.compute = self._compute

    def _compute(self, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This applies the underlying single table metric to all the tables
        found in the dataset and then returns the average score obtained,
        ignoring NaN scores.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict, or an object exposing ``to_dict()``.
                If not passed, ``None`` is handed to the single table metric
                as the metadata of every table.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single table metric

        Returns:
            Union[float, tuple[float]]:
                Metric output.

        Raises:
            ValueError:
                If the real and synthetic datasets do not contain the same tables.
        """
        if set(real_data.keys()) != set(synthetic_data.keys()):
            raise ValueError('`real_data` and `synthetic_data` must have the same tables')

        if metadata is None:
            # ``type(None)()`` returns None, so any table lookup yields None.
            metadata = {'tables': defaultdict(type(None))}
        elif not isinstance(metadata, dict):
            metadata = metadata.to_dict()

        values = []
        for table_name, real_table in real_data.items():
            synthetic_table = synthetic_data[table_name]
            table_meta = metadata['tables'][table_name]

            # Forward the extra keyword arguments, as ``compute`` documents.
            score = self.single_table_metric.compute(
                real_table, synthetic_table, table_meta, **kwargs)
            values.append(score)

        return np.nanmean(values)

    @classmethod
    def compute(cls, real_data, synthetic_data, metadata=None, **kwargs):
        """Compute this metric.

        This applies the underlying single table metric to all the tables
        found in the dataset and then returns the average score obtained.

        Args:
            real_data (dict[str, pandas.DataFrame]):
                The tables from the real dataset.
            synthetic_data (dict[str, pandas.DataFrame]):
                The tables from the synthetic dataset.
            metadata (dict):
                Multi-table metadata dict, or an object exposing ``to_dict()``.
                If not passed, ``None`` is handed to the single table metric
                as the metadata of every table.
            **kwargs:
                Any additional keyword arguments will be passed down
                to the single table metric

        Returns:
            Union[float, tuple[float]]:
                Metric output.
        """
        # The class itself plays the role of ``self`` for class-level usage.
        return cls._compute(cls, real_data, synthetic_data, metadata, **kwargs)