Example #1
0
def evaluate(synthetic_data,
             real_data=None,
             metadata=None,
             root_path=None,
             table_name=None,
             metrics=None,
             aggregate=True):
    """Apply multiple metrics at once.

    Args:
        synthetic_data (dict[str, pandas.DataFrame] or pandas.DataFrame):
            Map of names and tables of synthesized data. When evaluating a single table,
            a single ``pandas.DataFrame`` can be passed alone.
        real_data (dict[str, pandas.DataFrame] or pandas.DataFrame):
            Map of names and tables of real data. When evaluating a single table,
            a single ``pandas.DataFrame`` can be passed alone.
        metadata (str, dict, Metadata or None):
            Metadata instance or details needed to build it.
        root_path (str):
            Relative path to find the metadata.json file when needed.
        table_name (str):
            Table name to be evaluated, only used when ``synthetic_data`` is a
            ``pandas.DataFrame`` and ``real_data`` is ``None``.
        metrics (list[str]):
            List of metric names to apply.
        aggregate (bool):
            Whether to compute the mean of all the scores to return a single
            float value, or return a ``dict`` containing the score that each
            metric obtained. Defaults to ``True``.

    Return:
        float or sdmetrics.MetricsReport
    """
    metrics, modality = _select_metrics(synthetic_data, metrics)

    synthetic_data, real_data, metadata = _validate_arguments(
        synthetic_data, real_data, metadata, root_path, table_name)

    if modality == 'single-table':
        # Unwrap the single table so the metrics receive plain DataFrames
        # and the table-level metadata instead of the multi-table mapping.
        table = next(iter(metadata['tables']))
        metadata = metadata['tables'][table]
        real_data = real_data[table]
        synthetic_data = synthetic_data[table]

    scores = sdmetrics.compute_metrics(metrics,
                                       real_data,
                                       synthetic_data,
                                       metadata=metadata)
    scores.dropna(inplace=True)

    if aggregate:
        # Metrics with unbounded (-inf, inf) score ranges would dominate the
        # mean, so squash them into (-1, 1) with tanh before averaging.
        infinites = (scores.min_value == -np.inf) & (scores.max_value
                                                     == np.inf)
        scores.loc[infinites, 'score'] = np.tanh(scores.loc[infinites,
                                                            'score'])

        return scores.score.mean()

    return scores
Example #2
0
def test_compute_all():
    """Every multi-table metric scores the demo data within its declared bounds."""
    real_data, synthetic_data, metadata = load_multi_table_demo()

    results = compute_metrics(
        MultiTableMetric.get_subclasses(),
        real_data,
        synthetic_data,
        metadata=metadata,
    )

    # At least one metric must have produced a numeric score.
    assert not pd.isna(results.raw_score.mean())

    # Every computed score must lie inside the metric's [min_value, max_value] range.
    scored = results[results.raw_score.notna()]
    assert scored.raw_score.between(scored.min_value, scored.max_value).all()
Example #3
0
def test_compute_all():
    """Every time-series metric scores the demo data within its declared bounds."""
    real_data, synthetic_data, metadata = load_timeseries_demo()

    results = compute_metrics(TimeSeriesMetric.get_subclasses(),
                              real_data,
                              synthetic_data,
                              metadata=metadata)

    # At least one metric must have produced a numeric score.
    assert not pd.isna(results.score.mean())

    # Every computed score must lie inside the metric's [min_value, max_value] range.
    scored = results[results.score.notna()]

    assert scored.score.between(scored.min_value, scored.max_value).all()
def test_compute_all():
    """Every single-table metric scores the demo data within bounds, raw and normalized."""
    real_data, synthetic_data, metadata = load_single_table_demo()

    results = compute_metrics(SingleTableMetric.get_subclasses(),
                              real_data,
                              synthetic_data,
                              metadata=metadata)

    # At least one metric must have produced a numeric raw score.
    assert not pd.isna(results.raw_score.mean())

    # Raw scores must respect each metric's declared [min_value, max_value] range.
    raw_scored = results[results.raw_score.notna()]
    assert raw_scored.raw_score.between(raw_scored.min_value, raw_scored.max_value).all()

    # Normalized scores must always fall inside [0, 1].
    norm_scored = results[results.normalized_score.notna()]
    assert norm_scored.normalized_score.between(0.0, 1.0).all()