def independent_synthesis(json = 'adult'):
    """Fit an IndependentSynthesizer on a dataset and sample 10 records.

    The sample is printed, written to ``test.txt`` as comma-separated
    values, and returned.

    NOTE(review): the parameter name ``json`` shadows the stdlib module;
    kept as-is because it is part of the caller-visible interface.
    """
    table, cat_cols, ord_cols = load_dataset(json)
    model = IndependentSynthesizer()
    model.fit(table, cat_cols, ord_cols)
    rows = model.sample(10)
    print(rows)
    np.savetxt('test.txt', rows, delimiter=',')
    return rows
def independent_benchmark(json = 'adult'):
    """Benchmark an IndependentSynthesizer on one dataset.

    Loads the benchmark split, fits the synthesizer on the train part,
    samples 300 records and returns the ``evaluate`` scores.
    """
    train, test, meta, categoricals, ordinals = load_dataset(json, benchmark=True)
    model = IndependentSynthesizer()
    model.fit(train, categoricals, ordinals)
    synthetic = model.sample(300)
    print('Sampled Data for 300 records\n')
    result = evaluate(train, test, synthetic, meta)
    print('\nEvaluation Scores from evaluate function:\n')
    return result
# Example #3
def compute_benchmark(synthesizer, datasets=DEFAULT_DATASETS, iterations=3):
    """Compute the scores of a synthesizer over a list of datasets.

    The results are returned in a raw format as a ``pandas.DataFrame`` containing:
        - One row for each dataset+scoring method (for example, a classifier)
        - One column for each computed metric
        - The columns:
            - dataset
            - distance
            - name (of the scoring method)
            - iteration

    For example, evaluating a synthesizer on the ``adult`` and ``asia`` datasets with 2
    iterations produces a table similar to this::

        dataset             name  iter  distance  accuracy    f1  syn_likelihood  test_likelihood
          adult  DecisionTree...     0       0.0      0.79  0.65             NaN              NaN
          adult      AdaBoost...     0       0.0      0.85  0.67             NaN              NaN
          adult      Logistic...     0       0.0      0.79  0.66             NaN              NaN
          adult           MLP...     0       0.0      0.84  0.67             NaN              NaN
          adult  DecisionTree...     1       0.0      0.80  0.66             NaN              NaN
          adult      AdaBoost...     1       0.0      0.86  0.68             NaN              NaN
          adult      Logistic...     1       0.0      0.79  0.65             NaN              NaN
          adult           MLP...     1       0.0      0.84  0.64             NaN              NaN
           asia     Bayesian ...     0       0.0       NaN   NaN           -2.23            -2.24
           asia     Bayesian ...     1       0.0       NaN   NaN           -2.23            -2.24
    """
    all_scores = []
    for dataset_name in datasets:
        LOGGER.info('Evaluating dataset %s', dataset_name)
        train, test, meta, categoricals, ordinals = load_dataset(
            dataset_name, benchmark=True)

        for iteration in range(iterations):
            try:
                start = timer()
                synthesized = synthesizer(train, categoricals, ordinals)
                elapsed = timer() - start

                scores = compute_scores(train, test, synthesized, meta)
                scores['dataset'] = dataset_name
                scores['iteration'] = iteration
                scores['exec_time (s)'] = elapsed
                all_scores.append(scores)
            except Exception:
                # A failing synthesizer/scorer should not abort the whole
                # benchmark run; log it and move on to the next iteration.
                LOGGER.exception(
                    'Error computing scores for %s on dataset %s - iteration %s',
                    _get_synthesizer_name(synthesizer), dataset_name,
                    iteration)

    return pd.concat(all_scores, sort=False)
# Example #4
def benchmark(synthesizer, datasets=DEFAULT_DATASETS, repeat=3):
    """Evaluate ``synthesizer`` on every dataset, ``repeat`` times each.

    Returns one concatenated ``pandas.DataFrame`` with ``dataset`` and
    ``iter`` columns added to each score row.
    """
    scored = []
    for name in datasets:
        LOGGER.info('Evaluating dataset %s', name)
        train, test, meta, categoricals, ordinals = load_dataset(
            name, benchmark=True)

        for run_idx in range(repeat):
            synthetic = synthesizer(train, categoricals, ordinals)
            run_scores = evaluate(train, test, synthetic, meta)
            run_scores['dataset'] = name
            run_scores['iter'] = run_idx
            scored.append(run_scores)

    return pd.concat(scored)
def benchCLBNSynthesizer():
    """Benchmark a CLBNSynthesizer on the 'adult' dataset.

    Fits on the benchmark train split, samples 300 records, prints the
    ``evaluate`` scores and writes them to ``CLBNBench.csv`` with a
    ``Synth`` column identifying the synthesizer.
    """
    from sdgym.synthesizers import CLBNSynthesizer
    from sdgym.evaluate import evaluate

    from sdgym.data import load_dataset
    train, test, meta, categoricals, ordinals = load_dataset('adult',
                                                             benchmark=True)
    synthesizer = CLBNSynthesizer()
    synthesizer.fit(train, categoricals, ordinals)
    sampled = synthesizer.sample(300)
    scores = evaluate(train, test, sampled, meta)
    print('\nEvaluation Scores from evaluate function:\n')
    print(scores)
    scores['Synth'] = 'CLBNSynthesizer'
    scores.to_csv('CLBNBench.csv')
# Example #6
def benchmark(synthesizer, datasets=DEFAULT_DATASETS, repeat=3, prefix='tmp'):
    """Evaluate ``synthesizer`` on every dataset, checkpointing to pickles.

    After each dataset finishes, the accumulated results are dumped to
    ``{prefix}_{name}.pickle`` so partial progress survives a crash.
    Datasets that raise ``KeyError`` are skipped. Returns the concatenated
    score ``DataFrame``.
    """
    print(datasets)
    results = list()
    for name in datasets:
        try:
            # BUG FIX: the original passed the %-format string and the name as
            # two separate print() arguments, printing a literal '%s'.
            print(f'Evaluating dataset {name}')
            train, test, meta, categoricals, ordinals = load_dataset(
                name, benchmark=True)

            for iteration in range(repeat):
                synthesized = synthesizer(train, categoricals, ordinals)
                scores = evaluate(train, test, synthesized, meta)
                scores['dataset'] = name
                scores['iter'] = iteration
                results.append(scores)
            print(results)
            # Checkpoint everything scored so far (not just this dataset).
            with open(f'{prefix}_{name}.pickle', 'wb') as f:
                pickle.dump(results, f)
        except KeyError:
            # Name the failing dataset instead of a bare placeholder message.
            print(f'Skipping dataset {name} due to KeyError')
            continue

    return pd.concat(results)
# Example #7
def _score_synthesizer_on_dataset(name, synthesizer, dataset_name, iteration,
                                  cache_dir):
    """Run one synthesizer/dataset/iteration cell of the benchmark grid.

    Returns the scores ``DataFrame`` (with ``dataset``, ``iteration`` and
    ``synthesizer`` columns added), or ``None`` if any step raised — the
    error is logged rather than propagated. When ``cache_dir`` is truthy,
    the scores are also written there as a CSV.
    """
    try:
        LOGGER.info('Evaluating %s on dataset %s; iteration %s; %s', name,
                    dataset_name, iteration, _used_memory())

        train, test, meta, categoricals, ordinals = load_dataset(
            dataset_name, benchmark=True)

        # A synthesizer class (rather than a callable) is instantiated and
        # its fit_sample bound method is used as the callable.
        is_class = isinstance(synthesizer, type)
        if is_class and issubclass(synthesizer, BaseSynthesizer):
            synthesizer = synthesizer().fit_sample

        LOGGER.info('Running %s on dataset %s; iteration %s; %s', name,
                    dataset_name, iteration, _used_memory())

        synthesized = synthesizer(train, categoricals, ordinals)

        LOGGER.info('Scoring %s on dataset %s; iteration %s; %s', name,
                    dataset_name, iteration, _used_memory())

        scores = compute_scores(train, test, synthesized, meta)
        scores['dataset'] = dataset_name
        scores['iteration'] = iteration
        scores['synthesizer'] = name

        if cache_dir:
            csv_name = f'{name}_{dataset_name}_{iteration}.csv'
            scores.to_csv(os.path.join(cache_dir, csv_name))

        return scores
    except Exception:
        LOGGER.exception('Error running %s on dataset %s; iteration %s', name,
                         dataset_name, iteration)

    finally:
        LOGGER.info('Finished %s on dataset %s; iteration %s; %s', name,
                    dataset_name, iteration, _used_memory())
def uniform_synthesis(json='adult'):
    """Fit a UniformSynthesizer on a dataset and return 10 sampled records.

    BUG FIX: the original ended with a bare ``sampled`` expression, which
    discarded the sample; now returns it, matching the sibling synthesis
    helpers (``clbn_synthesis``, ``independent_synthesis``).
    """
    data, categorical_columns, ordinal_columns = load_dataset(json)
    synthesizer = UniformSynthesizer()
    synthesizer.fit(data, categorical_columns, ordinal_columns)
    sampled = synthesizer.sample(10)
    return sampled
def identity_synthesis(json='adult'):
    """Fit an IdentitySynthesizer on a dataset and return 10 sampled records.

    BUG FIX: the original ended with a bare ``sampled`` expression, which
    discarded the sample; now returns it, matching the sibling synthesis
    helpers (``clbn_synthesis``, ``independent_synthesis``).
    """
    data, categorical_columns, ordinal_columns = load_dataset(json)
    synthesizer = IdentitySynthesizer()
    synthesizer.fit(data, categorical_columns, ordinal_columns)
    sampled = synthesizer.sample(10)
    return sampled
def clbn_synthesis(json = 'adult'):
    """Fit a CLBNSynthesizer on a dataset and return 10 sampled records."""
    table, cat_cols, ord_cols = load_dataset(json)
    model = CLBNSynthesizer()
    model.fit(table, cat_cols, ord_cols)
    return model.sample(10)
def independent_synthesis(json = 'adult'):
    """Fit an IndependentSynthesizer on a dataset and return 10 sampled records.

    NOTE(review): this re-defines ``independent_synthesis`` from earlier in
    the file; at import time this later definition (which does not print or
    save to ``test.txt``) is the one that wins.
    """
    table, cat_cols, ord_cols = load_dataset(json)
    model = IndependentSynthesizer()
    model.fit(table, cat_cols, ord_cols)
    return model.sample(10)
def get_label_col(dataset_name):
    """Return the index of the column named 'label' in the dataset metadata.

    Returns -1 when no such column exists.
    """
    _, _, meta, _, _ = load_dataset(dataset_name, benchmark=True)
    return next(
        (i for i, column in enumerate(meta['columns'])
         if column['name'] == 'label'),
        -1)