Example #1
import sdgym


def _run(args):
    _env_setup(args.logfile, args.verbose)

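    # With --distributed, run the benchmark on a local Dask cluster;
    # otherwise hand sdgym a plain worker count.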
    if args.distributed:
        try:
            from dask.distributed import Client, LocalCluster
        except ImportError as ie:
            ie.msg += ('\n\nIt seems like `dask` is not installed.\n'
                       'Please install `dask` and `distributed` using:\n'
                       '\n    pip install dask distributed')
            raise

        client = Client(
            LocalCluster(n_workers=args.workers,
                         threads_per_worker=args.threads))
        client.register_worker_callbacks(
            lambda: _env_setup(args.logfile, args.verbose))

        workers = 'dask'
    else:
        workers = args.workers

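    # Start from every available synthesizer and keep only the requested models.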
    synthesizers = sdgym.get_all_synthesizers()
    if args.models:
        synthesizers = {model: synthesizers[model] for model in args.models}

    lb = sdgym.run(synthesizers=synthesizers,
                   datasets=args.datasets,
                   iterations=args.iterations,
                   output_path=args.output_path,
                   cache_dir=args.cache_dir,
                   workers=workers,
                   show_progress=args.progress)
    if lb is not None:
        print(lb)
Example #2
import sdgym


def test_identity():
    output = sdgym.run(
        synthesizers=['Identity', 'Uniform'],
        datasets=['got_families', 'KRK_v1'],
    )

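    # Identity replays the real data and should score high, while Uniform
    # samples at random and should score clearly lower.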
    assert not output.empty
    assert set(output['modality'].unique()) == {'single-table', 'multi-table'}
    assert output[output.synthesizer == 'Identity'].score.mean() > 0.9
    assert output[output.synthesizer == 'Uniform'].score.mean() < 0.8
Example #3
import json

import sdgym


def _run(args):
    _env_setup(args.logfile, args.verbose)

    if args.distributed:
        try:
            from dask.distributed import Client, LocalCluster
        except ImportError as ie:
            ie.msg += ('\n\nIt seems like `dask` is not installed.\n'
                       'Please install `dask` and `distributed` using:\n'
                       '\n    pip install dask distributed')
            raise

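        # Use separate processes only when more than one worker is requested.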
        processes = args.workers > 1
        client = Client(
            LocalCluster(
                processes=processes,
                n_workers=args.workers,
                threads_per_worker=args.threads,
            ),
        )
        client.register_worker_callbacks(
            lambda: _env_setup(args.logfile, args.verbose))

        workers = 'dask'
    else:
        workers = args.workers

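    # --jobs arrives as a JSON string on the command line.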
    if args.jobs:
        args.jobs = json.loads(args.jobs)

    scores = sdgym.run(
        synthesizers=args.synthesizers,
        datasets=args.datasets,
        datasets_path=args.datasets_path,
        modalities=args.modalities,
        metrics=args.metrics,
        bucket=args.bucket,
        iterations=args.iterations,
        cache_dir=args.cache_dir,
        workers=workers,
        show_progress=args.progress,
        timeout=args.timeout,
        output_path=args.output_path,
        aws_key=args.aws_key,
        aws_secret=args.aws_secret,
        jobs=args.jobs,
        max_rows=args.max_rows,
        max_columns=args.max_columns,
    )

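    # Optionally aggregate the scores over the requested columns.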
    if args.groupby:
        scores = scores.groupby(args.groupby).mean().reset_index()

    if scores is not None:
        _print_table(scores)
Example #4
import sdgym


def test_identity():
    output = sdgym.run(
        synthesizers=['Identity', 'Independent', 'Uniform'],
        datasets=['trains_v1', 'KRK_v1'],
    )

    assert not output.empty
    assert set(output['modality'].unique()) == {'single-table', 'multi-table'}

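    # Mean score per synthesizer, sorted ascending: Uniform should rank
    # worst and Identity best.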
    scores = output.groupby('synthesizer').score.mean().sort_values()

    assert ['Uniform', 'Independent', 'Identity'] == scores.index.tolist()
Example #5
import sdgym


def test_identity_jobs():
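    # Each job is an explicit (synthesizer, dataset, iteration) triple.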
    jobs = [
        ('Identity', 'trains_v1', 0),
        ('Independent', 'trains_v1', 1),
        ('Uniform', 'KRK_v1', 1),
    ]
    output = sdgym.run(jobs=jobs)

    assert not output.empty
    assert set(output['modality'].unique()) == {'single-table', 'multi-table'}

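    # The output should contain exactly the requested combinations.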
    columns = ['synthesizer', 'dataset', 'iteration']
    records = output[columns].drop_duplicates().to_records(index=False)
    combinations = {tuple(record) for record in records}

    assert combinations == set(jobs)
Example #6
import sdgym


def _run(args):
    _env_setup(args.logfile, args.verbose)

    if args.distributed:
        try:
            from dask.distributed import Client, LocalCluster
        except ImportError as ie:
            ie.msg += (
                '\n\nIt seems like `dask` is not installed.\n'
                'Please install `dask` and `distributed` using:\n'
                '\n    pip install dask distributed'
            )
            raise

        processes = args.workers > 1
        client = Client(
            LocalCluster(
                processes=processes,
                n_workers=args.workers,
                threads_per_worker=args.threads,
            ),
        )
        client.register_worker_callbacks(lambda: _env_setup(args.logfile, args.verbose))

        workers = 'dask'
    else:
        workers = args.workers

    lb = sdgym.run(
        synthesizers=args.synthesizers,
        datasets=args.datasets,
        datasets_path=args.datasets_path,
        modalities=args.modalities,
        metrics=args.metrics,
        iterations=args.iterations,
        cache_dir=args.cache_dir,
        workers=workers,
        show_progress=args.progress,
        timeout=args.timeout,
        output_path=args.output_path,
    )
    if lb is not None:
        _print_table(lb)
Example #7
import json

import sdgym


def test_json_synthesizer():
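    # A synthesizer can also be passed as a JSON spec; the '$real_data'
    # placeholder stands for the loaded dataset at fit time.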
    synthesizer = {
        'name': 'synthesizer_name',
        'synthesizer': 'sdgym.synthesizers.ydata.PreprocessedVanillaGAN',
        'modalities': ['single-table'],
        'init_kwargs': {
            'categorical_transformer': 'label_encoding'
        },
        'fit_kwargs': {
            'data': '$real_data'
        }
    }

    output = sdgym.run(
        synthesizers=[json.dumps(synthesizer)],
        datasets=['KRK_v1'],
        iterations=1,
    )

    assert set(output['synthesizer']) == {'synthesizer_name'}
Example #8
import json

import sdgym


def test_json_synthesizer_multi_table():
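    # For a multi-table spec, '$metadata' stands for the dataset metadata and
    # '$real_data' for the full dictionary of tables.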
    synthesizer = {
        'name': 'HMA1',
        'synthesizer': 'sdv.relational.HMA1',
        'modalities': ['multi-table'],
        'init_kwargs': {
            'metadata': '$metadata'
        },
        'fit_kwargs': {
            'tables': '$real_data'
        }
    }

    output = sdgym.run(
        synthesizers=[json.dumps(synthesizer)],
        datasets=['university_v1', 'trains_v1'],
        iterations=1,
    )

    # CSTest for `university_v1` is not valid because there are no categorical columns.
    valid_out = output.loc[~((output.dataset == 'university_v1') &
                             (output.metric == 'CSTest'))]

    assert not valid_out.error.any()
Example #9
import time
from copy import deepcopy

from sdgym import run


# The opening lines of this example were lost in the source; the function
# signature and loop header below are a reconstruction, and `factory` and
# `settings_by_ablation` are assumed names for helpers defined elsewhere.
def _run_ablation(factory, settings_by_ablation, dataset_name, start_seed):
    all_synthesizers = []
    for ablation, current_settings in settings_by_ablation.items():
        print(
            f'ablation={ablation}, dataset_name={dataset_name}, '
            f'current settings={current_settings}, seed={start_seed}'
        )

        all_synthesizers.append(
            factory(deepcopy(current_settings), ablation, dataset_name))

    try:
        datasets = [dataset_name]
        scores = run(
            synthesizers=all_synthesizers,
            datasets=datasets,
            iterations=1,
            add_leaderboard=False,
            cache_dir="ablation/",
            # workers=int(multiprocessing.cpu_count() / 2)
        )
        time_str = time.strftime("%Y-%m-%d_%H-%M-%S")
        scores.to_csv(f"ablation/scores_{dataset_name}_{time_str}.csv")

        # Report every row as its difference from the full Synthsonic run,
        # keeping the full run itself as the reference row.
        df = scores.copy(deep=True)
        total = df.loc[f'Synthsonic[all][{dataset_name}]'].copy()
        df = df - total.values.squeeze()
        df.loc[f'Synthsonic[all][{dataset_name}]'] = total
        df.to_csv(f"ablation/diff_scores_{dataset_name}_{time_str}.csv")
    except ValueError:
        print(f"Failed to compute {dataset_name} scores")
    del all_synthesizers