def _run(args):
    _env_setup(args.logfile, args.verbose)

    if args.distributed:
        try:
            from dask.distributed import Client, LocalCluster
        except ImportError as ie:
            ie.msg += (
                '\n\nIt seems like `dask` is not installed.\n'
                'Please install `dask` and `distributed` using:\n'
                '\n    pip install dask distributed'
            )
            raise

        client = Client(
            LocalCluster(n_workers=args.workers, threads_per_worker=args.threads))
        client.register_worker_callbacks(
            lambda: _env_setup(args.logfile, args.verbose))
        workers = 'dask'
    else:
        workers = args.workers

    synthesizers = sdgym.get_all_synthesizers()
    if args.models:
        synthesizers = {model: synthesizers[model] for model in args.models}

    lb = sdgym.run(
        synthesizers=synthesizers,
        datasets=args.datasets,
        iterations=args.iterations,
        output_path=args.output_path,
        cache_dir=args.cache_dir,
        workers=workers,
        show_progress=args.progress,
    )
    if lb is not None:
        print(lb)

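# A minimal sketch of the argparse setup that could produce the `args`
# namespace consumed by `_run` above. Flag names, short options, and defaults
# are assumptions inferred from the attribute accesses, not the project's
# actual CLI definition.
import argparse


def _get_parser():
    parser = argparse.ArgumentParser(description='Run the SDGym benchmark.')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('-l', '--logfile', help='Write logs to this file.')
    parser.add_argument('-d', '--distributed', action='store_true',
                        help='Distribute the benchmark over a local dask cluster.')
    parser.add_argument('-w', '--workers', type=int, default=1)
    parser.add_argument('-t', '--threads', type=int, default=1)
    parser.add_argument('-m', '--models', nargs='+',
                        help='Subset of the available synthesizers to run.')
    parser.add_argument('--datasets', nargs='+')
    parser.add_argument('-i', '--iterations', type=int, default=1)
    parser.add_argument('-o', '--output-path')
    parser.add_argument('-c', '--cache-dir')
    parser.add_argument('-p', '--progress', action='store_true')
    return parser


# Hypothetical entry point wiring: _run(_get_parser().parse_args())
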
def test_identity():
    output = sdgym.run(
        synthesizers=['Identity', 'Uniform'],
        datasets=['got_families', 'KRK_v1'],
    )

    assert not output.empty
    assert set(output['modality'].unique()) == {'single-table', 'multi-table'}

    assert output[output.synthesizer == 'Identity'].score.mean() > 0.9
    assert output[output.synthesizer == 'Uniform'].score.mean() < 0.8

def _run(args):
    _env_setup(args.logfile, args.verbose)

    if args.distributed:
        try:
            from dask.distributed import Client, LocalCluster
        except ImportError as ie:
            ie.msg += (
                '\n\nIt seems like `dask` is not installed.\n'
                'Please install `dask` and `distributed` using:\n'
                '\n    pip install dask distributed'
            )
            raise

        processes = args.workers > 1
        client = Client(
            LocalCluster(
                processes=processes,
                n_workers=args.workers,
                threads_per_worker=args.threads,
            ),
        )
        client.register_worker_callbacks(
            lambda: _env_setup(args.logfile, args.verbose))
        workers = 'dask'
    else:
        workers = args.workers

    if args.jobs:
        args.jobs = json.loads(args.jobs)

    scores = sdgym.run(
        synthesizers=args.synthesizers,
        datasets=args.datasets,
        datasets_path=args.datasets_path,
        modalities=args.modalities,
        metrics=args.metrics,
        bucket=args.bucket,
        iterations=args.iterations,
        cache_dir=args.cache_dir,
        workers=workers,
        show_progress=args.progress,
        timeout=args.timeout,
        output_path=args.output_path,
        aws_key=args.aws_key,
        aws_secret=args.aws_secret,
        jobs=args.jobs,
        max_rows=args.max_rows,
        max_columns=args.max_columns,
    )

    if args.groupby:
        scores = scores.groupby(args.groupby).mean().reset_index()

    if scores is not None:
        _print_table(scores)

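# On the command line, `jobs` arrives as a JSON string, which is why the
# function above decodes it with `json.loads` before passing it to
# `sdgym.run`. A small illustration of the round trip; the example value is
# made up, and the triple format is inferred from the tests below:
import json

jobs_arg = '[["Identity", "trains_v1", 0], ["Uniform", "KRK_v1", 1]]'
jobs = json.loads(jobs_arg)
# -> [['Identity', 'trains_v1', 0], ['Uniform', 'KRK_v1', 1]]
# i.e. explicit (synthesizer, dataset, iteration) triples.
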
def test_identity():
    output = sdgym.run(
        synthesizers=['Identity', 'Independent', 'Uniform'],
        datasets=['trains_v1', 'KRK_v1'],
    )

    assert not output.empty
    assert set(output['modality'].unique()) == {'single-table', 'multi-table'}

    scores = output.groupby('synthesizer').score.mean().sort_values()
    assert ['Uniform', 'Independent', 'Identity'] == scores.index.tolist()

def test_identity_jobs():
    jobs = [
        ('Identity', 'trains_v1', 0),
        ('Independent', 'trains_v1', 1),
        ('Uniform', 'KRK_v1', 1),
    ]
    output = sdgym.run(jobs=jobs)

    assert not output.empty
    assert set(output['modality'].unique()) == {'single-table', 'multi-table'}

    columns = ['synthesizer', 'dataset', 'iteration']
    combinations = set(
        tuple(record)
        for record in output[columns].drop_duplicates().to_records(index=False)
    )
    assert combinations == set(jobs)

def _run(args):
    _env_setup(args.logfile, args.verbose)

    if args.distributed:
        try:
            from dask.distributed import Client, LocalCluster
        except ImportError as ie:
            ie.msg += (
                '\n\nIt seems like `dask` is not installed.\n'
                'Please install `dask` and `distributed` using:\n'
                '\n    pip install dask distributed'
            )
            raise

        processes = args.workers > 1
        client = Client(
            LocalCluster(
                processes=processes,
                n_workers=args.workers,
                threads_per_worker=args.threads,
            ),
        )
        client.register_worker_callbacks(lambda: _env_setup(args.logfile, args.verbose))
        workers = 'dask'
    else:
        workers = args.workers

    lb = sdgym.run(
        synthesizers=args.synthesizers,
        datasets=args.datasets,
        datasets_path=args.datasets_path,
        modalities=args.modalities,
        metrics=args.metrics,
        iterations=args.iterations,
        cache_dir=args.cache_dir,
        workers=workers,
        show_progress=args.progress,
        timeout=args.timeout,
        output_path=args.output_path,
    )

    if lb is not None:
        _print_table(lb)

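# The `processes = args.workers > 1` switch above runs dask workers as
# separate processes only when there is real parallelism to gain; a single
# worker stays in-process. A standalone sketch of the same cluster setup,
# assuming `dask` and `distributed` are installed:
from dask.distributed import Client, LocalCluster

cluster = LocalCluster(processes=True, n_workers=4, threads_per_worker=1)
client = Client(cluster)
# ... e.g. sdgym.run(..., workers='dask') would now use this cluster ...
client.close()
cluster.close()
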
def test_json_synthesizer():
    synthesizer = {
        'name': 'synthesizer_name',
        'synthesizer': 'sdgym.synthesizers.ydata.PreprocessedVanillaGAN',
        'modalities': ['single-table'],
        'init_kwargs': {
            'categorical_transformer': 'label_encoding'
        },
        'fit_kwargs': {
            'data': '$real_data'
        }
    }

    output = sdgym.run(
        synthesizers=[json.dumps(synthesizer)],
        datasets=['KRK_v1'],
        iterations=1,
    )

    assert set(output['synthesizer']) == {'synthesizer_name'}

def test_json_synthesizer_multi_table():
    synthesizer = {
        'name': 'HMA1',
        'synthesizer': 'sdv.relational.HMA1',
        'modalities': ['multi-table'],
        'init_kwargs': {
            'metadata': '$metadata'
        },
        'fit_kwargs': {
            'tables': '$real_data'
        }
    }

    output = sdgym.run(
        synthesizers=[json.dumps(synthesizer)],
        datasets=['university_v1', 'trains_v1'],
        iterations=1,
    )

    # CSTest for `university_v1` is not valid because there are no categorical columns.
    valid_out = output.loc[
        ~((output.dataset == 'university_v1') & (output.metric == 'CSTest'))
    ]

    assert not valid_out.error.any()

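# In the two JSON specs above, '$real_data' and '$metadata' act as
# placeholders that the benchmark substitutes with the actual objects before
# instantiating and fitting the synthesizer. A minimal sketch of such a
# substitution, illustrative only; this is not sdgym's actual implementation.
def _resolve_placeholders(kwargs, real_data, metadata):
    replacements = {'$real_data': real_data, '$metadata': metadata}
    return {
        key: replacements.get(value, value) if isinstance(value, str) else value
        for key, value in kwargs.items()
    }


# e.g. _resolve_placeholders(synthesizer['fit_kwargs'], tables, metadata)
# would turn {'tables': '$real_data'} into {'tables': <the loaded tables>}.
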
# Fragment from inside a nested ablation loop: the enclosing `for` loops over
# datasets and ablation settings, the `if`/`elif` branches that pair with this
# `else`, and the definitions of `factory`, `all_synthesizers`,
# `current_settings` and `start_seed` are elided. Assumes `import time`,
# `from copy import deepcopy` and `from sdgym import run` at module level.
        else:
            continue

        print(
            f'ablation={ablation}, dataset_name={dataset_name}, '
            f'current settings={current_settings}, seed={start_seed}'
        )
        all_synthesizers.append(
            factory(deepcopy(current_settings), ablation, dataset_name))

    try:
        datasets = [dataset_name]
        scores = run(
            synthesizers=all_synthesizers,
            datasets=datasets,
            iterations=1,
            add_leaderboard=False,
            cache_dir="ablation/",
            # workers=int(multiprocessing.cpu_count() / 2)
        )
        time_str = time.strftime("%Y-%m-%d_%H-%M-%S")
        scores.to_csv(f"ablation/scores_{dataset_name}_{time_str}.csv")

        # Subtract the full model's scores from every row so each ablation
        # reads as a delta, then restore the baseline row's raw scores.
        df = scores.copy(deep=True)
        total = df.loc[f'Synthsonic[all][{dataset_name}]'].copy()
        df = df - total.values.squeeze()
        df.loc[f'Synthsonic[all][{dataset_name}]'] = total
        df.to_csv(f"ablation/diff_scores_{dataset_name}_{time_str}.csv")
    except ValueError:
        print(f"Failed to compute {dataset_name} scores")

    del all_synthesizers
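
# The diff-score block above subtracts the full model's row from every other
# row and then restores the baseline row itself. A self-contained pandas
# illustration of the same pattern on toy data (the index names and metric
# values here are made up):
import pandas as pd

scores = pd.DataFrame(
    {'metric_a': [0.9, 0.7], 'metric_b': [0.8, 0.5]},
    index=['Synthsonic[all][toy]', 'Synthsonic[no_pca][toy]'],
)
total = scores.loc['Synthsonic[all][toy]'].copy()
diff = scores - total.values.squeeze()    # every row relative to the baseline
diff.loc['Synthsonic[all][toy]'] = total  # keep the baseline's raw scores
print(diff)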