def _score_dataset(dataset, datasets_path, output):
    """Model, sample and evaluate one dataset, recording the outcome in ``output``.

    Any ``Exception`` raised while loading, fitting, sampling or evaluating is
    caught, logged, and stored in ``output`` instead of being propagated.

    Args:
        dataset:
            Name of the dataset. Passed to ``load_demo`` or used as the
            folder name inside ``datasets_path``.
        datasets_path:
            Folder that contains ``<dataset>/metadata.json``. If ``None``,
            the dataset is obtained through ``load_demo`` instead.
        output:
            Object exposing ``update``; it receives the ``dataset`` name plus
            either a ``score`` or an ``error`` entry.
    """
    started_at = datetime.now()

    try:
        if datasets_path is not None:
            metadata_path = os.path.join(datasets_path, dataset, 'metadata.json')
            metadata = Metadata(metadata_path)
            tables = metadata.load_tables()
        else:
            metadata, tables = load_demo(dataset, metadata=True)

        sdv = SDV()

        LOGGER.info('Modeling dataset %s', dataset)
        sdv.fit(metadata, tables)

        LOGGER.info('Sampling dataset %s', dataset)
        sampled = sdv.sample_all(10)

        LOGGER.info('Evaluating dataset %s', dataset)
        score = evaluate(sampled, metadata=metadata)

        elapsed = datetime.now() - started_at
        LOGGER.info('%s: %s - ELAPSED: %s', dataset, score, elapsed)

        result = {'dataset': dataset, 'score': score}
        output.update(result)

    except Exception as ex:
        # Keep only the exception class name and message; no traceback is stored.
        error = '{}: {}'.format(type(ex).__name__, str(ex))
        elapsed = datetime.now() - started_at
        LOGGER.error('%s: %s - ELAPSED: %s', dataset, error, elapsed)

        failure = {'dataset': dataset, 'error': error}
        output.update(failure)
def test_ctgan():
    """Fit CTGAN on the demo ``users`` table and check the sample and metadata.

    NOTE(review): another ``test_ctgan`` is defined later in this file and
    rebinds the name, so this version is presumably never collected by
    pytest -- confirm whether it should be renamed or removed.
    """
    demo_tables = load_demo(metadata=False)
    users = demo_tables['users']

    model = CTGAN(primary_key='user_id', epochs=1)
    model.fit(users)
    synthetic = model.sample()

    # The sample must have the same shape as the training table.
    assert synthetic.shape == users.shape

    # The primary key must have been generated as a sequential ID field.
    expected_ids = list(range(len(users)))
    assert list(synthetic['user_id']) == expected_ids

    expected_metadata = {
        'fields': {
            'user_id': {
                'type': 'id',
                'subtype': 'integer',
            },
            'country': {
                'type': 'categorical',
            },
            'gender': {
                'type': 'categorical',
            },
            'age': {
                'type': 'numerical',
                'subtype': 'integer',
            },
        },
        'constraints': [],
        'model_kwargs': {},
    }
    assert model.get_metadata().to_dict() == expected_metadata
def test_gaussian_copula():
    """Fit a GaussianCopula, rebuild it from its parameters and validate the sample.

    NOTE(review): another ``test_gaussian_copula`` is defined later in this
    file and rebinds the name, so this version is presumably never collected
    by pytest -- confirm whether it should be renamed or removed.
    """
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical',
        },
    }
    anonymize_fields = {'country': 'country_code'}

    fitted = GaussianCopula(
        field_names=['user_id', 'country', 'gender', 'age'],
        field_types=field_types,
        primary_key='user_id',
        anonymize_fields=anonymize_fields,
        categorical_transformer='one_hot_encoding',
    )
    fitted.fit(users)
    parameters = fitted.get_parameters()

    # Rebuild a second model from the metadata and learned parameters only.
    rebuilt = GaussianCopula(
        table_metadata=fitted.get_metadata(),
        categorical_transformer='one_hot_encoding',
    )
    rebuilt.set_parameters(parameters)
    synthetic = rebuilt.sample()

    # The sample must have the same shape as the training table.
    assert synthetic.shape == users.shape

    # The primary key must have been generated as a sequential ID field.
    expected_ids = list(range(len(users)))
    assert list(synthetic['user_id']) == expected_ids

    # Country codes must have been replaced with new ones.
    sampled_countries = set(synthetic.country.unique())
    original_countries = set(users.country.unique())
    assert sampled_countries != original_countries

    metadata = fitted.get_metadata().to_dict()
    expected_fields = {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical',
        },
        'gender': {
            'type': 'categorical',
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
    }
    assert metadata['fields'] == expected_fields
    assert 'model_kwargs' in metadata
def test_recreate():
    """Check that CTGAN can be recreated from a fitted model's metadata.

    Three models are fitted and sampled in turn: a fresh one, one built from
    the first model's metadata object, and one built from that metadata
    converted to a dict. Each sample must match the demo ``users`` table in
    shape and dtypes, and contain no row made only of nulls.
    """
    data = load_demo(metadata=False)['users']

    def _fit_and_check(candidate):
        # Fit the given model on the demo data and validate what it samples.
        candidate.fit(data)
        synthetic = candidate.sample()

        assert synthetic.shape == data.shape
        assert (synthetic.dtypes == data.dtypes).all()
        # Every row must contain at least one non-null value.
        assert (synthetic.notnull().sum(axis=1) != 0).all()

    model = CTGAN(epochs=1)
    _fit_and_check(model)

    # Metadata
    from_metadata = CTGAN(epochs=1, table_metadata=model.get_metadata())
    _fit_and_check(from_metadata)

    # Metadata dict
    from_metadata_dict = CTGAN(epochs=1, table_metadata=model.get_metadata().to_dict())
    _fit_and_check(from_metadata_dict)
def test_copulagan():
    """Fit CopulaGAN on the demo ``users`` table and check the sample and metadata."""
    demo_tables = load_demo(metadata=False)
    users = demo_tables['users']

    copulagan = CopulaGAN(
        primary_key='user_id',
        epochs=1,
        field_distributions={'age': 'beta'},
        default_distribution='bounded',
    )
    copulagan.fit(users)
    synthetic = copulagan.sample()

    # The sample must have the same shape as the training table.
    assert synthetic.shape == users.shape

    # The primary key must have been generated as a sequential ID field.
    expected_ids = list(range(len(users)))
    assert list(synthetic['user_id']) == expected_ids

    expected_metadata = {
        'fields': {
            'user_id': {
                'type': 'id',
                'subtype': 'integer',
                'transformer': 'integer',
            },
            'country': {
                'type': 'categorical',
                'transformer': 'label_encoding',
            },
            'gender': {
                'type': 'categorical',
                'transformer': 'label_encoding',
            },
            'age': {
                'type': 'numerical',
                'subtype': 'integer',
                'transformer': 'integer',
            },
        },
        'primary_key': 'user_id',
        'constraints': [],
        'sequence_index': None,
        'context_columns': [],
        'entity_columns': [],
        'model_kwargs': {},
        'name': None,
    }
    assert copulagan.get_metadata().to_dict() == expected_metadata
def test_ctgan():
    """Fit CTGAN on the demo ``users`` table and check the sample and full metadata."""
    demo_tables = load_demo(metadata=False)
    users = demo_tables['users']

    model = CTGAN(primary_key='user_id', epochs=1)
    model.fit(users)
    synthetic = model.sample()

    # The sample must have the same shape as the training table.
    assert synthetic.shape == users.shape

    # The primary key must have been generated as a sequential ID field.
    expected_ids = list(range(len(users)))
    assert list(synthetic['user_id']) == expected_ids

    expected_fields = {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
            'transformer': 'integer',
        },
        'country': {
            'type': 'categorical',
            'transformer': None,
        },
        'gender': {
            'type': 'categorical',
            'transformer': None,
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
            'transformer': 'integer',
        },
    }
    expected_metadata = {
        'fields': expected_fields,
        'primary_key': 'user_id',
        'constraints': [],
        'sequence_index': None,
        'context_columns': [],
        'entity_columns': [],
        'model_kwargs': {},
        'name': None,
    }
    actual_metadata = model.get_metadata().to_dict()
    assert actual_metadata == expected_metadata
def test_integer_categoricals():
    """Ensure integer categoricals are still sampled as integers.

    The origin of this test can be found in the github issue #194:
    https://github.com/sdv-dev/SDV/issues/194
    """
    users = load_demo(metadata=False)['users']

    # Declare the integer ``age`` column as categorical.
    age_as_categorical = {
        'age': {
            'type': 'categorical',
        },
    }
    model = GaussianCopula(
        field_types=age_as_categorical,
        categorical_transformer='categorical',
    )
    model.fit(users)
    synthetic = model.sample()

    # The dtype must be an integer both before and after the round trip.
    assert users['age'].dtype == np.int64
    assert synthetic['age'].dtype == np.int64
def test_sdv_model_kwargs():
    """Check that ``model_kwargs`` given to HMA1 reach the per-table model."""
    demo_metadata, demo_tables = load_demo(metadata=True)

    # Keep only the ``users`` table, in both the data and the metadata.
    tables = {'users': demo_tables['users']}
    metadata = demo_metadata.to_dict()
    for dropped_table in ('sessions', 'transactions'):
        del metadata['tables'][dropped_table]

    model_kwargs = {
        'default_distribution': 'beta',
        'categorical_transformer': 'label_encoding',
    }
    hma = HMA1(metadata, model=GaussianCopula, model_kwargs=model_kwargs)
    hma.fit(tables)

    users_model = hma._models['users']
    assert users_model._default_distribution == BetaUnivariate
    assert users_model._DTYPE_TRANSFORMERS['O'] == 'label_encoding'

    gender_transformer = users_model._metadata._hyper_transformer._transformers['gender']
    assert isinstance(
        gender_transformer,
        rdt.transformers.categorical.LabelEncodingTransformer,
    )
def test_gaussian_copula():
    """Check GaussianCopula parameter export, recreation, sampling and metadata.

    With a ``gaussian_kde`` default distribution, ``get_parameters`` is
    expected to raise ``NonParametricError``. With a ``bounded`` default
    distribution, the parameters are exported and loaded into a second model,
    whose sample and metadata are then validated.
    """
    users = load_demo(metadata=False)['users']

    field_types = {
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
        },
        'country': {
            'type': 'categorical',
        },
    }
    anonymize_fields = {'country': 'country_code'}

    def _fitted_copula(default_distribution):
        # Build and fit a model that differs only in its default distribution.
        copula = GaussianCopula(
            field_names=['user_id', 'country', 'gender', 'age'],
            field_types=field_types,
            primary_key='user_id',
            anonymize_fields=anonymize_fields,
            field_distributions={'age': 'gamma'},
            default_distribution=default_distribution,
        )
        copula.fit(users)
        return copula

    # If distribution is non parametric, get_parameters fails
    gc = _fitted_copula('gaussian_kde')
    with pytest.raises(NonParametricError):
        gc.get_parameters()

    # If distribution is parametric, copula can be recreated
    gc = _fitted_copula('bounded')
    parameters = gc.get_parameters()

    new_gc = GaussianCopula(table_metadata=gc.get_metadata())
    new_gc.set_parameters(parameters)

    # Validate sampled data
    synthetic = new_gc.sample()

    # The sample must have the same shape as the training table.
    assert synthetic.shape == users.shape

    # The primary key must have been generated as a sequential ID field.
    expected_ids = list(range(len(users)))
    assert list(synthetic['user_id']) == expected_ids

    # Country codes must have been replaced with new ones.
    sampled_countries = set(synthetic.country.unique())
    original_countries = set(users.country.unique())
    assert sampled_countries != original_countries

    # Validate metadata
    metadata = gc.get_metadata().to_dict()
    expected_fields = {
        'user_id': {
            'type': 'id',
            'subtype': 'integer',
            'transformer': 'integer',
        },
        'country': {
            'type': 'categorical',
            'pii': True,
            'pii_category': 'country_code',
            'transformer': 'one_hot_encoding',
        },
        'gender': {
            'type': 'categorical',
            'transformer': 'one_hot_encoding',
        },
        'age': {
            'type': 'numerical',
            'subtype': 'integer',
            'transformer': 'integer',
        },
    }
    assert metadata['fields'] == expected_fields

    assert 'model_kwargs' in metadata
    assert 'GaussianCopula' in metadata['model_kwargs']