def test_sample_all(self):
    """Test sample all regenerating the primary keys"""
    # Setup: sampler.sample returns a single-table dict with `num_rows` rows.
    def fake_sample(table, num_rows):
        return {table: pd.DataFrame({'foo': range(num_rows)})}

    sampler = Mock(spec=Sampler)
    table_names = ['table a', 'table b', 'table c']
    sampler.metadata.get_tables.return_value = table_names
    # Only 'table b' reports a parent, so sample_all skips it.
    sampler.metadata.get_parents.side_effect = [False, True, False]
    sampler.sample.side_effect = fake_sample

    # Run
    result = Sampler.sample_all(sampler, num_rows=3, reset_primary_keys=True)

    # Asserts
    assert sampler.metadata.get_parents.call_count == 3
    assert sampler._reset_primary_keys_generators.call_count == 1

    expected_frame = pd.DataFrame({'foo': range(3)})
    pd.testing.assert_frame_equal(result['table a'], expected_frame)
    pd.testing.assert_frame_equal(result['table c'], expected_frame)
def test_sample_all(self, rows_mock):
    """Check sample_all and returns some value."""
    # Setup
    dn = MagicMock()
    dn.tables = ['TABLE_A', 'TABLE_B']
    # TABLE_B reports having parents, so only TABLE_A is sampled directly.
    dn.get_parents.side_effect = lambda table: table != 'TABLE_A'
    sampler = Sampler(dn, MagicMock())

    def fake_rows(sampler_arg, table_name, num_rows, sampled_data=None):
        # Mimic the mocked sampling call: record output into the shared dict.
        sampled_data[table_name] = 'sampled_data'

    rows_mock.side_effect = fake_rows

    # Run
    result = sampler.sample_all(num_rows=5)

    # Check
    assert result == {'TABLE_A': 'sampled_data'}
    assert dn.get_parents.call_args_list == [(('TABLE_A', ), {}), (('TABLE_B', ), {})]
    rows_mock.assert_called_once_with(
        sampler, 'TABLE_A', 5, sampled_data={'TABLE_A': 'sampled_data'})
def test_sample_all(self, rows_mock, child_mock, reset_mock, concat_mock):
    """Check sample_all and returns some value."""
    # Setup
    dn = MagicMock()
    dn.tables = ['TABLE_A', 'TABLE_B']
    # TABLE_B reports having parents, so only TABLE_A is sampled directly.
    dn.get_parents.side_effect = lambda table: table != 'TABLE_A'
    sampler = Sampler(dn, MagicMock())

    def fake_dataframe(name, number):
        # One column named after the table; every row shares index 0.
        rows = [{name: 0} for _ in range(number)]
        return pd.DataFrame(rows, index=[0] * number)

    rows_mock.side_effect = fake_dataframe
    concat_mock.return_value = 'concatenated_dataframe'

    # Run
    result = sampler.sample_all(num_rows=5)

    # Check
    assert dn.get_parents.call_args_list == [(('TABLE_A',), {}), (('TABLE_B',), {})]
    assert result == reset_mock.return_value
    # TABLE_A was sampled one row at a time, five times.
    assert rows_mock.call_args_list == [(('TABLE_A', 1), {})] * 5
    assert child_mock.call_count == 5
    reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'})
def test_model_database_vine_modeler_single_table(self): """model_database works fine with vine modeler.""" # Setup data_navigator = MagicMock(spec=DataNavigator) modeler = Modeler(data_navigator=data_navigator, model=VineCopula) # Setup - Mock data = pd.DataFrame({ 'column_A': list('abdc'), 'column_B': range(4) }) meta = { 'name': 'table_name', 'fields': { 'column_A': { 'name': 'A', 'type': 'categorical' }, 'column_B': { 'name': 'B', 'type': 'number', 'subtype': 'integer' } } } data_navigator.tables = { 'table_name': Table(data, meta) } data_navigator.get_parents.return_value = set() data_navigator.get_children.return_value = set() data_navigator.transformed_data = { 'table_name': pd.DataFrame({ 'column_A': [0.1, 0.2, 0.5, 1.0], 'column_B': range(4) }) } data_navigator.meta = { 'tables': [ { 'name': meta } ] } data_navigator.ht = MagicMock() data_navigator.ht.transformers = { ('table_name', 'column_A'): None, ('table_name', 'column_B'): None } # Run modeler.model_database() # Check assert 'table_name' in modeler.models sampler = Sampler(data_navigator, modeler) samples = sampler.sample_all() assert 'table_name' in samples
def test_sample_all_with_reset_primary_key(self):
    """Check sample_all with reset_primary_keys True"""
    # Setup: a fake data navigator whose only table reports a parent,
    # so sample_all performs no direct sampling of it.
    navigator = Mock()
    navigator.tables = {'DEMO': Table(pd.DataFrame(), {'some': 'meta'})}
    navigator.get_parents.return_value = True

    reset_mock = Mock()
    fake_sampler = Mock()
    fake_sampler.dn = navigator
    fake_sampler._reset_primary_keys_generators = reset_mock

    # Run: call the unbound method with the mock standing in for self.
    Sampler.sample_all(fake_sampler, reset_primary_keys=True)

    # Asserts: the primary key generators were reset exactly once.
    reset_mock.assert_called_once_with()
class SDV:
    """Class to do modeling and sampling all in one."""

    def __init__(self, meta_file_name, data_loader_type='csv'):
        """Initialize sdv class."""
        self.meta_file_name = meta_file_name
        # Set by fit(); None means the instance has not been fitted yet.
        self.sampler = None

    def _ensure_fitted(self):
        """Raise NotFittedError if fit() has not been called yet."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

    def fit(self):
        """Transform the data and model the database."""
        loader = CSVDataLoader(self.meta_file_name)
        self.dn = loader.load_data()
        # transform data
        self.dn.transform_data()
        self.modeler = Modeler(self.dn)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows):
        """Wrapper for Sampler.sample_rows."""
        self._ensure_fitted()
        return self.sampler.sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Wrapper for Sampler.sample_table."""
        self._ensure_fitted()
        return self.sampler.sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Wrapper for Sampler.sample_all."""
        self._ensure_fitted()
        return self.sampler.sample_all(num_rows)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path where the pickled instance is stored.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
def test_model_database_gaussian_copula_single_table(self): """model_database can model a single table using the gausian copula model.""" # Setup data_navigator = MagicMock(spec=DataNavigator) modeler = Modeler(data_navigator=data_navigator, model=GaussianMultivariate) # Setup - Mocks - DataNavigator table_data = pd.DataFrame({ 'column_A': list('abdc'), 'column_B': range(4) }) table_metadata = { 'name': 'table_name', 'fields': { 'column_A': { 'name': 'column_A', 'type': 'categorical' }, 'column_B': { 'name': 'column_B', 'type': 'number', 'subtype': 'integer' } } } data_navigator.tables = { 'table_name': Table(table_data, table_metadata) } data_navigator.get_parents.return_value = set() data_navigator.get_children.return_value = set() data_navigator.transformed_data = { 'table_name': pd.DataFrame({ 'column_A': [0.1, 0.2, 0.5, 1.0], 'column_B': range(4) }) } metadata = { 'name': 'table_name', 'fields': [{ 'name': 'column_A', 'type': 'categorical' }, { 'name': 'column_B', 'type': 'number', 'subtype': 'integer' }] } data_navigator.meta = {'tables': [metadata]} data_navigator.ht = MagicMock() data_navigator.ht.transformers = { ('table_name', 'column_A'): None, ('table_name', 'column_B'): None } # Run modeler.model_database() # Check assert 'table_name' in modeler.models sampler = Sampler(data_navigator, modeler) samples = sampler.sample_all() assert 'table_name' in samples
class SDV:
    """Class to do modeling and sampling all in one.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str): Type of data loader to use. Defaults to 'csv'.
        model (type): Class of model to use.
        distribution (type): Class of distribution to use. Will be deprecated shortly.
        model_kwargs (dict): Keyword arguments to pass to model.
    """

    def __init__(self, meta_file_name, data_loader_type='csv', model=DEFAULT_MODEL,
                 distribution=None, model_kwargs=None):
        self.meta_file_name = meta_file_name
        # Set by fit(); None means the instance has not been fitted yet.
        self.sampler = None
        self.model = model
        self.distribution = distribution
        self.model_kwargs = model_kwargs

    def _check_unsupported_dataset_structure(self):
        """Checks that no table has two parents."""
        tables = self.dn.tables.keys()
        amount_parents = [
            len(self.dn.get_parents(table)) <= 1
            for table in tables
        ]
        if not all(amount_parents):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Transform the data and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()
        # Validate structure before doing any expensive work.
        self._check_unsupported_dataset_structure()
        self.dn.transform_data()
        self.modeler = Modeler(data_navigator=self.dn, model=self.model,
                               distribution=self.distribution, model_kwargs=self.model_kwargs)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows, reset_primary_keys=False):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name(str): Name of the table to sample from.
            num_rows(int): Amount of rows to sample.
            reset_primary_keys(bool): Whether or not to reset the pk generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_rows(
            table_name, num_rows, reset_primary_keys=reset_primary_keys)

    def sample_table(self, table_name, reset_primary_keys=False):
        """Samples the given table to its original size.

        Args:
            table_name (str): Table to sample.
            reset_primary_keys(bool): Whether or not to reset the pk generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_table(table_name, reset_primary_keys=reset_primary_keys)

    def sample_all(self, num_rows=5, reset_primary_keys=False):
        """Sample the whole dataset.

        Args:
            num_rows(int): Amount of rows to sample.
            reset_primary_keys(bool): Whether or not to reset the pk generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows, reset_primary_keys=reset_primary_keys)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename(str): Path where the pickled instance is stored.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load(cls, filename):
        """Load a SDV instance from the given path.

        Args:
            filename(str): Path to load model.
        """
        with open(filename, 'rb') as f:
            instance = pickle.load(f)

        return instance
class SDV:
    """Automated generative modeling and sampling tool.

    Allows the users to generate synthetic data after creating generative models for their
    data.

    Args:
        model (type):
            Class of the ``copula`` to use. Defaults to
            ``sdv.models.copulas.GaussianCopula``.
        model_kwargs (dict):
            Keyword arguments to pass to the model. Defaults to ``None``.
    """

    # Class-level default; fit() replaces it with a Sampler instance.
    sampler = None

    def __init__(self, model=DEFAULT_MODEL, model_kwargs=None):
        self.model = model
        if model_kwargs is None:
            # Copy so instances never share (or mutate) the module-level default.
            self.model_kwargs = DEFAULT_MODEL_KWARGS.copy()
        else:
            self.model_kwargs = model_kwargs

    def fit(self, metadata, tables=None, root_path=None):
        """Fit this SDV instance to the dataset data.

        Args:
            metadata (dict, str or Metadata):
                Metadata dict, path to the metadata JSON file or Metadata instance itself.
            tables (dict):
                Dictionary with the table names as key and ``pandas.DataFrame`` instances as
                values.  If ``None`` is given, the tables will be loaded from the paths
                indicated in ``metadata``. Defaults to ``None``.
            root_path (str or None):
                Path to the dataset directory. If ``None`` and metadata is
                a path, the metadata location is used. If ``None`` and
                metadata is a dict, the current working directory is used.
        """
        if isinstance(metadata, Metadata):
            self.metadata = metadata
        else:
            self.metadata = Metadata(metadata, root_path)

        self.metadata.validate(tables)

        self.modeler = Modeler(self.metadata, self.model, self.model_kwargs)
        self.modeler.model_database(tables)
        self.sampler = Sampler(self.metadata, self.modeler.models, self.model,
                               self.model_kwargs)

    def sample(self, table_name, num_rows, sample_children=True, reset_primary_keys=False):
        """Sample ``num_rows`` rows from the indicated table.

        Args:
            table_name (str):
                Name of the table to sample from.
            num_rows (int):
                Amount of rows to sample.
            sample_children (bool):
                Whether or not to sample children tables. Defaults to ``True``.
            reset_primary_keys (bool):
                Whether or not to reset the primary key generators. Defaults to ``False``.

        Returns:
            pandas.DataFrame:
                Sampled data with the number of rows specified in ``num_rows``.

        Raises:
            NotFittedError:
                A ``NotFittedError`` is raised when the ``SDV`` instance has not been fitted yet.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample(table_name, num_rows, sample_children=sample_children,
                                   reset_primary_keys=reset_primary_keys)

    def sample_all(self, num_rows=5, reset_primary_keys=False):
        """Sample the entire dataset.

        Args:
            num_rows (int):
                Amount of rows to sample. Defaults to ``5``.
            reset_primary_keys (bool):
                Whether or not to reset the primary key generators. Defaults to ``False``.

        Returns:
            dict:
                Tables sampled.

        Raises:
            NotFittedError:
                A ``NotFittedError`` is raised when the ``SDV`` instance has not been fitted yet.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows, reset_primary_keys=reset_primary_keys)

    def save(self, path):
        """Save this SDV instance to the given path using pickle.

        Args:
            path (str):
                Path where the SDV instance will be serialized.
        """
        with open(path, 'wb') as output:
            pickle.dump(self, output)

    @classmethod
    def load(cls, path):
        """Load a SDV instance from a given path.

        Args:
            path (str):
                Path from which to load the SDV instance.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)
class SDV:
    """Class to do modeling and sampling all in one.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str): Type of data loader to use. Defaults to 'csv'.
    """

    def __init__(self, meta_file_name, data_loader_type='csv'):
        self.meta_file_name = meta_file_name
        # Set by fit(); None means the instance has not been fitted yet.
        self.sampler = None

    def _check_unsupported_dataset_structure(self):
        """Checks that no table has two parents."""
        tables = self.dn.tables.keys()
        amount_parents = [
            len(self.dn.get_parents(table)) <= 1
            for table in tables
        ]
        if not all(amount_parents):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Transform the data and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()
        # Validate structure before doing any expensive work.
        self._check_unsupported_dataset_structure()
        self.dn.transform_data()
        self.modeler = Modeler(self.dn)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name(str): Name of the table to sample from.
            num_rows(int): Amount of rows to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Samples the given table to its original size.

        Args:
            table_name (str): Table to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Sample the whole dataset.

        Args:
            num_rows (int): Amount of rows to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path where the pickled instance is stored.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
class TestSampler(TestCase):
    """Integration tests for Sampler against the on-disk demo dataset."""

    @classmethod
    def setUpClass(cls):
        # Load, transform and model the demo dataset once for the whole class.
        # Assumes tests/data/meta.json and its referenced CSVs exist on disk.
        data_loader = CSVDataLoader('tests/data/meta.json')
        cls.data_navigator = data_loader.load_data()
        cls.data_navigator.transform_data()
        cls.modeler = Modeler(cls.data_navigator)
        cls.modeler.model_database()

    def setUp(self):
        # Fresh sampler per test so sampled state does not leak between tests.
        self.sampler = Sampler(self.data_navigator, self.modeler)

    def test_sample_rows_parent_table(self):
        """sample_rows samples new rows for the given table."""
        # Setup
        raw_data = self.modeler.dn.tables['DEMO_CUSTOMERS'].data

        # Run
        result = self.sampler.sample_rows('DEMO_CUSTOMERS', 5)

        # Check
        assert result.shape[0] == 5
        assert (result.columns == raw_data.columns).all()

        # Primary key columns are sampled values
        assert len(result['CUSTOMER_ID'].unique()) != 1

    def test_sample_rows_children_table(self):
        """sample_rows samples new rows for the given table."""
        # Setup
        raw_data = self.modeler.dn.tables['DEMO_ORDERS'].data
        # Sampling parent table.
        # Children rows can only be sampled after their parent has been sampled.
        self.sampler.sample_rows('DEMO_CUSTOMERS', 5)

        # Run
        result = self.sampler.sample_rows('DEMO_ORDERS', 5)

        # Check
        assert result.shape[0] == 5
        assert (result.columns == raw_data.columns).all()

        # Foreign key columns are all the same
        unique_foreign_keys = result['CUSTOMER_ID'].unique()
        sampled_parent = self.sampler.sampled['DEMO_CUSTOMERS'][0][1]
        assert len(unique_foreign_keys) == 1
        assert unique_foreign_keys[0] in sampled_parent['CUSTOMER_ID'].values

    def test_sample_all(self):
        """Check sample_all and returns some value."""
        # Run
        result = self.sampler.sample_all(num_rows=5)

        # Check
        assert result.keys() == self.sampler.dn.tables.keys()

        for name, table in result.items():
            with self.subTest(table=name):
                raw_data = self.modeler.dn.tables[name].data
                assert (table.columns == raw_data.columns).all()

                # Only parent (root) tables get exactly num_rows unique pk values;
                # child tables are sized by their parents' sampled rows.
                if not self.sampler.dn.get_parents(name):
                    primary_key = self.sampler.dn.get_meta_data(name)['primary_key']
                    assert len(table) == 5
                    assert len(table[primary_key].unique()) == 5