def test_sample_table(self, rows_mock):
    """sample_table asks for the table's own row count and unwraps the result.

    The first entry of the mocked ``data.shape`` must be forwarded as the
    number of rows, and the value stored under the table name in the patched
    call's return value must be what ``sample_table`` hands back.
    """
    # Setup
    # NOTE(review): ``rows_mock`` is injected from outside this block,
    # presumably by a patch decorator on ``Sampler.sample_rows`` - confirm.
    data_navigator = MagicMock(spec=DataNavigator)
    data_navigator.tables = {
        'table': MagicMock(**{'data.shape': ('rows', 'columns')}),
    }
    sampler = Sampler(
        data_navigator=data_navigator,
        modeler=MagicMock(spec=Modeler),
    )
    rows_mock.return_value = {'table': 'samples'}

    # Run
    result = sampler.sample_table('table', reset_primary_keys=False)

    # Check
    assert result == 'samples'
    rows_mock.assert_called_once_with(
        sampler,
        'table',
        'rows',
        sample_children=False,
        reset_primary_keys=False,
    )
class SDV:
    """Class to do modeling and sampling all in one.

    Args:
        meta_file_name (str): Path to the metadata file; it is handed to
            ``CSVDataLoader`` when ``fit`` is called.
        data_loader_type (str): Currently unused. ``fit`` always loads the
            data with ``CSVDataLoader``; the argument is accepted only so
            existing callers keep working.
    """

    def __init__(self, meta_file_name, data_loader_type='csv'):
        """Initialize sdv class."""
        self.meta_file_name = meta_file_name
        # ``None`` marks the instance as not fitted; ``fit`` replaces it.
        self.sampler = None

    def _check_fitted(self):
        """Raise ``NotFittedError`` unless ``fit`` has created the sampler."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

    def fit(self):
        """Transform the data and model the database.

        Loads the data described by ``meta_file_name``, transforms it, models
        the database and finally builds the sampler, which is what marks the
        instance as fitted.
        """
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()

        # transform data
        self.dn.transform_data()

        self.modeler = Modeler(self.dn)
        self.modeler.model_database()

        # Assigned last, so a failure above leaves the instance unfitted.
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows):
        """Wrapper for Sampler.sample_rows.

        Args:
            table_name (str): Name of the table to sample from.
            num_rows (int): Amount of rows to sample.

        Returns:
            Whatever ``Sampler.sample_rows`` returns for these arguments.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        self._check_fitted()
        return self.sampler.sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Wrapper for Sampler.sample_table.

        Args:
            table_name (str): Name of the table to sample.

        Returns:
            Whatever ``Sampler.sample_table`` returns for this table.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        self._check_fitted()
        return self.sampler.sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Wrapper for Sampler.sample_all.

        Args:
            num_rows (int): Amount of rows to sample. Defaults to 5.

        Returns:
            Whatever ``Sampler.sample_all`` returns.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        self._check_fitted()
        return self.sampler.sample_all(num_rows)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path of the file the pickled instance is
                written to.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
class SDV:
    """Class to do modeling and sampling all in one.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str): Currently unused. ``fit`` always loads the
            data with ``CSVDataLoader``.
        model (type): Class of model to use.
        distribution (type): Class of distribution to use. Will be deprecated
            shortly.
        model_kwargs (dict): Keyword arguments to pass to model.
    """

    def __init__(self, meta_file_name, data_loader_type='csv',
                 model=DEFAULT_MODEL, distribution=None, model_kwargs=None):
        self.meta_file_name = meta_file_name
        # ``None`` marks the instance as not fitted; ``fit`` replaces it.
        self.sampler = None
        self.model = model
        self.distribution = distribution
        self.model_kwargs = model_kwargs

    def _check_fitted(self):
        """Raise ``NotFittedError`` unless ``fit`` has created the sampler."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

    def _check_unsupported_dataset_structure(self):
        """Checks that no table has two parents.

        Raises:
            ValueError: If any table reports more than one parent.
        """
        has_multiple_parents = any(
            len(self.dn.get_parents(table)) > 1
            for table in self.dn.tables
        )
        if has_multiple_parents:
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Transform the data and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()

        # Validate before the transformation and modeling steps run.
        self._check_unsupported_dataset_structure()

        self.dn.transform_data()

        self.modeler = Modeler(
            data_navigator=self.dn,
            model=self.model,
            distribution=self.distribution,
            model_kwargs=self.model_kwargs,
        )
        self.modeler.model_database()

        # Assigned last, so a failure above leaves the instance unfitted.
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows, reset_primary_keys=False):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name (str): Name of the table to sample from.
            num_rows (int): Amount of rows to sample.
            reset_primary_keys (bool): Whether or not to reset the pk
                generators.

        Returns:
            Whatever ``Sampler.sample_rows`` returns for these arguments.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        self._check_fitted()
        return self.sampler.sample_rows(
            table_name, num_rows, reset_primary_keys=reset_primary_keys)

    def sample_table(self, table_name, reset_primary_keys=False):
        """Samples the given table to its original size.

        Args:
            table_name (str): Table to sample.
            reset_primary_keys (bool): Whether or not to reset the pk
                generators.

        Returns:
            Whatever ``Sampler.sample_table`` returns for this table.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        self._check_fitted()
        return self.sampler.sample_table(
            table_name, reset_primary_keys=reset_primary_keys)

    def sample_all(self, num_rows=5, reset_primary_keys=False):
        """Sample the whole dataset.

        Args:
            num_rows (int): Amount of rows to sample. Defaults to 5.
            reset_primary_keys (bool): Whether or not to reset the pk
                generators.

        Returns:
            Whatever ``Sampler.sample_all`` returns.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        self._check_fitted()
        return self.sampler.sample_all(
            num_rows, reset_primary_keys=reset_primary_keys)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path to store file.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load(cls, filename):
        """Load a SDV instance from the given path.

        Warning:
            The file is read with ``pickle.load``, which can execute
            arbitrary code while unpickling. Only load files that come from
            a trusted source.

        Args:
            filename (str): Path to load model.

        Returns:
            The unpickled object. It is returned as-is; no check is made
            that it is an instance of ``cls``.
        """
        with open(filename, 'rb') as f:
            return pickle.load(f)
class SDV:
    """Single entry point that loads, models and samples a dataset.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str): Accepted but not read by this class.
    """

    def __init__(self, meta_file_name, data_loader_type='csv'):
        self.sampler = None
        self.meta_file_name = meta_file_name

    def _check_unsupported_dataset_structure(self):
        """Reject datasets in which some table has more than one parent.

        Raises:
            ValueError: If at least one table has several parents.
        """
        parent_counts = [
            len(self.dn.get_parents(name))
            for name in self.dn.tables.keys()
        ]
        if any(count > 1 for count in parent_counts):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Load the data, validate its structure, transform it and model it.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        self.dn = CSVDataLoader(self.meta_file_name).load_data()
        self._check_unsupported_dataset_structure()
        self.dn.transform_data()

        self.modeler = Modeler(self.dn)
        self.modeler.model_database()

        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows):
        """Draw `num_rows` rows for one table through the fitted sampler.

        Args:
            table_name (str): Name of the table to sample from.
            num_rows (int): Amount of rows to sample.

        Raises:
            NotFittedError: If the instance has no sampler yet.
        """
        if self.sampler is not None:
            return self.sampler.sample_rows(table_name, num_rows)

        raise NotFittedError('SDV instance has not been fitted')

    def sample_table(self, table_name):
        """Sample one table by delegating to ``Sampler.sample_table``.

        Args:
            table_name (str): Table to sample.

        Raises:
            NotFittedError: If the instance has no sampler yet.
        """
        if self.sampler is not None:
            return self.sampler.sample_table(table_name)

        raise NotFittedError('SDV instance has not been fitted')

    def sample_all(self, num_rows=5):
        """Sample the whole dataset through the fitted sampler.

        Args:
            num_rows (int): Amount of rows to sample.

        Raises:
            NotFittedError: If the instance has no sampler yet.
        """
        if self.sampler is not None:
            return self.sampler.sample_all(num_rows)

        raise NotFittedError('SDV instance has not been fitted')

    def save(self, filename):
        """Pickle this instance into the given file.

        Args:
            filename (str): Path of the file to write.
        """
        with open(filename, 'wb') as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
def test_model_database_vine_modeler_single_table(self):
    """model_database fits a VineCopula on a lone table that can then be sampled."""
    # Setup
    table_name = 'table_name'
    data_navigator = MagicMock(spec=DataNavigator)
    # The modeler is built before the navigator mock is configured.
    modeler = Modeler(data_navigator=data_navigator, model=VineCopula)

    # Setup - Mock: raw table and its per-table metadata
    raw_data = pd.DataFrame({'column_A': list('abdc'), 'column_B': range(4)})
    table_meta = {
        'name': table_name,
        'fields': {
            'column_A': {'name': 'column_A', 'type': 'categorical'},
            'column_B': {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        },
    }
    data_navigator.tables = {table_name: Table(raw_data, table_meta)}

    # The only table has neither parents nor children.
    data_navigator.get_parents.return_value = set()
    data_navigator.get_children.return_value = set()

    data_navigator.transformed_data = {
        table_name: pd.DataFrame({
            'column_A': [0.1, 0.2, 0.5, 1.0],
            'column_B': range(4),
        }),
    }

    # Dataset-level metadata lists the fields instead of keying them by name.
    data_navigator.meta = {
        'tables': [{
            'name': table_name,
            'fields': [
                {'name': 'column_A', 'type': 'categorical'},
                {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
            ],
        }],
    }

    # Setup - Mock: hyper transformer whose reverse transform is fixed
    hyper_transformer = MagicMock(spec=HyperTransformer)
    hyper_transformer.transformers = {
        (table_name, 'column_A'): None,
        (table_name, 'column_B'): None,
    }
    expected_samples = pd.DataFrame(
        {'column_A': list('bcda'), 'column_B': [1.0, 2.0, 3.0, 4.0]},
        columns=['column_A', 'column_B'],
    )
    hyper_transformer.reverse_transform_table.return_value = expected_samples
    data_navigator.ht = hyper_transformer

    # Run
    modeler.model_database()

    # Check - exactly one fitted model of the requested class
    assert len(modeler.models) == 1

    model = modeler.models[table_name]
    assert isinstance(model, VineCopula)
    assert model.fitted is True

    # Check - navigator lookups: parents once, children twice
    single_call = ((table_name, ), )
    assert data_navigator.get_parents.call_args_list == [single_call]
    assert data_navigator.get_children.call_args_list == [single_call, single_call]

    assert modeler.tables[table_name].equals(
        modeler.dn.transformed_data[table_name])

    # Check - sampling yields the reverse-transformed frame
    sampler = Sampler(data_navigator, modeler)
    samples = sampler.sample_table(table_name)

    assert samples.equals(expected_samples)
def test_model_database_gaussian_copula_single_table(self):
    """model_database can model a single table with the Gaussian copula model."""
    # Setup
    data_navigator = MagicMock(spec=DataNavigator)
    # The modeler is built before the navigator mock is configured.
    modeler = Modeler(data_navigator=data_navigator, model=GaussianMultivariate)

    # Setup - Mocks - DataNavigator
    categorical_field = {'name': 'column_A', 'type': 'categorical'}
    integer_field = {'name': 'column_B', 'type': 'number', 'subtype': 'integer'}

    table = Table(
        pd.DataFrame({'column_A': list('abdc'), 'column_B': range(4)}),
        {
            'name': 'table_name',
            # Separate copies, so nothing is shared with the dataset-level
            # metadata built further down.
            'fields': {
                'column_A': dict(categorical_field),
                'column_B': dict(integer_field),
            },
        },
    )
    data_navigator.tables = {'table_name': table}

    # The only table has neither parents nor children.
    data_navigator.get_parents.return_value = set()
    data_navigator.get_children.return_value = set()

    data_navigator.transformed_data = {
        'table_name': pd.DataFrame({
            'column_A': [0.1, 0.2, 0.5, 1.0],
            'column_B': range(4),
        }),
    }

    data_navigator.meta = {
        'tables': [{
            'name': 'table_name',
            'fields': [dict(categorical_field), dict(integer_field)],
        }],
    }

    # Setup - Mocks - HyperTransformer
    ht = MagicMock(spec=HyperTransformer)
    ht.transformers = {
        ('table_name', column): None
        for column in ('column_A', 'column_B')
    }
    ht.reverse_transform_table.return_value = pd.DataFrame(
        {'column_A': list('bcda'), 'column_B': [1.0, 2.0, 3.0, 4.0]},
        columns=['column_A', 'column_B'],
    )
    data_navigator.ht = ht

    # Run
    modeler.model_database()

    # Check - one fitted Gaussian model registered under the table name
    assert len(modeler.models) == 1
    assert 'table_name' in modeler.models

    model = modeler.models['table_name']
    assert isinstance(model, GaussianMultivariate)
    assert model.distribution == 'copulas.univariate.gaussian.GaussianUnivariate'
    assert model.fitted is True

    # Check - navigator lookups: parents once, children twice
    expected_call = (('table_name', ), )
    assert data_navigator.get_parents.call_args_list == [expected_call]
    assert data_navigator.get_children.call_args_list == [expected_call] * 2

    assert modeler.tables['table_name'].equals(
        modeler.dn.transformed_data['table_name'])

    # Check - sampling yields the frame produced by the reverse transform
    sampler = Sampler(data_navigator, modeler)
    samples = sampler.sample_table('table_name')

    assert isinstance(samples, pd.DataFrame)
    assert samples.equals(
        sampler.dn.ht.reverse_transform_table.return_value)