Example #1
    def test_sample_all(self):
        """Test sample all regenerating the primary keys"""

        # Setup
        def sample_side_effect(table, num_rows):
            return {table: pd.DataFrame({'foo': range(num_rows)})}

        sampler = Mock(spec=Sampler)
        sampler.metadata.get_tables.return_value = [
            'table a', 'table b', 'table c'
        ]
        sampler.metadata.get_parents.side_effect = [False, True, False]
        sampler.sample.side_effect = sample_side_effect

        # Run
        result = Sampler.sample_all(sampler,
                                    num_rows=3,
                                    reset_primary_keys=True)

        # Asserts
        assert sampler.metadata.get_parents.call_count == 3
        assert sampler._reset_primary_keys_generators.call_count == 1
        pd.testing.assert_frame_equal(result['table a'],
                                      pd.DataFrame({'foo': range(3)}))
        pd.testing.assert_frame_equal(result['table c'],
                                      pd.DataFrame({'foo': range(3)}))
Example #2
    def test_sample_all(self, rows_mock):
        """Check sample_all and returns some value."""
        # Setup
        data_navigator = MagicMock()
        data_navigator.tables = ['TABLE_A', 'TABLE_B']
        data_navigator.get_parents.side_effect = lambda x: x != 'TABLE_A'
        modeler = MagicMock()
        sampler = Sampler(data_navigator, modeler)

        def fake_dataframe(*args, **kwargs):
            kwargs['sampled_data'][args[1]] = 'sampled_data'

        rows_mock.side_effect = fake_dataframe

        expected_get_parents_call_list = [(('TABLE_A', ), {}),
                                          (('TABLE_B', ), {})]
        expected_result = {'TABLE_A': 'sampled_data'}

        # Run
        result = sampler.sample_all(num_rows=5)

        # Check
        assert result == expected_result

        assert data_navigator.get_parents.call_args_list == expected_get_parents_call_list
        rows_mock.assert_called_once_with(
            sampler, 'TABLE_A', 5, sampled_data={'TABLE_A': 'sampled_data'})
Example #3
    def test_sample_all(self, rows_mock, child_mock, reset_mock, concat_mock):
        """Check sample_all and returns some value."""
        # Setup
        data_navigator = MagicMock()
        data_navigator.tables = ['TABLE_A', 'TABLE_B']
        data_navigator.get_parents.side_effect = lambda x: x != 'TABLE_A'
        modeler = MagicMock()
        sampler = Sampler(data_navigator, modeler)

        def fake_dataframe(name, number):
            return pd.DataFrame([{name: 0} for i in range(number)], index=[0]*number)

        rows_mock.side_effect = fake_dataframe
        concat_mock.return_value = 'concatenated_dataframe'

        expected_get_parents_call_list = [(('TABLE_A',), {}), (('TABLE_B',), {})]
        expected_rows_mock_call_list = [(('TABLE_A', 1), {}) for i in range(5)]

        # Run
        result = sampler.sample_all(num_rows=5)

        # Check
        assert data_navigator.get_parents.call_args_list == expected_get_parents_call_list
        assert result == reset_mock.return_value

        assert rows_mock.call_args_list == expected_rows_mock_call_list
        assert child_mock.call_count == 5
        reset_mock.assert_called_once_with({'TABLE_A': 'concatenated_dataframe'})
Example #4
    def test_model_database_vine_modeler_single_table(self):
        """model_database works fine with vine modeler."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator, model=VineCopula)

        # Setup - Mock
        data = pd.DataFrame({
            'column_A': list('abdc'),
            'column_B': range(4)
        })
        meta = {
            'name': 'table_name',
            'fields': {
                'column_A': {
                    'name': 'A',
                    'type': 'categorical'
                },
                'column_B': {
                    'name': 'B',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        data_navigator.tables = {
            'table_name': Table(data, meta)
        }
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {
            'table_name': pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4)
            })
        }
        data_navigator.meta = {
            'tables': [
                {
                    'name': meta
                }
            ]
        }
        data_navigator.ht = MagicMock()
        data_navigator.ht.transformers = {
            ('table_name', 'column_A'): None,
            ('table_name', 'column_B'): None
        }

        # Run
        modeler.model_database()

        # Check
        assert 'table_name' in modeler.models

        sampler = Sampler(data_navigator, modeler)
        samples = sampler.sample_all()
        assert 'table_name' in samples
Example #5
    def test_sample_all_with_reset_primary_key(self):
        """Check sample_all with reset_primary_keys True"""

        # Setup
        reset_primary_keys_generators_mock = Mock()

        dn_mock = Mock()
        dn_mock.tables = {'DEMO': Table(pd.DataFrame(), {'some': 'meta'})}
        dn_mock.get_parents.return_value = True

        # Run
        sampler_mock = Mock()
        sampler_mock._reset_primary_keys_generators = reset_primary_keys_generators_mock
        sampler_mock.dn = dn_mock

        Sampler.sample_all(sampler_mock, reset_primary_keys=True)

        # Asserts
        reset_primary_keys_generators_mock.assert_called_once_with()
Example #6
class SDV:
    """Class to do modeling and sampling all in one."""
    def __init__(self, meta_file_name, data_loader_type='csv'):
        """Initialize sdv class."""
        self.meta_file_name = meta_file_name
        self.sampler = None

    def fit(self):
        """Transform the data and model the database."""
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()
        # transform data
        self.dn.transform_data()
        self.modeler = Modeler(self.dn)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows):
        """Wrapper for Sampler.sample_rows."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Wrapper for Sampler.sample_table."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Wrapper for Sampler.sample_all."""
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')
        return self.sampler.sample_all(num_rows)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path to store the file.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
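
A minimal usage sketch for the wrapper above (not part of the original listing), assuming the package is importable as "sdv" and using 'meta.json' as a purely illustrative metadata path:

    # Hypothetical end-to-end usage of the SDV wrapper defined above.
    from sdv import SDV

    sdv = SDV('meta.json')                  # illustrative path to the metadata file
    sdv.fit()                               # load, transform and model the database
    samples = sdv.sample_all(num_rows=10)   # dict: table name -> sampled DataFrame
    sdv.save('sdv.pkl')                     # pickle the fitted instance for reuse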
Example #7
    def test_model_database_gaussian_copula_single_table(self):
        """model_database can model a single table using the gausian copula model."""
        # Setup
        data_navigator = MagicMock(spec=DataNavigator)
        modeler = Modeler(data_navigator=data_navigator,
                          model=GaussianMultivariate)

        # Setup - Mocks - DataNavigator
        table_data = pd.DataFrame({
            'column_A': list('abdc'),
            'column_B': range(4)
        })
        table_metadata = {
            'name': 'table_name',
            'fields': {
                'column_A': {
                    'name': 'column_A',
                    'type': 'categorical'
                },
                'column_B': {
                    'name': 'column_B',
                    'type': 'number',
                    'subtype': 'integer'
                }
            }
        }

        data_navigator.tables = {
            'table_name': Table(table_data, table_metadata)
        }
        data_navigator.get_parents.return_value = set()
        data_navigator.get_children.return_value = set()
        data_navigator.transformed_data = {
            'table_name':
            pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4)
            })
        }
        metadata = {
            'name':
            'table_name',
            'fields': [{
                'name': 'column_A',
                'type': 'categorical'
            }, {
                'name': 'column_B',
                'type': 'number',
                'subtype': 'integer'
            }]
        }

        data_navigator.meta = {'tables': [metadata]}
        data_navigator.ht = MagicMock()
        data_navigator.ht.transformers = {
            ('table_name', 'column_A'): None,
            ('table_name', 'column_B'): None
        }

        # Run
        modeler.model_database()

        # Check
        assert 'table_name' in modeler.models

        sampler = Sampler(data_navigator, modeler)
        samples = sampler.sample_all()
        assert 'table_name' in samples
Example #8
File: sdv.py Project: ush19/SDV
class SDV:
    """Class to do modeling and sampling all in one.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str): Type of data loader to use. Defaults to 'csv'.
        model (type): Class of model to use.
        distribution (type): Class of distribution to use. Will be deprecated shortly.
        model_kwargs (dict): Keyword arguments to pass to model.

    """
    def __init__(self,
                 meta_file_name,
                 data_loader_type='csv',
                 model=DEFAULT_MODEL,
                 distribution=None,
                 model_kwargs=None):
        self.meta_file_name = meta_file_name
        self.sampler = None
        self.model = model
        self.distribution = distribution
        self.model_kwargs = model_kwargs

    def _check_unsupported_dataset_structure(self):
        """Checks that no table has two parents."""
        tables = self.dn.tables.keys()
        amount_parents = [
            len(self.dn.get_parents(table)) <= 1 for table in tables
        ]
        if not all(amount_parents):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Transform the data and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()

        self._check_unsupported_dataset_structure()

        self.dn.transform_data()
        self.modeler = Modeler(data_navigator=self.dn,
                               model=self.model,
                               distribution=self.distribution,
                               model_kwargs=self.model_kwargs)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows, reset_primary_keys=False):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name(str): Name of the table to sample from.
            num_rows(int): Amount of rows to sample.
            reset_primary_keys(bool): Whether or not to reset the primary key generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_rows(table_name,
                                        num_rows,
                                        reset_primary_keys=reset_primary_keys)

    def sample_table(self, table_name, reset_primary_keys=False):
        """Samples the given table to its original size.

        Args:
            table_name (str): Table to sample.
            reset_primary_keys(bool): Whether or not to reset the primary key generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_table(table_name,
                                         reset_primary_keys=reset_primary_keys)

    def sample_all(self, num_rows=5, reset_primary_keys=False):
        """Sample the whole dataset.

        Args:
            num_rows(int): Amount of rows to sample.
            reset_primary_keys(bool): Whether or not to reset the primary key generators.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows,
                                       reset_primary_keys=reset_primary_keys)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename(str): Path to store the file.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load(cls, filename):
        """Load a SDV instance from the given path.

        Args:
            filename(str): Path from which to load the SDV instance.

        """
        with open(filename, 'rb') as f:
            instance = pickle.load(f)

        return instance
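
The save/load pair above supports a fit-once, sample-later workflow. A short sketch with hypothetical file and table names, assuming an instance was previously fitted and saved as shown:

    # Hypothetical: restore a pickled instance and resample with fresh primary keys.
    from sdv import SDV

    sdv = SDV.load('sdv.pkl')                        # classmethod defined above
    tables = sdv.sample_all(num_rows=100, reset_primary_keys=True)
    orders = sdv.sample_rows('DEMO_ORDERS', 10)      # table name is illustrative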
Example #9
class SDV:
    """Automated generative modeling and sampling tool.

    Allows users to generate synthetic data after creating generative models for their data.

    Args:
        model (type):
            Class of the ``copula`` to use. Defaults to
            ``sdv.models.copulas.GaussianCopula``.
        model_kwargs (dict):
            Keyword arguments to pass to the model. Defaults to ``None``.
    """

    sampler = None

    def __init__(self, model=DEFAULT_MODEL, model_kwargs=None):
        self.model = model
        if model_kwargs is None:
            self.model_kwargs = DEFAULT_MODEL_KWARGS.copy()
        else:
            self.model_kwargs = model_kwargs

    def fit(self, metadata, tables=None, root_path=None):
        """Fit this SDV instance to the dataset data.

        Args:
            metadata (dict, str or Metadata):
                Metadata dict, path to the metadata JSON file or Metadata instance itself.
            tables (dict):
                Dictionary with the table names as key and ``pandas.DataFrame`` instances as
                values.  If ``None`` is given, the tables will be loaded from the paths
                indicated in ``metadata``. Defaults to ``None``.
            root_path (str or None):
                Path to the dataset directory. If ``None`` and metadata is
                a path, the metadata location is used. If ``None`` and
                metadata is a dict, the current working directory is used.
        """

        if isinstance(metadata, Metadata):
            self.metadata = metadata
        else:
            self.metadata = Metadata(metadata, root_path)

        self.metadata.validate(tables)

        self.modeler = Modeler(self.metadata, self.model, self.model_kwargs)
        self.modeler.model_database(tables)
        self.sampler = Sampler(self.metadata, self.modeler.models, self.model,
                               self.model_kwargs)

    def sample(self,
               table_name,
               num_rows,
               sample_children=True,
               reset_primary_keys=False):
        """Sample ``num_rows`` rows from the indicated table.

        Args:
            table_name (str):
                Name of the table to sample from.
            num_rows (int):
                Amount of rows to sample.
            sample_children (bool):
                Whether or not to sample children tables. Defaults to ``True``.
            reset_primary_keys (bool):
                Whether or not to reset the primary key generators. Defaults to ``False``.

        Returns:
            pandas.DataFrame:
                Sampled data with the number of rows specified in ``num_rows``.

        Raises:
            NotFittedError:
                A ``NotFittedError`` is raised when the ``SDV`` instance has not been fitted yet.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample(table_name,
                                   num_rows,
                                   sample_children=sample_children,
                                   reset_primary_keys=reset_primary_keys)

    def sample_all(self, num_rows=5, reset_primary_keys=False):
        """Sample the entire dataset.

        Args:
            num_rows (int):
                Amount of rows to sample. Defaults to ``5``.
            reset_primary_keys (bool):
                Whether or not to reset the primary key generators. Defaults to ``False``.

        Returns:
            dict:
                Tables sampled.

        Raises:
            NotFittedError:
                A ``NotFittedError`` is raised when the ``SDV`` instance has not been fitted yet.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows,
                                       reset_primary_keys=reset_primary_keys)

    def save(self, path):
        """Save this SDV instance to the given path using pickle.

        Args:
            path (str):
                Path where the SDV instance will be serialized.
        """
        with open(path, 'wb') as output:
            pickle.dump(self, output)

    @classmethod
    def load(cls, path):
        """Load a SDV instance from a given path.

        Args:
            path (str):
                Path from which to load the SDV instance.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)
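
In this newer API the metadata and tables are passed to fit rather than to the constructor. A minimal sketch, using 'metadata.json' and the table name 'users' purely for illustration:

    # Hypothetical usage of the fit/sample API defined above.
    from sdv import SDV

    sdv = SDV()
    sdv.fit('metadata.json')        # tables are loaded from the paths in the metadata
    users = sdv.sample('users', num_rows=10, reset_primary_keys=True)
    all_tables = sdv.sample_all(num_rows=10)
    sdv.save('sdv.pkl')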
Example #10
class SDV:
    """Class to do modeling and sampling all in one.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str): Type of data loader to use. Defaults to 'csv'.
    """
    def __init__(self, meta_file_name, data_loader_type='csv'):
        self.meta_file_name = meta_file_name
        self.sampler = None

    def _check_unsupported_dataset_structure(self):
        """Checks that no table has two parents."""
        tables = self.dn.tables.keys()
        amount_parents = [
            len(self.dn.get_parents(table)) <= 1 for table in tables
        ]
        if not all(amount_parents):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Transform the data and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        data_loader = CSVDataLoader(self.meta_file_name)
        self.dn = data_loader.load_data()

        self._check_unsupported_dataset_structure()

        self.dn.transform_data()
        self.modeler = Modeler(self.dn)
        self.modeler.model_database()
        self.sampler = Sampler(self.dn, self.modeler)

    def sample_rows(self, table_name, num_rows):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name(str): Name of the table to sample from.
            num_rows(int): Amount of rows to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Samples the given table to its original size.

        Args:
            table_name (str): Table to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Sample the whole dataset.

        Args:
            num_rows (int): Amount of rows to sample.
        """
        if self.sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return self.sampler.sample_all(num_rows)

    def save(self, filename):
        """Save SDV instance to file destination.

        Args:
            filename (str): Path to store the file.
        """
        with open(filename, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
Example #11
class TestSampler(TestCase):
    @classmethod
    def setUpClass(cls):
        data_loader = CSVDataLoader('tests/data/meta.json')
        cls.data_navigator = data_loader.load_data()
        cls.data_navigator.transform_data()

        cls.modeler = Modeler(cls.data_navigator)
        cls.modeler.model_database()

    def setUp(self):
        self.sampler = Sampler(self.data_navigator, self.modeler)

    def test_sample_rows_parent_table(self):
        """sample_rows samples new rows for the given table."""
        # Setup
        raw_data = self.modeler.dn.tables['DEMO_CUSTOMERS'].data

        # Run
        result = self.sampler.sample_rows('DEMO_CUSTOMERS', 5)

        # Check
        assert result.shape[0] == 5
        assert (result.columns == raw_data.columns).all()

        # Primary key columns are sampled values
        assert len(result['CUSTOMER_ID'].unique()) != 1

    def test_sample_rows_children_table(self):
        """sample_rows samples new rows for the given table."""
        # Setup
        raw_data = self.modeler.dn.tables['DEMO_ORDERS'].data
        # Sampling parent table.
        self.sampler.sample_rows('DEMO_CUSTOMERS', 5)

        # Run
        result = self.sampler.sample_rows('DEMO_ORDERS', 5)

        # Check
        assert result.shape[0] == 5
        assert (result.columns == raw_data.columns).all()

        # Foreign key columns are all the same
        unique_foreign_keys = result['CUSTOMER_ID'].unique()
        sampled_parent = self.sampler.sampled['DEMO_CUSTOMERS'][0][1]
        assert len(unique_foreign_keys) == 1
        assert unique_foreign_keys[0] in sampled_parent['CUSTOMER_ID'].values

    def test_sample_all(self):
        """Check sample_all and returns some value."""

        # Run
        result = self.sampler.sample_all(num_rows=5)

        # Check
        assert result.keys() == self.sampler.dn.tables.keys()

        for name, table in result.items():
            with self.subTest(table=name):
                raw_data = self.modeler.dn.tables[name].data
                assert (table.columns == raw_data.columns).all()

                if not self.sampler.dn.get_parents(name):
                    primary_key = self.sampler.dn.get_meta_data(
                        name)['primary_key']
                    assert len(table) == 5
                    assert len(table[primary_key].unique()) == 5
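
Outside the test harness, the same flow the fixtures above set up can be written directly. A sketch using the test metadata file shown above; the module paths in the imports are assumptions, not taken from the original listing:

    # Hypothetical standalone version of the setUpClass/setUp flow above;
    # import paths are assumed.
    from sdv.data_navigator import CSVDataLoader
    from sdv.modeler import Modeler
    from sdv.sampler import Sampler

    data_navigator = CSVDataLoader('tests/data/meta.json').load_data()
    data_navigator.transform_data()

    modeler = Modeler(data_navigator)
    modeler.model_database()

    sampler = Sampler(data_navigator, modeler)
    samples = sampler.sample_all(num_rows=5)   # dict: table name -> sampled DataFrame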