Esempio n. 1
0
    def test_sample_table(self, rows_mock):
        """sample_table requests as many rows as the stored table reports.

        ``rows_mock`` is injected by a patch decorator that is not visible in
        this excerpt; the final assertion suggests it stands in for
        ``Sampler.sample_rows`` -- confirm against the decorator.
        """
        # Setup
        navigator = MagicMock(spec=DataNavigator)
        # ``data.shape`` holds sentinel strings: the first item ('rows') must
        # reach the patched callable untouched as the number of rows.
        navigator.tables = {
            'table': MagicMock(**{'data.shape': ('rows', 'columns')}),
        }
        sampler = Sampler(
            data_navigator=navigator,
            modeler=MagicMock(spec=Modeler),
        )

        # The patched callable answers with a dict keyed by table name; only
        # the entry for the requested table is expected back.
        rows_mock.return_value = {'table': 'samples'}

        # Run
        result = sampler.sample_table('table', reset_primary_keys=False)

        # Check
        assert result == 'samples'

        rows_mock.assert_called_once_with(
            sampler,
            'table',
            'rows',
            sample_children=False,
            reset_primary_keys=False,
        )
Esempio n. 2
0
class SDV:
    """Facade that loads a dataset, models it and samples synthetic data.

    Args:
        meta_file_name (str): Path to the metadata file given to the loader.
        data_loader_type (str): Not used by this class; ``fit`` always
            builds a ``CSVDataLoader``.
    """

    def __init__(self, meta_file_name, data_loader_type='csv'):
        """Remember the metadata path; nothing is loaded until ``fit``."""
        self.meta_file_name = meta_file_name
        # ``None`` doubles as the "not fitted yet" marker for the samplers.
        self.sampler = None

    def _fitted_sampler(self):
        """Return the sampler, raising ``NotFittedError`` before ``fit``."""
        sampler = self.sampler
        if sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return sampler

    def fit(self):
        """Load the data, transform it, model the database, build the sampler."""
        loader = CSVDataLoader(self.meta_file_name)
        navigator = loader.load_data()
        self.dn = navigator

        # Transform before modeling so the modeler sees the transformed data.
        navigator.transform_data()

        modeler = Modeler(navigator)
        self.modeler = modeler
        modeler.model_database()

        self.sampler = Sampler(navigator, modeler)

    def sample_rows(self, table_name, num_rows):
        """Delegate to ``Sampler.sample_rows``.

        Args:
            table_name (str): Name of the table to sample from.
            num_rows (int): Amount of rows to sample.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        return self._fitted_sampler().sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Delegate to ``Sampler.sample_table``.

        Args:
            table_name (str): Name of the table to sample.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        return self._fitted_sampler().sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Delegate to ``Sampler.sample_all``.

        Args:
            num_rows (int): Amount of rows to sample. Defaults to 5.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        return self._fitted_sampler().sample_all(num_rows)

    def save(self, filename):
        """Pickle this SDV instance into ``filename``.

        Args:
            filename (str): Path of the file to write.
        """
        with open(filename, 'wb') as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
Esempio n. 3
0
File: sdv.py Progetto: ush19/SDV
class SDV:
    """Facade that loads a dataset, models it and samples synthetic data.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str): Not used by this class; ``fit`` always
            builds a ``CSVDataLoader``.
        model (type): Class of model to use.
        distribution (type): Class of distribution to use. Will be deprecated shortly.
        model_kwargs (dict): Keyword arguments to pass to model.

    """

    def __init__(self,
                 meta_file_name,
                 data_loader_type='csv',
                 model=DEFAULT_MODEL,
                 distribution=None,
                 model_kwargs=None):
        self.meta_file_name = meta_file_name
        # ``None`` doubles as the "not fitted yet" marker for the samplers.
        self.sampler = None
        # Modeling options are only stored here; ``fit`` hands them to Modeler.
        self.model = model
        self.distribution = distribution
        self.model_kwargs = model_kwargs

    def _fitted_sampler(self):
        """Return the sampler, raising ``NotFittedError`` before ``fit``."""
        sampler = self.sampler
        if sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return sampler

    def _check_unsupported_dataset_structure(self):
        """Raise ``ValueError`` if any loaded table has more than one parent."""
        # Every table is queried before deciding, so ``get_parents`` is
        # called once per table even when an early one already fails.
        parent_counts = [
            len(self.dn.get_parents(name)) for name in self.dn.tables.keys()
        ]
        if any(count > 1 for count in parent_counts):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Load the data, validate it, transform it and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        loader = CSVDataLoader(self.meta_file_name)
        self.dn = loader.load_data()

        # Validate the structure before any transformation or modeling.
        self._check_unsupported_dataset_structure()

        navigator = self.dn
        navigator.transform_data()

        modeler = Modeler(
            data_navigator=navigator,
            model=self.model,
            distribution=self.distribution,
            model_kwargs=self.model_kwargs,
        )
        self.modeler = modeler
        modeler.model_database()

        self.sampler = Sampler(navigator, modeler)

    def sample_rows(self, table_name, num_rows, reset_primary_keys=False):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name (str): Name of the table to sample from.
            num_rows (int): Amount of rows to sample.
            reset_primary_keys (bool): Whether or not to reset the pk generators.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        sampler = self._fitted_sampler()
        return sampler.sample_rows(
            table_name, num_rows, reset_primary_keys=reset_primary_keys)

    def sample_table(self, table_name, reset_primary_keys=False):
        """Sample the given table to its original size.

        Args:
            table_name (str): Table to sample.
            reset_primary_keys (bool): Whether or not to reset the pk generators.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        sampler = self._fitted_sampler()
        return sampler.sample_table(
            table_name, reset_primary_keys=reset_primary_keys)

    def sample_all(self, num_rows=5, reset_primary_keys=False):
        """Sample the whole dataset.

        Args:
            num_rows (int): Amount of rows to sample. Defaults to 5.
            reset_primary_keys (bool): Whether or not to reset the pk generators.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        sampler = self._fitted_sampler()
        return sampler.sample_all(
            num_rows, reset_primary_keys=reset_primary_keys)

    def save(self, filename):
        """Pickle this SDV instance into ``filename``.

        Args:
            filename (str): Path of the file to write.
        """
        with open(filename, 'wb') as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load(cls, filename):
        """Load a pickled SDV instance from the given path.

        Args:
            filename (str): Path of the file to read.

        Returns:
            Whatever object was pickled in ``filename``; its type is not
            checked against ``cls``.
        """
        # NOTE(review): unpickling can execute arbitrary code, so only load
        # files that come from a trusted source.
        with open(filename, 'rb') as handle:
            return pickle.load(handle)
Esempio n. 4
0
class SDV:
    """Facade that loads a dataset, models it and samples synthetic data.

    Args:
        meta_file_name (str): Path to the metadata file.
        data_loader_type (str): Not used by this class; ``fit`` always
            builds a ``CSVDataLoader``.
    """

    def __init__(self, meta_file_name, data_loader_type='csv'):
        self.meta_file_name = meta_file_name
        # ``None`` doubles as the "not fitted yet" marker for the samplers.
        self.sampler = None

    def _fitted_sampler(self):
        """Return the sampler, raising ``NotFittedError`` before ``fit``."""
        sampler = self.sampler
        if sampler is None:
            raise NotFittedError('SDV instance has not been fitted')

        return sampler

    def _check_unsupported_dataset_structure(self):
        """Raise ``ValueError`` if any loaded table has more than one parent."""
        # Every table is queried before deciding, so ``get_parents`` is
        # called once per table even when an early one already fails.
        parent_counts = [
            len(self.dn.get_parents(name)) for name in self.dn.tables.keys()
        ]
        if any(count > 1 for count in parent_counts):
            raise ValueError(
                'Some tables have multiple parents, which is not supported yet.'
            )

    def fit(self):
        """Load the data, validate it, transform it and model the database.

        Raises:
            ValueError: If the provided dataset has an unsupported structure.
        """
        loader = CSVDataLoader(self.meta_file_name)
        self.dn = loader.load_data()

        # Validate the structure before any transformation or modeling.
        self._check_unsupported_dataset_structure()

        navigator = self.dn
        navigator.transform_data()

        modeler = Modeler(navigator)
        self.modeler = modeler
        modeler.model_database()

        self.sampler = Sampler(navigator, modeler)

    def sample_rows(self, table_name, num_rows):
        """Sample `num_rows` rows from the given table.

        Args:
            table_name (str): Name of the table to sample from.
            num_rows (int): Amount of rows to sample.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        return self._fitted_sampler().sample_rows(table_name, num_rows)

    def sample_table(self, table_name):
        """Sample the given table to its original size.

        Args:
            table_name (str): Table to sample.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        return self._fitted_sampler().sample_table(table_name)

    def sample_all(self, num_rows=5):
        """Sample the whole dataset.

        Args:
            num_rows (int): Amount of rows to sample. Defaults to 5.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        return self._fitted_sampler().sample_all(num_rows)

    def save(self, filename):
        """Pickle this SDV instance into ``filename``.

        Args:
            filename (str): Path of the file to write.
        """
        with open(filename, 'wb') as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
Esempio n. 5
0
    def test_model_database_vine_modeler_single_table(self):
        """model_database fits a VineCopula on a lone table that can then be sampled."""
        # Setup
        navigator = MagicMock(spec=DataNavigator)
        # The modeler is created before the navigator is configured, in case
        # its constructor reads the navigator.
        modeler = Modeler(data_navigator=navigator, model=VineCopula)

        # Setup - Mock
        table_name = 'table_name'

        raw_data = pd.DataFrame({'column_A': list('abdc'), 'column_B': range(4)})
        # Table-level metadata keeps its fields in a dict keyed by column name...
        table_meta = {
            'name': table_name,
            'fields': {
                'column_A': {'name': 'column_A', 'type': 'categorical'},
                'column_B': {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
            },
        }
        # ...while the dataset-level metadata lists the same fields.
        dataset_meta = {
            'name': table_name,
            'fields': [
                {'name': 'column_A', 'type': 'categorical'},
                {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
            ],
        }

        navigator.tables = {table_name: Table(raw_data, table_meta)}
        navigator.get_parents.return_value = set()
        navigator.get_children.return_value = set()
        navigator.transformed_data = {
            table_name: pd.DataFrame({
                'column_A': [0.1, 0.2, 0.5, 1.0],
                'column_B': range(4),
            }),
        }
        navigator.meta = {'tables': [dataset_meta]}

        # The sampler's output is expected to be exactly what the mocked
        # reverse transform returns.
        expected_samples = pd.DataFrame(
            {'column_A': list('bcda'), 'column_B': [1.0, 2.0, 3.0, 4.0]},
            columns=['column_A', 'column_B'],
        )

        hyper_transformer = MagicMock(spec=HyperTransformer)
        hyper_transformer.transformers = dict.fromkeys([
            (table_name, 'column_A'),
            (table_name, 'column_B'),
        ])
        hyper_transformer.reverse_transform_table.return_value = expected_samples
        navigator.ht = hyper_transformer

        # Run
        modeler.model_database()

        # Check
        assert len(modeler.models) == 1
        fitted_model = modeler.models[table_name]
        assert isinstance(fitted_model, VineCopula)
        assert fitted_model.fitted is True

        # One positional-only call with the table name.
        lookup_call = ((table_name, ), )
        assert navigator.get_parents.call_args_list == [lookup_call]
        assert navigator.get_children.call_args_list == [lookup_call, lookup_call]

        transformed = modeler.dn.transformed_data[table_name]
        assert modeler.tables[table_name].equals(transformed)

        sampler = Sampler(navigator, modeler)
        samples = sampler.sample_table(table_name)

        assert samples.equals(expected_samples)
Esempio n. 6
0
    def test_model_database_gaussian_copula_single_table(self):
        """model_database can model a single table using the gaussian copula model."""
        # Setup
        navigator = MagicMock(spec=DataNavigator)
        # The modeler is created before the navigator is configured, in case
        # its constructor reads the navigator.
        modeler = Modeler(data_navigator=navigator, model=GaussianMultivariate)

        # Setup - Mocks - DataNavigator
        name = 'table_name'
        columns = ['column_A', 'column_B']

        original = pd.DataFrame({'column_A': list('abdc'), 'column_B': range(4)})
        # Table-level metadata keeps its fields in a dict keyed by column name.
        fields_by_name = {
            'column_A': {'name': 'column_A', 'type': 'categorical'},
            'column_B': {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        }
        navigator.tables = {
            name: Table(original, {'name': name, 'fields': fields_by_name}),
        }

        navigator.get_parents.return_value = set()
        navigator.get_children.return_value = set()

        transformed = pd.DataFrame({
            'column_A': [0.1, 0.2, 0.5, 1.0],
            'column_B': range(4),
        })
        navigator.transformed_data = {name: transformed}

        # Dataset-level metadata lists the same fields instead.
        field_list = [
            {'name': 'column_A', 'type': 'categorical'},
            {'name': 'column_B', 'type': 'number', 'subtype': 'integer'},
        ]
        navigator.meta = {'tables': [{'name': name, 'fields': field_list}]}

        # Setup - Mocks - HyperTransformer
        reversed_frame = pd.DataFrame(
            {'column_A': list('bcda'), 'column_B': [1.0, 2.0, 3.0, 4.0]},
            columns=columns,
        )
        transformer_mock = MagicMock(spec=HyperTransformer)
        transformer_mock.transformers = {(name, column): None for column in columns}
        transformer_mock.reverse_transform_table.return_value = reversed_frame
        navigator.ht = transformer_mock

        # Run
        modeler.model_database()

        # Check
        models = modeler.models
        assert len(models) == 1
        assert 'table_name' in models

        model = models[name]
        assert isinstance(model, GaussianMultivariate)
        assert model.distribution == 'copulas.univariate.gaussian.GaussianUnivariate'
        assert model.fitted is True

        # One positional-only call with the table name.
        expected_call = ((name, ), )
        assert navigator.get_parents.call_args_list == [expected_call]
        assert navigator.get_children.call_args_list == [expected_call] * 2

        assert modeler.tables[name].equals(modeler.dn.transformed_data[name])

        sampler = Sampler(navigator, modeler)
        samples = sampler.sample_table(name)

        # The sampled frame must be what the mocked reverse transform returns.
        assert isinstance(samples, pd.DataFrame)
        assert samples.equals(
            sampler.dn.ht.reverse_transform_table.return_value)