Exemple #1
0
    def __init__(self, data_navigator, model=DEFAULT_MODEL, distribution=None, model_kwargs=None):
        """Initialize the modeler state and resolve the model keyword arguments.

        Args:
            data_navigator: Object stored as ``self.dn``.
            model: Model to use, stored as ``self.model``. Defaults to
                ``DEFAULT_MODEL``.
            distribution: Optional distribution. Only accepted when ``model``
                is ``DEFAULT_MODEL``; ``DEFAULT_DISTRIBUTION`` is used when it
                is omitted.
            model_kwargs (dict): Optional keyword arguments, stored as
                ``self.model_kwargs``. When empty or ``None`` they are derived
                from ``model``.

        Raises:
            ValueError: If ``distribution`` is given together with a model
                other than ``DEFAULT_MODEL``.
        """
        self.dn = data_navigator
        self.model = model
        self.tables = {}
        self.models = {}
        self.child_locs = {}  # maps table->{child: col #}

        if distribution and model != DEFAULT_MODEL:
            raise ValueError(
                '`distribution` argument is only suported for `GaussianMultivariate` model.')

        # Fall back to the default distribution when none was requested.
        distribution = get_qualified_name(distribution or DEFAULT_DISTRIBUTION)

        if not model_kwargs:
            # Only the default model takes a distribution; any other model
            # gets the regular vine type instead.
            model_kwargs = (
                {'distribution': distribution}
                if model == DEFAULT_MODEL
                else {'vine_type': TreeTypes.REGULAR}
            )

        self.model_kwargs = model_kwargs
Exemple #2
0
    def _make_model_from_params(self, parent_row, table_name, parent_name):
        """Rebuild a child-table model from the parameters stored in a parent row.

        Args:
            parent_row (dataframe): a generated parent row
            table_name (string): name of table to make model for
            parent_name (string): name of parent table

        Returns:
            The object returned by ``self.modeler.model.from_dict`` for the
            rebuilt parameters, or ``None`` when no column range is registered
            for ``table_name`` under ``parent_name``.
        """
        child_range = self.modeler.child_locs.get(parent_name, {}).get(table_name, {})
        if not child_range:
            return None

        # Select the parent columns that hold this child's parameters.
        first, last = child_range[0], child_range[1]
        params = parent_row.loc[:, list(range(first, last))]
        total_columns = params.shape[1]

        table = self.modeler.tables[table_name]
        num_cols = table.shape[1]
        # NOTE: the parent's primary key is not removed from the labels.
        labels = list(table.columns)

        # The first num_cols ** 2 parameters are the flattened covariance matrix.
        cov_size = num_cols ** 2
        covariance = params.iloc[:, 0:cov_size].values.reshape((num_cols, num_cols))

        # The remaining parameters come in (std, mean) pairs, one pair per label.
        distributions = {}
        for position, offset in enumerate(range(cov_size, total_columns, 2)):
            distributions[labels[position]] = {
                'type': get_qualified_name(self.modeler.distribution),
                'fitted': True,
                # std is forced non-negative; pending on
                # https://github.com/HDI-Project/SDV/issues/58
                'std': abs(params.iloc[:, offset]),
                'mean': params.iloc[:, offset + 1],
            }

        model_params = {
            'covariance': covariance,
            'distribs': distributions,
            'type': get_qualified_name(self.modeler.model),
            'fitted': True,
            'distribution': get_qualified_name(self.modeler.distribution)
        }

        return self.modeler.model.from_dict(model_params)
Exemple #3
0
    def unflatten_model(self, parent_row, table_name, parent_name):
        """Build a model from the flattened parameters stored in a parent row.

        The columns of ``parent_row`` whose name starts with
        ``'__<table_name>__'`` are selected, the prefix text is removed from
        their names, and the first record is unflattened into model parameters.

        Args:
            parent_row (dataframe): a generated parent row
            table_name (string): name of table to make model for
            parent_name (string): name of parent table (not used by this method)

        Returns:
            The object returned by ``self.modeler.model.from_dict``.
        """
        prefix = '__{}__'.format(table_name)
        prefixed = [name for name in parent_row.columns if name.startswith(prefix)]
        stripped = {name: name.replace(prefix, '') for name in prefixed}

        renamed = parent_row.loc[:, prefixed].rename(columns=stripped)
        flat_parameters = renamed.to_dict('records')[0]

        model_parameters = self._unflatten_dict(flat_parameters, table_name)
        model_name = get_qualified_name(self.modeler.model)

        model_parameters['fitted'] = True
        model_parameters['type'] = model_name

        # Gaussian copula parameters need an extra unflattening step.
        if model_name == GAUSSIAN_COPULA:
            model_parameters = self._unflatten_gaussian_copula(model_parameters)

        return self.modeler.model.from_dict(model_parameters)
Exemple #4
0
    def flatten_model(self, model, name=''):
        """Flatten a model's parameters into a ``pandas.Series``.

        When ``self.model`` is ``DEFAULT_MODEL`` the given ``model`` is modified
        in place before being serialized: its covariance is replaced by the
        rows of its lower triangle, and, when the configured distribution is
        ``DEFAULT_DISTRIBUTION``, the ``std`` of every distribution is passed
        through ``PositiveNumberTransformer.reverse_transform``.

        Args:
            model(self.model): Instance of model.
            name (str): Prefix to the parameter name.

        Returns:
            pd.Series: parameters for model
        """
        if self.model == DEFAULT_MODEL:
            triangle = np.tril(model.covariance).tolist()
            values = [row[:index + 1] for index, row in enumerate(triangle)]

            # Row ``i`` keeps ``i + 1`` entries, so the rows are ragged as soon
            # as there is more than one. NumPy >= 1.24 raises ValueError for
            # ragged input unless ``dtype=object`` is requested explicitly.
            # A single row is not ragged and keeps its numeric dtype.
            if len(values) > 1:
                model.covariance = np.array(values, dtype=object)
            else:
                model.covariance = np.array(values)

            if self.model_kwargs['distribution'] == get_qualified_name(DEFAULT_DISTRIBUTION):
                transformer = PositiveNumberTransformer({
                    'name': 'field',
                    'type': 'number'
                })

                for distribution in model.distribs.values():
                    # The transformer works on dataframes, so wrap the scalar.
                    column = pd.DataFrame({'field': [distribution.std]})
                    distribution.std = transformer.reverse_transform(column).loc[0, 'field']

        return pd.Series(self._flatten_dict(model.to_dict(), name))
Exemple #5
0
    def to_dict(self):
        """Serialize the parameters of this object into a ``dict``.

        Returns:
            dict: Always holds ``type``, ``vine_type`` and ``fitted``. The
            fitted parameters are added only when ``self.fitted`` is truthy.
        """
        result = {
            'type': get_qualified_name(self),
            'vine_type': self.vine_type,
            'fitted': self.fitted
        }

        if self.fitted:
            result['n_sample'] = self.n_sample
            result['n_var'] = self.n_var
            result['depth'] = self.depth
            result['truncated'] = self.truncated
            result['trees'] = [tree.to_dict() for tree in self.trees]
            # Matrices are converted to nested lists.
            result['tau_mat'] = self.tau_mat.tolist()
            result['u_matrix'] = self.u_matrix.tolist()
            result['unis'] = [distribution.to_dict() for distribution in self.unis]

        return result
Exemple #6
0
    def to_dict(self):
        """Return a `dict` with the parameters to replicate this Vine.

        Returns:
            dict:
                Parameters of this Vine. Only ``type``, ``vine_type`` and
                ``fitted`` are included when the Vine is not fitted.
        """
        result = {
            'type': get_qualified_name(self),
            'vine_type': self.vine_type,
            'fitted': self.fitted
        }

        if self.fitted:
            result['n_sample'] = self.n_sample
            result['n_var'] = self.n_var
            result['depth'] = self.depth
            result['truncated'] = self.truncated
            result['trees'] = [tree.to_dict() for tree in self.trees]
            # Matrices are converted to nested lists.
            result['tau_mat'] = self.tau_mat.tolist()
            result['u_matrix'] = self.u_matrix.tolist()
            result['unis'] = [distribution.to_dict() for distribution in self.unis]
            result['columns'] = self.columns

        return result
Exemple #7
0
    def to_dict(self):
        """Return the parameters needed to replicate the distribution.

        Returns:
            dict: ``type`` and ``fitted`` always. When fitted, either the
            ``constant_value`` alone (base ``Univariate`` holding a constant),
            or the fit parameters, preceded by ``instance_type`` for the base
            ``Univariate``.
        """
        result = {'type': get_qualified_name(self), 'fitted': self.fitted}

        if not self.fitted:
            return result

        is_base = get_qualified_name(self) == get_qualified_name(Univariate)

        # A constant distribution is fully described by its value.
        if is_base and self.constant_value is not None:
            result["constant_value"] = self.constant_value
            return result

        if is_base:
            result['instance_type'] = get_qualified_name(self._instance)

        result.update(self._fit_params())
        return result
Exemple #8
0
    def _get_model_dict(self, data):
        """Fit a model, serialize it and flatten its parameters.

        When ``self.model`` is ``DEFAULT_MODEL`` the fitted model is adjusted
        before being serialized: its covariance is replaced by the rows of its
        lower triangle, and, when the configured distribution is
        ``DEFAULT_DISTRIBUTION``, the ``std`` of every distribution is passed
        through ``PositiveNumberTransformer.reverse_transform``.

        Args:
            data(pandas.DataFrame): Dataset to fit the model to.

        Returns:
            dict: Flattened parameters for model.

        """
        model = self.fit_model(data)

        if self.model == DEFAULT_MODEL:
            triangle = np.tril(model.covariance).tolist()
            values = [row[:index + 1] for index, row in enumerate(triangle)]

            # Row ``i`` keeps ``i + 1`` entries, so the rows are ragged as soon
            # as there is more than one. NumPy >= 1.24 raises ValueError for
            # ragged input unless ``dtype=object`` is requested explicitly.
            # A single row is not ragged and keeps its numeric dtype.
            if len(values) > 1:
                model.covariance = np.array(values, dtype=object)
            else:
                model.covariance = np.array(values)

            if self.model_kwargs['distribution'] == get_qualified_name(
                    DEFAULT_DISTRIBUTION):
                transformer = PositiveNumberTransformer({
                    'name': 'field',
                    'type': 'number'
                })

                for distribution in model.distribs.values():
                    # The transformer works on dataframes, so wrap the scalar.
                    column = pd.DataFrame({'field': [distribution.std]})
                    distribution.std = transformer.reverse_transform(
                        column).loc[0, 'field']

        return self._flatten_dict(model.to_dict())
Exemple #9
0
    def to_dict(self):
        """Return a `dict` with the parameters to replicate this Tree.

        Returns:
            dict:
                Parameters of this Tree. Only ``tree_type``, ``type`` and
                ``fitted`` are included when the Tree is not fitted.
        """
        fitted = self.fitted
        result = {
            'tree_type': self.tree_type,
            'type': get_qualified_name(self),
            'fitted': fitted
        }

        if fitted:
            result['level'] = self.level
            result['n_nodes'] = self.n_nodes
            result['tau_matrix'] = self.tau_matrix.tolist()
            result['previous_tree'] = self._serialize_previous_tree()
            result['edges'] = [edge.to_dict() for edge in self.edges]

        return result
Exemple #10
0
    def test_fit_distribution_selector(self):
        """
        On fit, the distributions given per column are used, and the remaining
        column falls back to the base class.
        """
        # Setup
        beta = 'copulas.univariate.beta.BetaUnivariate'
        kde = 'copulas.univariate.gaussian_kde.GaussianKDE'
        base = 'copulas.univariate.base.Univariate'
        copula = GaussianMultivariate(distribution={
            'column1': beta,
            'column2': kde,
        })

        # Run
        copula.fit(self.data)

        # Check
        for index, expected in enumerate([beta, kde, base]):
            assert get_qualified_name(
                copula.univariates[index].__class__) == expected
Exemple #11
0
 def from_dict(cls, param_dict):
     """Create a new instance from a dictionary of parameters.

     The object named by ``param_dict['type']`` is obtained through
     ``get_instance``. Unless it is the base ``Univariate``, the work is
     delegated to that object's own ``from_dict``.
     """
     instance = get_instance(param_dict['type'])
     if get_qualified_name(instance) != get_qualified_name(Univariate):
         return instance.from_dict(param_dict)

     instance.fitted = param_dict['fitted']
     if not instance.fitted:
         return instance

     constant_value = param_dict.get("constant_value")
     if constant_value is not None:
         # Constant distributions only need their value restored.
         instance.constant_value = constant_value
         instance._replace_constant_methods()
     else:
         wrapped = get_instance(param_dict['instance_type'])
         instance._instance = wrapped.from_dict(param_dict)

     return instance
Exemple #12
0
    def to_dict(self):
        """Return the parameters needed to replicate the distribution.

        Returns:
            dict: ``type`` and ``fitted``, plus the fit parameters when the
            distribution is fitted.
        """
        result = {'type': get_qualified_name(self), 'fitted': self.fitted}

        if self.fitted:
            result.update(self._fit_params())

        return result
Exemple #13
0
    def to_dict(self):
        """Return the parameters of this model in a dict.

        Returns:
            dict:
                Dictionary containing the distribution type and all
                the parameters that define the distribution.

        Raises:
            NotFittedError:
                if the model is not fitted.
        """
        self.check_fit()

        params = self._get_params()

        # The base Univariate reports the type of the instance it wraps.
        named = self._instance if self.__class__ is Univariate else self
        params['type'] = get_qualified_name(named)

        return params
Exemple #14
0
    def _get_model(self, extension):
        """Build a model from the flattened ``extension`` parameters.

        Returns:
            The object returned by ``self.modeler.model.from_dict``.
        """
        parameters = self._unflatten_dict(extension)
        qualified_name = get_qualified_name(self.modeler.model)

        parameters['fitted'] = True
        parameters['type'] = qualified_name

        # Gaussian copula parameters need an extra unflattening step.
        if qualified_name == GAUSSIAN_COPULA:
            parameters = self._unflatten_gaussian_copula(parameters)

        return self.modeler.model.from_dict(parameters)
Exemple #15
0
    def to_dict(self):
        """Serialize this model into a ``dict``.

        Returns:
            dict: ``covariance`` as nested lists, the serialized ``distribs``
            keyed by name, and the ``type``, ``fitted`` and ``distribution``
            values.
        """
        distributions = {}
        for name, distribution in self.distribs.items():
            distributions[name] = distribution.to_dict()

        result = {'covariance': self.covariance.tolist()}
        result['distribs'] = distributions
        result['type'] = get_qualified_name(self)
        result['fitted'] = self.fitted
        result['distribution'] = self.distribution
        return result
Exemple #16
0
    def to_dict(self):
        """Serialize this model into a ``dict``.

        Returns:
            dict: ``covariance`` as nested lists, the serialized
            ``univariates``, and the ``columns``, ``type``, ``fitted`` and
            ``distribution`` values. A ``dict`` distribution has each of its
            values serialized with ``to_dict``.
        """
        univariates = [univariate.to_dict() for univariate in self.univariates]

        if isinstance(self.distribution, dict):
            distribution = {
                key: value.to_dict() for key, value in self.distribution.items()
            }
        else:
            distribution = self.distribution

        result = {'covariance': self.covariance.tolist()}
        result['univariates'] = univariates
        result['columns'] = self.columns
        result['type'] = get_qualified_name(self)
        result['fitted'] = self.fitted
        result['distribution'] = distribution
        return result
Exemple #17
0
    def to_dict(self):
        """Return a `dict` with the parameters to replicate this object.

        ``self.check_fit()`` is called before anything is serialized.

        Returns:
            dict:
                Parameters of this distribution.
        """
        self.check_fit()

        serialized = []
        for univariate in self.univariates:
            serialized.append(univariate.to_dict())

        params = {'covariance': self.covariance.tolist()}
        params['univariates'] = serialized
        params['columns'] = self.columns
        params['type'] = get_qualified_name(self)
        return params
Exemple #18
0
    def to_dict(self):
        """Return a `dict` with the parameters to replicate this object.

        ``self.check_fit()`` is called before anything is serialized, and a
        ``DeprecationWarning`` about the ``covariance`` key is emitted.

        Returns:
            dict:
                Parameters of this distribution.
        """
        self.check_fit()

        serialized = []
        for univariate in self.univariates:
            serialized.append(univariate.to_dict())

        message = '`covariance` will be renamed to `correlation` in v0.4.0'
        warnings.warn(message, DeprecationWarning)

        params = {'covariance': self.covariance.tolist()}
        params['univariates'] = serialized
        params['columns'] = self.columns
        params['type'] = get_qualified_name(self)
        return params
Exemple #19
0
    def test_fit_distribution_arg(self):
        """On fit, every column gets a univariate of the class named by copula.distribution."""
        # Setup
        kde = 'copulas.univariate.gaussian_kde.GaussianKDE'
        copula = GaussianMultivariate(distribution=kde)

        # Run
        copula.fit(self.data)

        # Check
        assert copula.distribution == kde

        for position, column in enumerate(self.data.columns):
            assert copula.columns[position] == column
            fitted_class = copula.univariates[position].__class__
            assert get_qualified_name(fitted_class) == copula.distribution

        expected_covariance = copula._get_covariance(self.data)
        matches = copula.covariance == expected_covariance
        assert matches.all().all()
Exemple #20
0
    def test_fit_distribution_arg(self):
        """On fit, every column gets a distribution of the class named by copula.distribution."""
        # Setup
        kde = 'copulas.univariate.kde.KDEUnivariate'
        copula = GaussianMultivariate(distribution=kde)

        # Run
        copula.fit(self.data)

        # Check
        assert copula.distribution == kde

        for column in self.data.columns:
            assert column in copula.distribs
            fitted_class = copula.distribs[column].__class__
            assert get_qualified_name(fitted_class) == copula.distribution

        expected_covariance = copula._get_covariance(self.data)
        matches = copula.covariance == expected_covariance
        assert matches.all().all()
Exemple #21
0
    def _sample_model(model, num_rows, columns):
        """Sample from model and format into pandas.DataFrame.

        Args:
            model(copula.multivariate.base): Fitted model.
            num_rows(int): Number of rows to sample.
            columns(Iterable): Column names for the sampled rows.

        Returns:
            pd.DataFrame: Sampled rows.

        """
        is_vine = get_qualified_name(model) == 'copulas.multivariate.vine.VineCopula'
        if not is_vine:
            return pd.DataFrame(model.sample(num_rows), columns=columns)

        # The vine model is sampled once per requested row, and every call is
        # still passed ``num_rows``.
        # NOTE(review): presumably VineCopula.sample returns a single row
        # regardless of its argument -- confirm against the copulas version in use.
        synthesized = [model.sample(num_rows).tolist() for _ in range(num_rows)]
        return pd.DataFrame(synthesized, columns=columns)
Exemple #22
0
    def to_dict(self):
        """Serialize the parameters of this object into a ``dict``.

        Returns:
            dict: Always holds ``tree_type``, ``type`` and ``fitted``. The
            fitted parameters are added only when ``self.fitted`` is truthy.
        """
        fitted = self.fitted
        result = {
            'tree_type': self.tree_type,
            'type': get_qualified_name(self),
            'fitted': fitted
        }

        if fitted:
            result['level'] = self.level
            result['n_nodes'] = self.n_nodes
            result['tau_matrix'] = self.tau_matrix.tolist()
            result['previous_tree'] = self._serialize_previous_tree()
            result['edges'] = [edge.to_dict() for edge in self.edges]

        return result
    def test_fit_default_distribution(self):
        """On fit, each column gets a default distribution matching the data's mean and std."""

        # Setup
        gaussian = 'copulas.univariate.gaussian.GaussianUnivariate'
        copula = GaussianMultivariate()

        # Run
        copula.fit(self.data)

        # Check
        assert copula.distribution == gaussian

        for column in self.data.columns:
            assert column in copula.distribs
            fitted_class = copula.distribs[column].__class__
            assert get_qualified_name(fitted_class) == copula.distribution
            assert copula.distribs[column].mean == self.data[column].mean()
            assert copula.distribs[column].std == np.std(self.data[column])

        expected_covariance = copula._get_covariance(self.data)
        matches = copula.covariance == expected_covariance
        assert matches.all().all()