def __init__(self, data_navigator, model=DEFAULT_MODEL, distribution=None, model_kwargs=None):
    """Create a modeler with empty table/model registries.

    Args:
        data_navigator: Object stored on the instance as ``self.dn``.
        model: Model to use; defaults to ``DEFAULT_MODEL``.
        distribution: Optional distribution. It is only accepted when
            ``model`` is ``DEFAULT_MODEL``; when omitted,
            ``DEFAULT_DISTRIBUTION`` is used instead.
        model_kwargs (dict): Optional keyword arguments kept in
            ``self.model_kwargs``. When empty or missing, a default is
            built: ``{'distribution': ...}`` for ``DEFAULT_MODEL`` and
            ``{'vine_type': TreeTypes.REGULAR}`` for any other model.

    Raises:
        ValueError: If ``distribution`` is given with a non-default model.
    """
    self.tables = {}
    self.models = {}
    # table -> {child: col #}
    self.child_locs = {}
    self.dn = data_navigator
    self.model = model

    if distribution:
        if model != DEFAULT_MODEL:
            raise ValueError(
                '`distribution` argument is only suported for `GaussianMultivariate` model.')
        distribution = get_qualified_name(distribution)
    else:
        distribution = get_qualified_name(DEFAULT_DISTRIBUTION)

    if not model_kwargs:
        model_kwargs = (
            {'distribution': distribution}
            if model == DEFAULT_MODEL
            else {'vine_type': TreeTypes.REGULAR}
        )

    self.model_kwargs = model_kwargs
def _make_model_from_params(self, parent_row, table_name, parent_name): """ Takes the params from a generated parent row and creates a model from it. Args: parent_row (dataframe): a generated parent row table_name (string): name of table to make model for parent_name (string): name of parent table """ # get parameters child_range = self.modeler.child_locs.get(parent_name, {}).get(table_name, {}) if not child_range: return None param_indices = list(range(child_range[0], child_range[1])) params = parent_row.loc[:, param_indices] totalcols = params.shape[1] num_cols = self.modeler.tables[table_name].shape[1] # get labels for dataframe labels = list(self.modeler.tables[table_name].columns) # parent_meta = self.dn.tables[parent_name].meta # fk = parent_meta['primary_key'] # if fk in labels: # labels.remove(fk) # num_cols -= 1 cov_size = num_cols ** 2 # Covariance matrix covariance = params.iloc[:, 0:cov_size] covariance = covariance.values.reshape((num_cols, num_cols)) # Distributions distributions = {} for label_index, i in enumerate(range(cov_size, totalcols, 2)): distributions[labels[label_index]] = { 'type': get_qualified_name(self.modeler.distribution), 'fitted': True, 'std': abs(params.iloc[:, i]), # Pending for issue 'mean': params.iloc[:, i + 1], # https://github.com/HDI-Project/SDV/issues/58 } model_params = { 'covariance': covariance, 'distribs': distributions, 'type': get_qualified_name(self.modeler.model), 'fitted': True, 'distribution': get_qualified_name(self.modeler.distribution) } return self.modeler.model.from_dict(model_params)
def unflatten_model(self, parent_row, table_name, parent_name):
    """Rebuild a child-table model from the flat columns of a parent row.

    Columns of ``parent_row`` whose name starts with ``__<table_name>__``
    are selected, the leading prefix is stripped, and the first record is
    unflattened with ``self._unflatten_dict`` before being passed to the
    model's ``from_dict``.

    Args:
        parent_row (dataframe): a generated parent row
        table_name (string): name of table to make model for
        parent_name (string): name of parent table (not used by this method)

    Returns:
        The value returned by ``self.modeler.model.from_dict``.
    """
    prefix = '__{}__'.format(table_name)
    prefix_length = len(prefix)

    columns = [
        column for column in parent_row.columns
        if column.startswith(prefix)
    ]
    # Slice off only the leading prefix. ``str.replace`` would also delete
    # any later occurrence of the same text inside the parameter name.
    new_columns = {
        column: column[prefix_length:]
        for column in columns
    }

    flat_parameters = parent_row.loc[:, columns]
    flat_parameters = flat_parameters.rename(columns=new_columns).to_dict('records')[0]

    model_parameters = self._unflatten_dict(flat_parameters, table_name)
    model_name = get_qualified_name(self.modeler.model)

    model_parameters['fitted'] = True
    model_parameters['type'] = model_name

    if model_name == GAUSSIAN_COPULA:
        model_parameters = self._unflatten_gaussian_copula(model_parameters)

    return self.modeler.model.from_dict(model_parameters)
def flatten_model(self, model, name=''):
    """Serialize a model and flatten its parameters into a ``pd.Series``.

    When ``self.model`` is ``DEFAULT_MODEL`` the given model is modified in
    place first: its covariance is replaced by the rows of its lower
    triangle and, if the configured distribution is
    ``DEFAULT_DISTRIBUTION``, each distribution's ``std`` is passed through
    ``PositiveNumberTransformer.reverse_transform``.

    Args:
        model(self.model): Instance of model.
        name (str): Prefix to the parameter name.

    Returns:
        pd.Series: parameters for model
    """
    if self.model == DEFAULT_MODEL:
        lower_rows = np.tril(model.covariance).tolist()
        # Row i keeps its first i + 1 entries, i.e. up to the diagonal.
        model.covariance = np.array(
            [row[:position + 1] for position, row in enumerate(lower_rows)])

        if self.model_kwargs['distribution'] == get_qualified_name(DEFAULT_DISTRIBUTION):
            transformer = PositiveNumberTransformer({'name': 'field', 'type': 'number'})

            for distribution in model.distribs.values():
                std_frame = pd.DataFrame({'field': [distribution.std]})
                reverted = transformer.reverse_transform(std_frame)
                distribution.std = reverted.loc[0, 'field']

    flattened = self._flatten_dict(model.to_dict(), name)
    return pd.Series(flattened)
def to_dict(self):
    """Return a `dict` with the parameters of this object.

    Returns:
        dict: Always holds ``type``, ``vine_type`` and ``fitted``. When the
        object is fitted it also holds ``n_sample``, ``n_var``, ``depth``,
        ``truncated``, ``trees``, ``tau_mat``, ``u_matrix`` and ``unis``.
    """
    params = {
        'type': get_qualified_name(self),
        'vine_type': self.vine_type,
        'fitted': self.fitted
    }

    if self.fitted:
        params['n_sample'] = self.n_sample
        params['n_var'] = self.n_var
        params['depth'] = self.depth
        params['truncated'] = self.truncated
        params['trees'] = [tree.to_dict() for tree in self.trees]
        params['tau_mat'] = self.tau_mat.tolist()
        params['u_matrix'] = self.u_matrix.tolist()
        params['unis'] = [distribution.to_dict() for distribution in self.unis]

    return params
def to_dict(self):
    """Serialize this Vine into a plain `dict`.

    Returns:
        dict: Always holds ``type``, ``vine_type`` and ``fitted``. When the
        Vine is fitted it also holds ``n_sample``, ``n_var``, ``depth``,
        ``truncated``, ``trees``, ``tau_mat``, ``u_matrix``, ``unis`` and
        ``columns``.
    """
    params = {
        'type': get_qualified_name(self),
        'vine_type': self.vine_type,
        'fitted': self.fitted
    }

    if self.fitted:
        params['n_sample'] = self.n_sample
        params['n_var'] = self.n_var
        params['depth'] = self.depth
        params['truncated'] = self.truncated
        params['trees'] = [tree.to_dict() for tree in self.trees]
        params['tau_mat'] = self.tau_mat.tolist()
        params['u_matrix'] = self.u_matrix.tolist()
        params['unis'] = [distribution.to_dict() for distribution in self.unis]
        params['columns'] = self.columns

    return params
def to_dict(self):
    """Serialize the distribution into a plain `dict`.

    Returns:
        dict: Always holds ``type`` and ``fitted``. For a fitted object
        whose qualified name equals that of ``Univariate``, either
        ``constant_value`` alone is added (when it is not ``None``) or
        ``instance_type`` plus the fit parameters. For any other fitted
        object only the fit parameters are added.
    """
    own_name = get_qualified_name(self)
    result = {'type': own_name, 'fitted': self.fitted}

    if not self.fitted:
        return result

    if own_name == get_qualified_name(Univariate):
        constant = self.constant_value
        if constant is not None:
            # A constant distribution is fully described by its value.
            result["constant_value"] = constant
            return result

        result['instance_type'] = get_qualified_name(self._instance)

    result.update(self._fit_params())
    return result
def _get_model_dict(self, data):
    """Fit a model on ``data`` and return its flattened parameters.

    When ``self.model`` is ``DEFAULT_MODEL`` the fitted model is adjusted
    before serialization: its covariance is replaced by the rows of its
    lower triangle and, if the configured distribution is
    ``DEFAULT_DISTRIBUTION``, each distribution's ``std`` is passed through
    ``PositiveNumberTransformer.reverse_transform``.

    Args:
        data(pandas.DataFrame): Dataset to fit the model to.

    Returns:
        dict: Flattened parameters for model.
    """
    model = self.fit_model(data)

    if self.model == DEFAULT_MODEL:
        lower_rows = np.tril(model.covariance).tolist()
        # Row i keeps its first i + 1 entries, i.e. up to the diagonal.
        model.covariance = np.array(
            [row[:position + 1] for position, row in enumerate(lower_rows)])

        if self.model_kwargs['distribution'] == get_qualified_name(DEFAULT_DISTRIBUTION):
            transformer = PositiveNumberTransformer({'name': 'field', 'type': 'number'})

            for distribution in model.distribs.values():
                std_frame = pd.DataFrame({'field': [distribution.std]})
                reverted = transformer.reverse_transform(std_frame)
                distribution.std = reverted.loc[0, 'field']

    serialized = model.to_dict()
    return self._flatten_dict(serialized)
def to_dict(self):
    """Serialize this Tree into a plain `dict`.

    Returns:
        dict: Always holds ``tree_type``, ``type`` and ``fitted``. When the
        Tree is fitted it also holds ``level``, ``n_nodes``,
        ``tau_matrix``, ``previous_tree`` and ``edges``.
    """
    is_fitted = self.fitted
    params = {
        'tree_type': self.tree_type,
        'type': get_qualified_name(self),
        'fitted': is_fitted
    }

    if is_fitted:
        params['level'] = self.level
        params['n_nodes'] = self.n_nodes
        params['tau_matrix'] = self.tau_matrix.tolist()
        params['previous_tree'] = self._serialize_previous_tree()
        params['edges'] = [edge.to_dict() for edge in self.edges]

    return params
def test_fit_distribution_selector(self):
    """Fit with a per-column distribution mapping.

    Columns named in the mapping must end up with the requested class;
    the remaining column must use the base ``Univariate`` class.
    """
    # Setup
    beta_name = 'copulas.univariate.beta.BetaUnivariate'
    kde_name = 'copulas.univariate.gaussian_kde.GaussianKDE'
    copula = GaussianMultivariate(distribution={
        'column1': beta_name,
        'column2': kde_name,
    })

    # Run
    copula.fit(self.data)

    # Check: univariates are compared positionally against the expected names.
    expected_names = [
        beta_name,
        kde_name,
        'copulas.univariate.base.Univariate',
    ]
    for position, expected in enumerate(expected_names):
        actual = get_qualified_name(copula.univariates[position].__class__)
        assert actual == expected
def from_dict(cls, param_dict):
    """Rebuild a distribution from its dictionary representation.

    The object named by ``param_dict['type']`` is obtained through
    ``get_instance``. If its qualified name differs from that of
    ``Univariate``, loading is delegated to that object's own
    ``from_dict``. Otherwise the object is restored here: ``fitted`` is
    copied over and, when fitted, either the constant value or the
    wrapped instance is restored.

    Args:
        param_dict (dict): Parameters; ``type`` and (for ``Univariate``)
            ``fitted`` are read, plus ``constant_value`` or
            ``instance_type`` when fitted.

    Returns:
        The restored distribution object.
    """
    distribution = get_instance(param_dict['type'])

    if get_qualified_name(distribution) != get_qualified_name(Univariate):
        return distribution.from_dict(param_dict)

    distribution.fitted = param_dict['fitted']
    if not distribution.fitted:
        return distribution

    if param_dict.get("constant_value") is not None:
        distribution.constant_value = param_dict["constant_value"]
        distribution._replace_constant_methods()
    else:
        instance_class = get_instance(param_dict['instance_type'])
        distribution._instance = instance_class.from_dict(param_dict)

    return distribution
def to_dict(self):
    """Serialize the distribution into a plain `dict`.

    Returns:
        dict: Holds ``type`` and ``fitted``; when the distribution is
        fitted, the entries of ``self._fit_params()`` are merged in.
    """
    params = {'type': get_qualified_name(self), 'fitted': self.fitted}

    if self.fitted:
        params.update(self._fit_params())

    return params
def to_dict(self):
    """Return the parameters of this model, tagged with its type.

    ``self.check_fit()`` is called first. The ``type`` entry is the
    qualified name of ``self._instance`` when this object's class is
    exactly ``Univariate``, and of the object itself otherwise.

    Returns:
        dict:
            The result of ``self._get_params()`` with a ``type`` entry added.

    Raises:
        NotFittedError:
            if the model is not fitted.
    """
    self.check_fit()

    params = self._get_params()
    described = self._instance if self.__class__ is Univariate else self
    params['type'] = get_qualified_name(described)

    return params
def _get_model(self, extension):
    """Create a model instance from flattened extension parameters.

    The extension is unflattened, marked as fitted and tagged with the
    qualified name of ``self.modeler.model``. For ``GAUSSIAN_COPULA`` the
    parameters additionally go through ``_unflatten_gaussian_copula``.

    Returns:
        The value returned by ``self.modeler.model.from_dict``.
    """
    parameters = self._unflatten_dict(extension)

    model_class = self.modeler.model
    model_name = get_qualified_name(model_class)
    parameters['fitted'] = True
    parameters['type'] = model_name

    if model_name == GAUSSIAN_COPULA:
        parameters = self._unflatten_gaussian_copula(parameters)

    return model_class.from_dict(parameters)
def to_dict(self):
    """Return a `dict` with the parameters of this object.

    Returns:
        dict: Holds ``covariance`` (as nested lists), ``distribs`` (each
        distribution serialized by name), ``type``, ``fitted`` and
        ``distribution``.
    """
    serialized = {}
    for name, distribution in self.distribs.items():
        serialized[name] = distribution.to_dict()

    return {
        'covariance': self.covariance.tolist(),
        'distribs': serialized,
        'type': get_qualified_name(self),
        'fitted': self.fitted,
        'distribution': self.distribution
    }
def to_dict(self):
    """Return a `dict` with the parameters of this object.

    When ``self.distribution`` is a `dict`, each of its values is
    serialized through its own ``to_dict``; otherwise it is stored as is.

    Returns:
        dict: Holds ``covariance`` (as nested lists), ``univariates``,
        ``columns``, ``type``, ``fitted`` and ``distribution``.
    """
    serialized_univariates = []
    for univariate in self.univariates:
        serialized_univariates.append(univariate.to_dict())

    distribution = self.distribution
    if isinstance(distribution, dict):
        distribution = {
            key: value.to_dict()
            for key, value in distribution.items()
        }

    return {
        'covariance': self.covariance.tolist(),
        'univariates': serialized_univariates,
        'columns': self.columns,
        'type': get_qualified_name(self),
        'fitted': self.fitted,
        'distribution': distribution
    }
def to_dict(self):
    """Serialize this object into a plain `dict`.

    ``self.check_fit()`` is called before anything is serialized.

    Returns:
        dict: Holds ``covariance`` (as nested lists), ``univariates``
        (each one serialized), ``columns`` and ``type``.
    """
    self.check_fit()

    serialized_univariates = []
    for univariate in self.univariates:
        serialized_univariates.append(univariate.to_dict())

    params = {}
    params['covariance'] = self.covariance.tolist()
    params['univariates'] = serialized_univariates
    params['columns'] = self.columns
    params['type'] = get_qualified_name(self)
    return params
def to_dict(self):
    """Return a `dict` with the parameters to replicate this object.

    ``self.check_fit()`` is called first. A ``DeprecationWarning`` about
    the upcoming rename of the ``covariance`` key is emitted on every call.

    Returns:
        dict:
            Parameters of this distribution: ``covariance`` (as nested
            lists), ``univariates``, ``columns`` and ``type``.
    """
    self.check_fit()

    univariates = [univariate.to_dict() for univariate in self.univariates]

    # stacklevel=2 attributes the warning to the code calling ``to_dict``,
    # which is the code that has to adapt to the renamed key.
    warnings.warn(
        '`covariance` will be renamed to `correlation` in v0.4.0',
        DeprecationWarning,
        stacklevel=2,
    )

    return {
        'covariance': self.covariance.tolist(),
        'univariates': univariates,
        'columns': self.columns,
        'type': get_qualified_name(self),
    }
def test_fit_distribution_arg(self):
    """Fit with an explicit distribution name and check every column uses it."""
    # Setup
    distribution = 'copulas.univariate.gaussian_kde.GaussianKDE'
    copula = GaussianMultivariate(distribution=distribution)

    # Run
    copula.fit(self.data)

    # Check
    assert copula.distribution == distribution

    for position, column_name in enumerate(self.data.columns):
        assert copula.columns[position] == column_name
        univariate_class = copula.univariates[position].__class__
        assert get_qualified_name(univariate_class) == copula.distribution

    expected_covariance = copula._get_covariance(self.data)
    matches = copula.covariance == expected_covariance
    assert matches.all().all()
def test_fit_distribution_arg(self):
    """Fit with an explicit distribution name and check every column uses it."""
    # Setup
    distribution = 'copulas.univariate.kde.KDEUnivariate'
    copula = GaussianMultivariate(distribution=distribution)

    # Run
    copula.fit(self.data)

    # Check
    assert copula.distribution == distribution

    for column_name in self.data.columns:
        assert column_name in copula.distribs
        fitted_class = copula.distribs[column_name].__class__
        assert get_qualified_name(fitted_class) == copula.distribution

    expected_covariance = copula._get_covariance(self.data)
    matches = copula.covariance == expected_covariance
    assert matches.all().all()
def _sample_model(model, num_rows, columns):
    """Draw samples from a model and wrap them in a ``pandas.DataFrame``.

    Models whose qualified name is
    ``copulas.multivariate.vine.VineCopula`` are sampled ``num_rows`` times
    and each result is converted with ``tolist()``; any other model is
    sampled once with ``num_rows``.

    Args:
        model(copula.multivariate.base): Fitted model.
        num_rows(int): Number of rows to sample.
        columns(Iterable): Column names for the sampled rows.

    Returns:
        pd.DataFrame: Sampled rows.
    """
    is_vine = get_qualified_name(model) == 'copulas.multivariate.vine.VineCopula'

    if not is_vine:
        return pd.DataFrame(model.sample(num_rows), columns=columns)

    # NOTE(review): every iteration also passes num_rows to sample();
    # presumably the vine returns a single row per call - confirm.
    rows = [model.sample(num_rows).tolist() for _ in range(num_rows)]
    return pd.DataFrame(rows, columns=columns)
def to_dict(self):
    """Return a `dict` with the parameters of this object.

    Returns:
        dict: Always holds ``tree_type``, ``type`` and ``fitted``. When the
        object is fitted it also holds ``level``, ``n_nodes``,
        ``tau_matrix``, ``previous_tree`` and ``edges``.
    """
    is_fitted = self.fitted
    params = {
        'tree_type': self.tree_type,
        'type': get_qualified_name(self),
        'fitted': is_fitted
    }

    if is_fitted:
        params['level'] = self.level
        params['n_nodes'] = self.n_nodes
        params['tau_matrix'] = self.tau_matrix.tolist()
        params['previous_tree'] = self._serialize_previous_tree()
        params['edges'] = [edge.to_dict() for edge in self.edges]

    return params
def test_fit_default_distribution(self):
    """Fit with no arguments and check the per-column distributions and covariance."""
    # Setup
    copula = GaussianMultivariate()

    # Run
    copula.fit(self.data)

    # Check
    assert copula.distribution == 'copulas.univariate.gaussian.GaussianUnivariate'

    for column_name in self.data.columns:
        assert column_name in copula.distribs
        fitted_class = copula.distribs[column_name].__class__
        assert get_qualified_name(fitted_class) == copula.distribution
        assert copula.distribs[column_name].mean == self.data[column_name].mean()
        assert copula.distribs[column_name].std == np.std(self.data[column_name])

    expected_covariance = copula._get_covariance(self.data)
    matches = copula.covariance == expected_covariance
    assert matches.all().all()