def test_sample_random_state(self):
    """When random_state is set the samples are the same.

    A ``GaussianMultivariate`` built with ``random_seed=0`` is fit on a small
    fixed table and asked for five rows; the result must match the stored
    reference values up to a small tolerance.
    """
    # Setup
    instance = GaussianMultivariate(GaussianUnivariate, random_seed=0)
    data = pd.DataFrame([
        {'A': 25, 'B': 75, 'C': 100},
        {'A': 30, 'B': 60, 'C': 250},
        {'A': 10, 'B': 65, 'C': 350},
        {'A': 20, 'B': 80, 'C': 150},
        {'A': 25, 'B': 70, 'C': 500},
    ])
    instance.fit(data)

    expected_result = pd.DataFrame(
        np.array([
            [25.19031668, 61.96527251, 543.43595269],
            [31.50262306, 49.70971698, 429.06537124],
            [20.31636799, 64.3492326, 384.27561823],
            [25.00302427, 72.06019812, 415.85215123],
            [23.07525773, 66.70901743, 390.8226672],
        ]),
        columns=['A', 'B', 'C'],
    )

    # Run
    result = instance.sample(5)

    # Check
    # ``check_less_precise`` was deprecated in pandas 1.1 and removed in 2.0.
    # 0.5e-3 is the tolerance pandas applied for ``check_less_precise=True``.
    pd.testing.assert_frame_equal(
        result,
        expected_result,
        check_exact=False,
        rtol=0.5e-3,
        atol=0.5e-3,
    )
def test_sample(self):
    """Generated samples keep the same mean and deviation as the original data."""
    # Setup
    copula = GaussianMultivariate()
    stats = [
        {'mean': 10000, 'std': 15},
        {'mean': 150, 'std': 10},
        {'mean': -50, 'std': 0.1},
    ]

    # One normally distributed series of 100 values per entry in ``stats``;
    # transposing turns each series into a column of the training table.
    series = [np.random.normal(x['mean'], x['std'], 100) for x in stats]
    data = pd.DataFrame(series).T
    copula.fit(data)

    # Run
    num_samples = 1000000
    result = copula.sample(num_samples)

    # Check
    assert result.shape == (num_samples, 3)

    for column in range(len(stats)):
        original = data[column]
        sampled = result[column]

        expected_mean = np.mean(original)
        expected_std = np.std(original)
        result_mean = np.mean(sampled)
        result_std = np.std(sampled)

        # Both statistics must stay within 1% of the training data's values.
        assert abs(expected_mean - result_mean) < abs(expected_mean / 100)
        assert abs(expected_std - result_std) < abs(expected_std / 100)
def test_sample_constant_column(self):
    """Gaussian copula can sample after being fit with a constant column.

    This process will raise warnings when computing the covariance matrix
    """
    # Setup
    constant_and_varying = np.array([
        [1.0, 2.0],
        [1.0, 3.0],
        [1.0, 4.0],
        [1.0, 5.0],
    ])
    instance = GaussianMultivariate()
    instance.fit(constant_and_varying)

    # Run
    result = instance.sample(5)

    # Check
    assert result.shape == (5, 2)

    not_null = ~result.isnull()
    assert result[not_null].all().all()

    # The constant column must be reproduced exactly.
    constant_column = result.loc[:, 0]
    assert constant_column.equals(pd.Series([1.0] * 5, name=0))

    # This is to check that the samples on the non constant column are not constant too.
    varying_column = result.loc[:, 1]
    assert len(varying_column.unique()) > 1

    # The fitted covariance must not contain nulls.
    covariance = instance.covariance
    has_value = ~pd.isnull(covariance)
    assert has_value.all().all()
def test_sample_random_state(self):
    """When random_state is set the samples are the same."""
    # Setup
    column_names = ['A', 'B', 'C']

    training_rows = [
        (25, 75, 100),
        (30, 60, 250),
        (10, 65, 350),
        (20, 80, 150),
        (25, 70, 500),
    ]
    data = pd.DataFrame([dict(zip(column_names, row)) for row in training_rows])

    instance = GaussianMultivariate(
        random_seed=0,
        distribution='copulas.univariate.gaussian.GaussianUnivariate',
    )
    instance.fit(data)

    # Reference output for ``random_seed=0``; compared exactly below.
    expected_rows = [
        (25.566882482769294, 61.01690157277244, 575.71068885087790),
        (32.624255560452110, 47.31477394460025, 447.84049148268970),
        (20.117642182744806, 63.68224998298797, 397.76402526341593),
        (25.357483201156676, 72.30337152729443, 433.06766240515134),
        (23.202174689737113, 66.32056962524452, 405.08384853948280),
    ]
    expected_result = pd.DataFrame(
        [dict(zip(column_names, row)) for row in expected_rows]
    )

    # Run
    result = instance.sample(5)

    # Check
    assert result.equals(expected_result)
def test_deprecation_warnings(self):
    """After fitting, Gaussian copula can produce new samples warningless.

    Fits the copula on the iris CSV, samples ten rows and checks that no
    warning of any category was emitted while doing so.
    """
    # Setup
    copula = GaussianMultivariate()
    data = pd.read_csv('data/iris.data.csv')

    # Run
    with warnings.catch_warnings(record=True) as warns:
        # Without resetting the filters, categories that are ignored by
        # default (e.g. ``DeprecationWarning`` raised outside ``__main__``)
        # are never recorded and the assertion below passes vacuously.
        warnings.simplefilter('always')
        copula.fit(data)
        result = copula.sample(10)

    # Check
    assert len(warns) == 0, [str(warning.message) for warning in warns]
    assert len(result) == 10
def test_sample(self, normal_mock):
    """Sample use the inverse-transform method to generate new samples.

    ``normal_mock`` replaces the multivariate normal sampler, so the values
    fed to the inverse transform are fixed and the output is deterministic.
    """
    # Setup
    instance = GaussianMultivariate(GaussianUnivariate)
    data = pd.DataFrame([
        {'A': 25, 'B': 75, 'C': 100},
        {'A': 30, 'B': 60, 'C': 250},
        {'A': 10, 'B': 65, 'C': 350},
        {'A': 20, 'B': 80, 'C': 150},
        {'A': 25, 'B': 70, 'C': 500},
    ])
    instance.fit(data)

    normal_mock.return_value = np.array([
        [0.1, 0.1, 0.1],
        [0.2, 0.2, 0.2],
        [0.4, 0.4, 0.4],
        [0.6, 0.6, 0.6],
        [0.8, 0.8, 0.8],
    ])

    expected_result = pd.DataFrame([
        {'A': 22.678232998312527, 'B': 70.70710678118655, 'C': 284.35270009440734},
        {'A': 23.356465996625055, 'B': 71.41421356237309, 'C': 298.7054001888146},
        {'A': 24.712931993250110, 'B': 72.82842712474618, 'C': 327.4108003776293},
        {'A': 26.069397989875164, 'B': 74.24264068711929, 'C': 356.116200566444},
        {'A': 27.425863986500215, 'B': 75.65685424949238, 'C': 384.8216007552586},
    ])

    # Run
    result = instance.sample(5)

    # Check
    assert result.equals(expected_result)

    # ``called_once_with`` is not a Mock assertion: it returns a truthy child
    # mock (or raises AttributeError on Python 3.12+), so the original check
    # never verified anything. ``assert_called_once_with`` cannot be used
    # directly either, because comparing numpy arrays with ``==`` is ambiguous.
    normal_mock.assert_called_once()
    call_args, call_kwargs = normal_mock.call_args

    np.testing.assert_array_equal(
        call_args[0], np.zeros(instance.covariance.shape[0])
    )
    np.testing.assert_array_equal(call_args[1], instance.covariance)

    # NOTE(review): the sampler's call signature is not visible from this
    # test; the size is accepted positionally or as ``size=`` and as either
    # ``5`` or ``(5,)`` -- tighten once the implementation is confirmed.
    size = call_args[2] if len(call_args) > 2 else call_kwargs.get('size')
    assert size is not None
    assert np.atleast_1d(size).tolist() == [5]
# ---------------------------------------------------------------------------
# View definitions
# ---------------------------------------------------------------------------
P_mat = np.array([
    [0, -1, 0, 0, 1, 0],
    [0, -0.5, 1, -0.5, 0, 0],
])  # views
mu_v = np.array([0.0006, 0.0007])  # normal view mean
sigma_v = np.array([0.05, 0.075])  # normal view sigma
range_v = np.array([
    [0, 0.01],
    [0, 0.02],
])  # uniform view range

k_ = len(P_mat)  # number of views (one row of P_mat per view)
Conf_full = ones((k_, 1)) - 1e-6  # full confidence levels
# NOTE(review): the original comment calls this "half confidence levels",
# but the value used is 0.25 -- confirm which one is intended.
Conf = 0.25 * ones((k_, 1))

# ---------------------------------------------------------------------------
# Market information: fit and simulation
# ---------------------------------------------------------------------------
# Market prior from a Gaussian copula simulation.
gc = GaussianMultivariate()
gc.fit(data)
print(gc)

M = gc.sample(1000)
Utility.disp_stat([M], ['Prior'])
MPrior = M.values

# Column means of the input data, kept as a column vector and printed.
name = 'True'
print('Mean_{}'.format(name))
mean = np.expand_dims(data.mean(axis=0), axis=0).T
print(data.mean(axis=0).round(2))
print('-------------------------------------------------\n')

# Covariance fitted by the copula and the correlation derived from it.
df_cov = pd.DataFrame(gc.covariance)
gc_corr = Estimation.cov_2_corr(gc.covariance)[0]
gc_sigvec = Estimation.cov_2_corr(gc.covariance)[1]
df_corr = pd.DataFrame(gc_corr)

print('Correlation_{}'.format(name))
print(df_corr.round(4))
print('-------------------------------------------------\n')
class ColumnsModel:
    """ColumnsModel class.

    The ``ColumnsModel`` class enables the usage of conditional sampling when a column is a
    ``constraint``.

    Args:
        constraint:
            Object exposing an ``is_valid(table_data)`` method, used to
            reject sampled rows.
        constraint_columns (str or list[str]):
            Column name(s) to model. A single name is wrapped in a list.
    """

    # Kept for backward compatibility; the fitted model lives in ``_model``.
    _columns_model = None

    def __init__(self, constraint, constraint_columns):
        if isinstance(constraint_columns, list):
            self.constraint_columns = constraint_columns
        else:
            self.constraint_columns = [constraint_columns]

        self.constraint = constraint

    def fit(self, table_data):
        """Fit the ``ColumnsModel``.

        Transform the ``self.constraint_columns`` of ``table_data`` with a
        ``HyperTransformer`` (categorical columns use the
        ``OneHotEncodingTransformer``) and fit a ``GaussianMultivariate``
        model with ``GaussianUnivariate`` marginals on the result, in order
        to sample those columns when missing.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        data_to_model = table_data[self.constraint_columns]
        self._hyper_transformer = HyperTransformer(
            default_data_type_transformers={
                'categorical': 'OneHotEncodingTransformer'
            })
        transformed_data = self._hyper_transformer.fit_transform(data_to_model)
        self._model = GaussianMultivariate(distribution=GaussianUnivariate)
        self._model.fit(transformed_data)

    def _sample_valid_rows(self, num_rows, conditions):
        """Sample ``num_rows`` rows and keep only those the constraint accepts."""
        sampled = self._model.sample(num_rows=num_rows, conditions=conditions)
        sampled = self._hyper_transformer.reverse_transform(sampled)
        return sampled[self.constraint.is_valid(sampled)]

    def _reject_sample(self, num_rows, conditions):
        """Sample exactly ``num_rows`` valid rows for the given conditions.

        Rows are drawn and filtered repeatedly. After 100 additional draws
        the valid rows collected so far are repeated to reach ``num_rows``.

        Args:
            num_rows (int):
                Number of rows to return.
            conditions (dict):
                Conditions passed through to the model's ``sample``.

        Returns:
            pandas.DataFrame:
                The first ``num_rows`` valid rows.

        Raises:
            ValueError:
                If no valid row at all was obtained within 100 trials.
        """
        valid_rows = self._sample_valid_rows(num_rows, conditions)
        counter = 0
        total_sampled = num_rows

        while len(valid_rows) < num_rows:
            num_valid = len(valid_rows)
            if counter >= 100:
                if num_valid == 0:
                    raise ValueError(
                        'Could not get enough valid rows within 100 trials.')

                # Give up sampling and pad by repeating the valid rows.
                multiplier = num_rows // num_valid
                num_rows_missing = num_rows % num_valid
                remainder_rows = valid_rows.iloc[0:num_rows_missing, :]
                valid_rows = pd.concat(
                    [valid_rows] * multiplier + [remainder_rows],
                    ignore_index=True)
                break

            # Size the next draw from the acceptance rate observed so far
            # (the +1 terms keep the estimate away from zero), capped at ten
            # times the requested number of rows.
            remaining = num_rows - num_valid
            valid_probability = (num_valid + 1) / (total_sampled + 1)
            max_rows = num_rows * 10
            num_to_sample = min(int(remaining / valid_probability), max_rows)
            total_sampled += num_to_sample

            new_valid_rows = self._sample_valid_rows(num_to_sample, conditions)
            valid_rows = pd.concat([valid_rows, new_valid_rows],
                                   ignore_index=True)
            counter += 1

        return valid_rows.iloc[0:num_rows, :]

    def sample(self, table_data):
        """Sample any missing columns.

        Sample any missing columns, ``self.constraint_columns``, that
        ``table_data`` does not contain, conditioned on the constraint
        columns it does contain. Rows are returned in the same order as the
        rows of ``table_data`` they were conditioned on, with a fresh
        ``0..n-1`` index.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Sampled rows holding the modelled constraint columns.
        """
        condition_columns = [
            c for c in self.constraint_columns if c in table_data.columns
        ]

        # Use positional labels so that every sampled row can be put back at
        # the position of the input row it was conditioned on; ``groupby``
        # iterates groups in sorted key order, not in row order.
        conditions = table_data[condition_columns].reset_index(drop=True)

        all_sampled_rows = []
        for _, group_rows in conditions.groupby(condition_columns):
            # Every row of a group shares the same condition values, so the
            # first transformed row describes the whole group.
            transformed_condition = self._hyper_transformer.transform(
                group_rows).iloc[0].to_dict()
            sampled_rows = self._reject_sample(
                num_rows=group_rows.shape[0],
                conditions=transformed_condition)
            sampled_rows.index = group_rows.index
            all_sampled_rows.append(sampled_rows)

        sampled_data = pd.concat(all_sampled_rows)
        return sampled_data.sort_index().reset_index(drop=True)
class Constraint(metaclass=ConstraintMeta):
    """Constraint base class.

    This class is not intended to be used directly and should rather be
    subclassed to create different types of constraints.

    If ``handling_strategy`` is passed with the value ``transform``
    or ``reject_sampling``, the ``filter_valid`` or ``transform`` and
    ``reverse_transform`` methods will be replaced respectively by a simple
    identity function.

    Attributes:
        constraint_columns (tuple[str]):
            The names of the columns used by this constraint.
        rebuild_columns (tuple[str]):
            The names of the columns that this constraint will rebuild during
            ``reverse_transform``.

    Args:
        handling_strategy (str):
            How this Constraint should be handled, which can be ``transform``,
            ``reject_sampling`` or ``all``.
        fit_columns_model (bool):
            If False, reject sampling will be used to handle conditional sampling.
            Otherwise, a model will be trained and used to sample other columns
            based on the conditioned column.

    Raises:
        ValueError:
            If ``handling_strategy`` is not one of the accepted values.
    """

    constraint_columns = ()
    rebuild_columns = ()
    _hyper_transformer = None
    _columns_model = None

    def _identity(self, table_data):
        return table_data

    def __init__(self, handling_strategy, fit_columns_model=False):
        self.fit_columns_model = fit_columns_model
        if handling_strategy == 'transform':
            self.filter_valid = self._identity
        elif handling_strategy == 'reject_sampling':
            self.rebuild_columns = ()
            self.transform = self._identity
            self.reverse_transform = self._identity
        elif handling_strategy != 'all':
            raise ValueError(
                'Unknown handling strategy: {}'.format(handling_strategy))

    def _fit(self, table_data):
        del table_data

    def fit(self, table_data):
        """Fit ``Constraint`` class to data.

        If ``fit_columns_model`` is True and there is more than one
        constraint column, this method will also fit a
        ``GaussianMultivariate`` model (``GaussianUnivariate`` marginals) to
        the constraint columns of ``table_data``, after transforming them
        with a ``HyperTransformer`` that one-hot encodes object columns.
        Subclasses can overwrite this method, or overwrite the ``_fit``
        method if they will not be needing the model to handle conditional
        sampling.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        self._fit(table_data)

        if self.fit_columns_model and len(self.constraint_columns) > 1:
            data_to_model = table_data[list(self.constraint_columns)]
            self._hyper_transformer = HyperTransformer(dtype_transformers={
                'O': 'one_hot_encoding',
            })
            transformed_data = self._hyper_transformer.fit_transform(
                data_to_model)
            self._columns_model = GaussianMultivariate(
                distribution=GaussianUnivariate)
            self._columns_model.fit(transformed_data)

    def _transform(self, table_data):
        return table_data

    def _sample_valid_rows(self, num_rows, conditions):
        """Sample ``num_rows`` rows and keep only those ``is_valid`` accepts."""
        sampled = self._columns_model.sample(num_rows=num_rows,
                                             conditions=conditions)
        sampled = self._hyper_transformer.reverse_transform(sampled)
        return sampled[self.is_valid(sampled)]

    def _reject_sample(self, num_rows, conditions):
        """Sample exactly ``num_rows`` valid rows for the given conditions.

        Rows are drawn and filtered repeatedly. After 100 additional draws
        the valid rows collected so far are repeated to reach ``num_rows``.

        Args:
            num_rows (int):
                Number of rows to return.
            conditions (dict):
                Conditions passed through to the columns model's ``sample``.

        Returns:
            pandas.DataFrame:
                The first ``num_rows`` valid rows.

        Raises:
            ValueError:
                If no valid row at all was obtained within 100 trials.
        """
        valid_rows = self._sample_valid_rows(num_rows, conditions)
        counter = 0
        total_sampled = num_rows

        while len(valid_rows) < num_rows:
            num_valid = len(valid_rows)
            if counter >= 100:
                if num_valid == 0:
                    error = 'Could not get enough valid rows within 100 trials.'
                    raise ValueError(error)

                # Give up sampling and pad by repeating the valid rows.
                multiplier = num_rows // num_valid
                num_rows_missing = num_rows % num_valid
                remainder_rows = valid_rows.iloc[0:num_rows_missing, :]
                valid_rows = pd.concat(
                    [valid_rows] * multiplier + [remainder_rows],
                    ignore_index=True)
                break

            # Size the next draw from the acceptance rate observed so far
            # (the +1 terms keep the estimate away from zero), capped at ten
            # times the requested number of rows.
            remaining = num_rows - num_valid
            valid_probability = (num_valid + 1) / (total_sampled + 1)
            max_rows = num_rows * 10
            num_to_sample = min(int(remaining / valid_probability), max_rows)
            total_sampled += num_to_sample

            new_valid_rows = self._sample_valid_rows(num_to_sample, conditions)
            valid_rows = pd.concat([valid_rows, new_valid_rows],
                                   ignore_index=True)
            counter += 1

        return valid_rows.iloc[0:num_rows, :]

    def _sample_constraint_columns(self, table_data):
        """Sample the constraint columns conditioned on those already present.

        Rows are returned in the same order as the rows of ``table_data``
        they were conditioned on, with a fresh ``0..n-1`` index.

        Args:
            table_data (pandas.DataFrame):
                Table data containing a subset of ``constraint_columns``.

        Returns:
            pandas.DataFrame:
                Sampled rows holding the modelled constraint columns.
        """
        condition_columns = [
            c for c in self.constraint_columns if c in table_data.columns
        ]

        # Use positional labels so that every sampled row can be put back at
        # the position of the input row it was conditioned on; ``groupby``
        # iterates groups in sorted key order, not in row order, and the
        # caller attaches the remaining columns of ``table_data`` by index.
        conditions = table_data[condition_columns].reset_index(drop=True)

        all_sampled_rows = []
        for _, group_rows in conditions.groupby(condition_columns):
            # Every row of a group shares the same condition values, so the
            # first transformed row describes the whole group.
            transformed_condition = self._hyper_transformer.transform(
                group_rows).iloc[0].to_dict()
            sampled_rows = self._reject_sample(
                num_rows=group_rows.shape[0],
                conditions=transformed_condition)
            sampled_rows.index = group_rows.index
            all_sampled_rows.append(sampled_rows)

        sampled_data = pd.concat(all_sampled_rows)
        return sampled_data.sort_index().reset_index(drop=True)

    def _validate_constraint_columns(self, table_data):
        """Validate the columns in ``table_data``.

        If no ``constraint_columns`` are missing, ``table_data`` is returned
        unchanged.

        If some are missing and no columns model has been fitted, a
        ``UserWarning`` is issued and a ``MissingConstraintColumnError`` is
        raised. The error is also raised when all the ``constraint_columns``
        are missing.

        Otherwise the missing ``constraint_columns`` are sampled from the
        columns model conditioned on the ``constraint_columns`` that
        ``table_data`` does contain, and the remaining columns of
        ``table_data`` are attached to the sampled rows.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                ``table_data`` itself, or the sampled constraint columns
                together with the other columns of ``table_data``.

        Raises:
            MissingConstraintColumnError:
                If columns are missing and they cannot be sampled.
        """
        missing_columns = [
            col for col in self.constraint_columns
            if col not in table_data.columns
        ]
        if not missing_columns:
            return table_data

        if self._columns_model is None:
            warning_message = (
                'When `fit_columns_model` is False and we are conditioning on a subset '
                'of the constraint columns, conditional sampling uses reject sampling '
                'which can be slow. Changing `fit_columns_model` to True can improve '
                'the performance.')
            warnings.warn(warning_message, UserWarning)

        all_columns_missing = len(missing_columns) == len(
            self.constraint_columns)
        if self._columns_model is None or all_columns_missing:
            raise MissingConstraintColumnError()

        sampled_data = self._sample_constraint_columns(table_data)
        other_columns = [
            c for c in table_data.columns
            if c not in self.constraint_columns
        ]
        sampled_data[other_columns] = table_data[other_columns]
        return sampled_data

    def transform(self, table_data):
        """Perform necessary transformations needed by constraint.

        Subclasses can optionally overwrite this method. If the transformation
        requires certain columns to be present in ``table_data``, then the subclass
        should overwrite the ``_transform`` method instead. This method raises a
        ``MissingConstraintColumnError`` if the ``table_data`` is missing any columns
        needed to do the transformation and they cannot be sampled. Once the
        columns are validated, this method will call the ``_transform`` method.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Result of ``_transform`` on the validated data.
        """
        table_data = self._validate_constraint_columns(table_data)
        return self._transform(table_data)

    def fit_transform(self, table_data):
        """Fit this Constraint to the data and then transform it.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.fit(table_data)
        return self.transform(table_data)

    def reverse_transform(self, table_data):
        """Identity method for completion. To be optionally overwritten by subclasses.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Input data unmodified.
        """
        return table_data

    def is_valid(self, table_data):
        """Say whether the given table rows are valid.

        This is a dummy version of the method that returns a series of ``True``
        values to avoid dropping any rows. This should be overwritten by all
        the subclasses that have a way to decide which rows are valid and which
        are not.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.Series:
                Series of ``True`` values
        """
        return pd.Series(True, index=table_data.index)

    def filter_valid(self, table_data):
        """Get only the rows that are valid.

        The filtering is done by calling the method ``is_valid``, which should
        be overwritten by subclasses, while this method should stay untouched.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                The rows of the input data that ``is_valid`` accepts.
        """
        valid = self.is_valid(table_data)
        invalid = sum(~valid)
        if invalid:
            LOGGER.debug('%s: %s invalid rows out of %s.',
                         self.__class__.__name__, invalid, len(valid))

        # Use the raw boolean values of a Series so the mask is applied by
        # position instead of being aligned on the index.
        if isinstance(valid, pd.Series):
            return table_data[valid.values]

        return table_data[valid]

    @classmethod
    def from_dict(cls, constraint_dict):
        """Build a Constraint object from a dict.

        Args:
            constraint_dict (dict):
                Dict containing the keyword ``constraint`` alongside
                any additional arguments needed to create the instance.

        Returns:
            Constraint:
                New constraint instance.
        """
        constraint_dict = constraint_dict.copy()
        constraint_class = constraint_dict.pop('constraint')
        subclasses = get_subclasses(cls)
        if isinstance(constraint_class, str):
            # A dotted name is imported; a bare name is looked up among the
            # subclasses of ``cls``.
            if '.' in constraint_class:
                constraint_class = import_object(constraint_class)
            else:
                constraint_class = subclasses[constraint_class]

        return constraint_class(**constraint_dict)

    def to_dict(self):
        """Return a dict representation of this Constraint.

        The dictionary will contain the Qualified Name of the constraint
        class in the key ``constraint``, as well as any other arguments
        that were passed to the constructor when the instance was created.

        Returns:
            dict:
                Dict representation of this Constraint.
        """
        constraint_dict = {
            'constraint': _get_qualified_name(self.__class__),
        }

        for key, obj in copy.deepcopy(self.__kwargs__).items():
            if callable(obj) and _module_contains_callable_name(obj):
                constraint_dict[key] = _get_qualified_name(obj)
            else:
                constraint_dict[key] = obj

        return constraint_dict