コード例 #1
0
    def test_sample_random_state(self):
        """Sampling from a copula built with ``random_seed=0`` matches the reference rows.

        The model is fitted on a five-row, three-column frame and five rows are
        sampled; the result is compared against recorded values using the
        reduced precision of ``check_less_precise=True``.
        """
        # Setup
        model = GaussianMultivariate(GaussianUnivariate, random_seed=0)
        training = pd.DataFrame({
            'A': [25, 30, 10, 20, 25],
            'B': [75, 60, 65, 80, 70],
            'C': [100, 250, 350, 150, 500],
        })
        model.fit(training)

        reference_rows = [
            [25.19031668, 61.96527251, 543.43595269],
            [31.50262306, 49.70971698, 429.06537124],
            [20.31636799, 64.3492326, 384.27561823],
            [25.00302427, 72.06019812, 415.85215123],
            [23.07525773, 66.70901743, 390.8226672],
        ]
        expected = pd.DataFrame(reference_rows, columns=['A', 'B', 'C'])

        # Run
        sampled = model.sample(5)

        # Check
        # NOTE(review): ``check_less_precise`` was deprecated in pandas 1.1 in
        # favour of ``rtol``/``atol`` -- confirm against the pinned pandas version.
        pd.testing.assert_frame_equal(sampled, expected, check_less_precise=True)
コード例 #2
0
    def test_sample(self):
        """Samples from the fitted copula keep the training mean and deviation.

        Three normally distributed columns of 100 values each are fitted, one
        million rows are sampled, and for every column the mean and the
        standard deviation of the samples must lie within 1% of the values
        measured on the training data.
        """
        # Setup
        copula = GaussianMultivariate()
        moments = [(10000, 15), (150, 10), (-50, 0.1)]  # (mean, std) per column
        draws = [np.random.normal(mean, std, 100) for mean, std in moments]
        data = pd.DataFrame(draws).T
        copula.fit(data)

        # Run
        result = copula.sample(1000000)

        # Check
        assert result.shape == (1000000, 3)
        for column in data.columns:
            expected_mean = np.mean(data[column])
            expected_std = np.std(data[column])
            mean_error = abs(expected_mean - np.mean(result[column]))
            std_error = abs(expected_std - np.std(result[column]))

            assert mean_error < abs(expected_mean / 100)
            assert std_error < abs(expected_std / 100)
コード例 #3
0
    def test_sample_constant_column(self):
        """A copula fitted on data with a constant column can still be sampled.

        The first column of the training array is always ``1.0``. The sampled
        frame must reproduce that constant, vary on the second column, and the
        fitted covariance must not contain null values.

        This process will raise warnings when computing the covariance matrix
        """
        # Setup
        model = GaussianMultivariate()
        training = np.array([[1.0, varying] for varying in (2.0, 3.0, 4.0, 5.0)])
        model.fit(training)

        # Run
        sampled = model.sample(5)

        # Check
        assert sampled.shape == (5, 2)

        present = ~sampled.isnull()
        assert sampled[present].all().all()

        constant_column = sampled.loc[:, 0]
        assert constant_column.equals(pd.Series([1.0] * 5, name=0))

        # The non constant column must not collapse into a single value too.
        varying_column = sampled.loc[:, 1]
        assert len(varying_column.unique()) > 1

        covariance_is_set = ~pd.isnull(model.covariance)
        assert covariance_is_set.all().all()
0
    def test_sample_random_state(self):
        """Sampling from a copula built with ``random_seed=0`` matches the reference rows.

        The univariate distribution is given by its fully qualified name. The
        five sampled rows are compared to the recorded values with
        ``DataFrame.equals``, i.e. without any tolerance.
        """
        # Setup
        model = GaussianMultivariate(
            distribution='copulas.univariate.gaussian.GaussianUnivariate',
            random_seed=0,
        )
        training = pd.DataFrame({
            'A': [25, 30, 10, 20, 25],
            'B': [75, 60, 65, 80, 70],
            'C': [100, 250, 350, 150, 500],
        })
        model.fit(training)

        reference_rows = [
            [25.566882482769294, 61.01690157277244, 575.71068885087790],
            [32.624255560452110, 47.31477394460025, 447.84049148268970],
            [20.117642182744806, 63.68224998298797, 397.76402526341593],
            [25.357483201156676, 72.30337152729443, 433.06766240515134],
            [23.202174689737113, 66.32056962524452, 405.08384853948280],
        ]
        expected = pd.DataFrame(reference_rows, columns=['A', 'B', 'C'])

        # Run
        sampled = model.sample(5)

        # Check
        assert sampled.equals(expected)
コード例 #5
0
    def test_deprecation_warnings(self):
        """Fitting and sampling the iris data records no warnings.

        Both calls run inside ``warnings.catch_warnings(record=True)``; the
        test passes when nothing was recorded and ten rows were sampled.
        """
        # Setup
        model = GaussianMultivariate()
        iris = pd.read_csv('data/iris.data.csv')

        # Run
        with warnings.catch_warnings(record=True) as recorded:
            model.fit(iris)
            sampled = model.sample(10)

            # Check
            assert not recorded
            assert len(sampled) == 10
コード例 #6
0
    def test_sample(self, normal_mock):
        """Sample use the inverse-transform method to generate new samples.

        ``normal_mock`` is injected by a decorator that is outside this snippet
        (presumably a patch of the multivariate normal sampler used by
        ``GaussianMultivariate.sample`` -- confirm). Its return value is fixed
        so the sampled frame can be compared with precomputed values.

        Args:
            normal_mock (unittest.mock.Mock):
                Mock standing in for the random sampler.
        """
        # Setup
        instance = GaussianMultivariate(GaussianUnivariate)
        data = pd.DataFrame([
            {'A': 25, 'B': 75, 'C': 100},
            {'A': 30, 'B': 60, 'C': 250},
            {'A': 10, 'B': 65, 'C': 350},
            {'A': 20, 'B': 80, 'C': 150},
            {'A': 25, 'B': 70, 'C': 500}
        ])
        instance.fit(data)

        normal_mock.return_value = np.array([
            [0.1, 0.1, 0.1],
            [0.2, 0.2, 0.2],
            [0.4, 0.4, 0.4],
            [0.6, 0.6, 0.6],
            [0.8, 0.8, 0.8]
        ])

        expected_result = pd.DataFrame([
            {'A': 22.678232998312527, 'B': 70.70710678118655, 'C': 284.35270009440734},
            {'A': 23.356465996625055, 'B': 71.41421356237309, 'C': 298.7054001888146},
            {'A': 24.712931993250110, 'B': 72.82842712474618, 'C': 327.4108003776293},
            {'A': 26.069397989875164, 'B': 74.24264068711929, 'C': 356.116200566444},
            {'A': 27.425863986500215, 'B': 75.65685424949238, 'C': 384.8216007552586}
        ])

        # Run
        result = instance.sample(5)

        # Check
        assert result.equals(expected_result)

        # The previous ``assert normal_mock.called_once_with(...)`` verified
        # nothing: ``called_once_with`` is not a Mock assertion, so the attribute
        # access created a truthy child mock and the assert could never fail.
        # Check the call count explicitly instead.
        # NOTE(review): the arguments (a zero mean vector, ``instance.covariance``
        # and the number of rows) are numpy arrays, which
        # ``assert_called_once_with`` cannot compare directly; verify them through
        # ``normal_mock.call_args`` and ``np.testing`` once the exact call
        # signature used by ``sample`` is confirmed.
        assert normal_mock.call_count == 1
コード例 #7
0
# View specification. Each row of P_mat is one view; k_ below is derived from
# the number of rows. The six columns are presumably weights over the columns
# of `data` -- TODO confirm.
P_mat = np.array([[0, -1, 0, 0, 1, 0], [0, -0.5, 1, -0.5, 0, 0]])  # views
mu_v = np.array([0.0006, 0.0007])  # normal view mean (one entry per view)
sigma_v = np.array([0.05, 0.075])  # normal view sigma (one entry per view)
range_v = np.array([[0, 0.01], [0, 0.02]])  # uniform view range (one row per view)
k_ = P_mat.shape[0]  # number of views
Conf_full = ones((k_, 1)) - 1e-6  # full confidence levels (just below 1 for every view)
# NOTE(review): the original comment called this "half confidence levels", but
# the value assigned to every view is 0.25 -- confirm which one is intended.
Conf = ones((k_, 1)) * 0.25  # reduced confidence levels (0.25 for every view)

# Market Information fit and simulation

# Market Prior from Gaussian Copula Simulation
# Fit a Gaussian copula on `data` (defined outside this fragment) and draw
# 1000 samples that are used as the prior.
gc = GaussianMultivariate()
gc.fit(data)
print(gc)
M = gc.sample(1000)
Utility.disp_stat([M], ['Prior'])  # presumably prints summary statistics of the samples
MPrior = M.values  # underlying array of the sampled frame

# Report the per-column mean of the original data under the label 'True'.
name = 'True'
print('Mean_{}'.format(name))
# Per-column means of `data`, transposed after adding a leading axis
# (presumably a column vector -- confirm the type of `data`).
mean = np.expand_dims(data.mean(axis=0), axis=0).T
print(data.mean(axis=0).round(2))
print('-------------------------------------------------\n')
# Report the correlation obtained from the covariance fitted by the copula.
df_cov = pd.DataFrame(data=gc.covariance)
# NOTE(review): cov_2_corr is evaluated twice on the same input; element 0 is
# used as the correlation matrix and element 1 is stored as `gc_sigvec`
# (presumably the vector of standard deviations -- confirm).
gc_corr = Estimation.cov_2_corr(gc.covariance)[0]
gc_sigvec = Estimation.cov_2_corr(gc.covariance)[1]
df_corr = pd.DataFrame(data=gc_corr)
print('Correlation_{}'.format(name))
print(df_corr.round(4))
print('-------------------------------------------------\n')
コード例 #8
0
class ColumnsModel:
    """ColumnsModel class.

    The ``ColumnsModel`` class enables the usage of conditional sampling when a column is a
    ``constraint``.

    Args:
        constraint:
            Object whose ``is_valid(table_data)`` method returns a boolean mask
            of the valid rows; it is used to filter the sampled rows.
        constraint_columns (str or list[str]):
            Name or names of the columns to model. A single name is wrapped
            in a list.
    """

    # Not used inside this class; kept so existing attribute lookups keep working.
    _columns_model = None
    # Both are set by ``fit``.
    _hyper_transformer = None
    _model = None

    def __init__(self, constraint, constraint_columns):
        if isinstance(constraint_columns, list):
            self.constraint_columns = constraint_columns
        else:
            self.constraint_columns = [constraint_columns]

        self.constraint = constraint

    def fit(self, table_data):
        """Fit the ``ColumnsModel``.

        Fit a ``GaussianMultivariate`` model, using ``GaussianUnivariate``
        distributions, to the ``self.constraint_columns`` columns of
        ``table_data`` in order to sample those columns when missing. The
        columns are first transformed with a ``HyperTransformer`` configured
        to use ``OneHotEncodingTransformer`` for categorical data.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        data_to_model = table_data[self.constraint_columns]
        self._hyper_transformer = HyperTransformer(
            default_data_type_transformers={
                'categorical': 'OneHotEncodingTransformer'
            })
        transformed_data = self._hyper_transformer.fit_transform(data_to_model)
        self._model = GaussianMultivariate(distribution=GaussianUnivariate)
        self._model.fit(transformed_data)

    def _sample_valid_rows(self, num_rows, conditions):
        """Sample ``num_rows`` rows and keep only the ones the constraint accepts."""
        sampled = self._model.sample(num_rows=num_rows, conditions=conditions)
        sampled = self._hyper_transformer.reverse_transform(sampled)
        return sampled[self.constraint.is_valid(sampled)]

    def _reject_sample(self, num_rows, conditions):
        """Sample until ``num_rows`` valid rows are available.

        After 100 extra sampling rounds the rows found so far are repeated to
        reach ``num_rows``.

        Args:
            num_rows (int):
                Number of valid rows to return.
            conditions (dict):
                Conditions passed to the model when sampling.

        Returns:
            pandas.DataFrame:
                The first ``num_rows`` valid rows.

        Raises:
            ValueError:
                If no valid row at all was found within 100 trials.
        """
        valid_rows = self._sample_valid_rows(num_rows, conditions)
        counter = 0
        total_sampled = num_rows

        while len(valid_rows) < num_rows:
            num_valid = len(valid_rows)
            if counter >= 100:
                if num_valid == 0:
                    raise ValueError(
                        'Could not get enough valid rows within 100 trials.')

                # Give up sampling: tile the valid rows found so far until
                # ``num_rows`` rows are available.
                multiplier = num_rows // num_valid
                num_rows_missing = num_rows % num_valid
                remainder_rows = valid_rows.iloc[0:num_rows_missing, :]
                valid_rows = pd.concat([valid_rows] * multiplier +
                                       [remainder_rows],
                                       ignore_index=True)
                break

            # Size the next batch from the acceptance rate observed so far
            # (smoothed by +1), capped at ten times the requested rows.
            remaining = num_rows - num_valid
            valid_probability = (num_valid + 1) / (total_sampled + 1)
            max_rows = num_rows * 10
            num_to_sample = min(int(remaining / valid_probability), max_rows)
            total_sampled += num_to_sample
            new_valid_rows = self._sample_valid_rows(num_to_sample, conditions)
            valid_rows = pd.concat([valid_rows, new_valid_rows],
                                   ignore_index=True)
            counter += 1

        return valid_rows.iloc[0:num_rows, :]

    def sample(self, table_data):
        """Sample any missing columns.

        Sample any missing columns, ``self.constraint_columns``, that ``table_data``
        does not contain. The rows of ``table_data`` are grouped by the values
        of the constraint columns that are present, and each group is sampled
        conditioned on those values.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Sampled rows, concatenated group by group with a fresh index.
        """
        condition_columns = [
            c for c in self.constraint_columns if c in table_data.columns
        ]
        grouped_conditions = table_data[condition_columns].groupby(
            condition_columns)
        all_sampled_rows = []
        # The group key is not needed: the condition values are read from the
        # first row of each group.
        for _, df in grouped_conditions:
            transformed_condition = self._hyper_transformer.transform(
                df).iloc[0].to_dict()
            sampled_rows = self._reject_sample(
                num_rows=df.shape[0], conditions=transformed_condition)
            all_sampled_rows.append(sampled_rows)

        return pd.concat(all_sampled_rows, ignore_index=True)
コード例 #9
0
class Constraint(metaclass=ConstraintMeta):
    """Constraint base class.

    This class is not intended to be used directly and should rather be
    subclassed to create different types of constraints.

    The ``handling_strategy`` argument disables part of the constraint by
    replacing methods of the instance with a simple identity function:

    * ``transform``: ``filter_valid`` is replaced.
    * ``reject_sampling``: ``transform`` and ``reverse_transform`` are
      replaced and ``rebuild_columns`` is emptied.
    * ``all``: nothing is replaced.

    Attributes:
        constraint_columns (tuple[str]):
            The names of the columns used by this constraint.
        rebuild_columns (tuple[str]):
            The names of the columns that this constraint will rebuild during
            ``reverse_transform``.
    Args:
        handling_strategy (str):
            How this Constraint should be handled, which can be ``transform``,
            ``reject_sampling`` or ``all``.
        fit_columns_model (bool):
            If False, reject sampling will be used to handle conditional sampling.
            Otherwise, a model will be trained and used to sample other columns
            based on the conditioned column.

    Raises:
        ValueError:
            If ``handling_strategy`` is not one of the values listed above.
    """

    constraint_columns = ()
    rebuild_columns = ()
    _hyper_transformer = None
    _columns_model = None

    def _identity(self, table_data):
        """Return ``table_data`` unchanged; stands in for disabled methods."""
        return table_data

    def __init__(self, handling_strategy, fit_columns_model=False):
        self.fit_columns_model = fit_columns_model
        if handling_strategy == 'transform':
            self.filter_valid = self._identity
        elif handling_strategy == 'reject_sampling':
            self.rebuild_columns = ()
            self.transform = self._identity
            self.reverse_transform = self._identity
        elif handling_strategy != 'all':
            raise ValueError(
                'Unknown handling strategy: {}'.format(handling_strategy))

    def _fit(self, table_data):
        """Fitting hook for subclasses; the base implementation does nothing."""
        del table_data

    def fit(self, table_data):
        """Fit ``Constraint`` class to data.

        Calls ``_fit`` first. Then, if ``fit_columns_model`` is True and the
        constraint has more than one column, this method fits a
        ``GaussianMultivariate`` model with ``GaussianUnivariate``
        distributions to the constraint columns of ``table_data``, after
        transforming them with a ``HyperTransformer``.
        Subclasses can overwrite this method, or overwrite the ``_fit`` method
        if they will not be needing the model to handle conditional sampling.

        Args:
            table_data (pandas.DataFrame):
                Table data.
        """
        self._fit(table_data)

        if self.fit_columns_model and len(self.constraint_columns) > 1:
            data_to_model = table_data[list(self.constraint_columns)]
            self._hyper_transformer = HyperTransformer(dtype_transformers={
                'O': 'one_hot_encoding',
            })
            transformed_data = self._hyper_transformer.fit_transform(
                data_to_model)
            self._columns_model = GaussianMultivariate(
                distribution=GaussianUnivariate)
            self._columns_model.fit(transformed_data)

    def _transform(self, table_data):
        """Transformation hook for subclasses; returns the data unchanged here."""
        return table_data

    def _sample_valid_rows(self, num_rows, conditions):
        """Sample ``num_rows`` rows from the columns model and keep the valid ones."""
        sampled = self._columns_model.sample(num_rows=num_rows,
                                             conditions=conditions)
        sampled = self._hyper_transformer.reverse_transform(sampled)
        return sampled[self.is_valid(sampled)]

    def _reject_sample(self, num_rows, conditions):
        """Sample from the columns model until ``num_rows`` valid rows are available.

        After 100 extra sampling rounds the rows found so far are repeated to
        reach ``num_rows``.

        Args:
            num_rows (int):
                Number of valid rows to return.
            conditions (dict):
                Conditions passed to the columns model when sampling.

        Returns:
            pandas.DataFrame:
                The first ``num_rows`` valid rows.

        Raises:
            ValueError:
                If no valid row at all was found within 100 trials.
        """
        valid_rows = self._sample_valid_rows(num_rows, conditions)
        counter = 0
        total_sampled = num_rows

        while len(valid_rows) < num_rows:
            num_valid = len(valid_rows)
            if counter >= 100:
                if num_valid == 0:
                    error = 'Could not get enough valid rows within 100 trials.'
                    raise ValueError(error)

                # Give up sampling: tile the valid rows found so far until
                # ``num_rows`` rows are available.
                multiplier = num_rows // num_valid
                num_rows_missing = num_rows % num_valid
                remainder_rows = valid_rows.iloc[0:num_rows_missing, :]
                valid_rows = pd.concat([valid_rows] * multiplier +
                                       [remainder_rows],
                                       ignore_index=True)
                break

            # Size the next batch from the acceptance rate observed so far
            # (smoothed by +1), capped at ten times the requested rows.
            remaining = num_rows - num_valid
            valid_probability = (num_valid + 1) / (total_sampled + 1)
            max_rows = num_rows * 10
            num_to_sample = min(int(remaining / valid_probability), max_rows)
            total_sampled += num_to_sample
            new_valid_rows = self._sample_valid_rows(num_to_sample, conditions)
            valid_rows = pd.concat([valid_rows, new_valid_rows],
                                   ignore_index=True)
            counter += 1

        return valid_rows.iloc[0:num_rows, :]

    def _sample_constraint_columns(self, table_data):
        """Sample the constraint columns conditioned on the ones already present.

        The rows of ``table_data`` are grouped by the values of the constraint
        columns it contains and each group is sampled with ``_reject_sample``.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Sampled rows, concatenated group by group with a fresh index.
        """
        condition_columns = [
            c for c in self.constraint_columns if c in table_data.columns
        ]
        grouped_conditions = table_data[condition_columns].groupby(
            condition_columns)
        all_sampled_rows = []
        # The group key is not needed: the condition values are read from the
        # first row of each group.
        for _, df in grouped_conditions:
            transformed_condition = self._hyper_transformer.transform(
                df).iloc[0].to_dict()
            sampled_rows = self._reject_sample(
                num_rows=df.shape[0], conditions=transformed_condition)
            all_sampled_rows.append(sampled_rows)

        return pd.concat(all_sampled_rows, ignore_index=True)

    def _validate_constraint_columns(self, table_data):
        """Validate the columns in ``table_data``.

        If no constraint column is missing, ``table_data`` is returned
        unchanged. Otherwise:

        * If there is no columns model, a ``UserWarning`` is emitted and a
          ``MissingConstraintColumnError`` is raised.
        * If every constraint column is missing, a
          ``MissingConstraintColumnError`` is raised.
        * Else the missing ``constraint_columns`` are sampled from the columns
          model conditioned on the ``constraint_columns`` that ``table_data``
          does contain.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                ``table_data`` itself, or the sampled constraint columns with
                the remaining columns of ``table_data`` added.

        Raises:
            MissingConstraintColumnError:
                If columns are missing and they cannot be sampled.
        """
        missing_columns = [
            col for col in self.constraint_columns
            if col not in table_data.columns
        ]
        if not missing_columns:
            return table_data

        if not self._columns_model:
            warning_message = (
                'When `fit_columns_model` is False and we are conditioning on a subset '
                'of the constraint columns, conditional sampling uses reject sampling '
                'which can be slow. Changing `fit_columns_model` to True can improve '
                'the performance.')
            warnings.warn(warning_message, UserWarning)

        all_columns_missing = len(missing_columns) == len(
            self.constraint_columns)
        if self._columns_model is None or all_columns_missing:
            raise MissingConstraintColumnError()

        sampled_data = self._sample_constraint_columns(table_data)
        other_columns = [
            c for c in table_data.columns
            if c not in self.constraint_columns
        ]
        # NOTE(review): this assignment aligns on index labels, while
        # ``sampled_data`` has a fresh index and is ordered group by group --
        # confirm that callers pass data for which this pairing is intended.
        sampled_data[other_columns] = table_data[other_columns]
        return sampled_data

    def transform(self, table_data):
        """Perform necessary transformations needed by constraint.

        Subclasses can optionally overwrite this method. If the transformation
        requires certain columns to be present in ``table_data``, then the subclass
        should overwrite the ``_transform`` method instead. The columns are
        validated first, which samples or rejects missing constraint columns,
        and the ``_transform`` method is then called on the result.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Output of ``_transform``; the base implementation of
                ``_transform`` returns its input unmodified.

        Raises:
            MissingConstraintColumnError:
                If ``table_data`` is missing constraint columns that cannot
                be sampled.
        """
        table_data = self._validate_constraint_columns(table_data)
        return self._transform(table_data)

    def fit_transform(self, table_data):
        """Fit this Constraint to the data and then transform it.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Transformed data.
        """
        self.fit(table_data)
        return self.transform(table_data)

    def reverse_transform(self, table_data):
        """Identity method for completion. To be optionally overwritten by subclasses.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                Input data unmodified.
        """
        return table_data

    def is_valid(self, table_data):
        """Say whether the given table rows are valid.

        This is a dummy version of the method that returns a series of ``True``
        values to avoid dropping any rows. This should be overwritten by all
        the subclasses that have a way to decide which rows are valid and which
        are not.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.Series:
                Series of ``True`` values
        """
        return pd.Series(True, index=table_data.index)

    def filter_valid(self, table_data):
        """Get only the rows that are valid.

        The filtering is done by calling the method ``is_valid``, which should
        be overwritten by subclasses, while this method should stay untouched.

        Args:
            table_data (pandas.DataFrame):
                Table data.

        Returns:
            pandas.DataFrame:
                The rows of the input data that ``is_valid`` marks as valid.
        """
        valid = self.is_valid(table_data)
        invalid = sum(~valid)
        if invalid:
            LOGGER.debug('%s: %s invalid rows out of %s.',
                         self.__class__.__name__, invalid, len(valid))

        if isinstance(valid, pd.Series):
            # Use the raw values so the mask is applied by position and not
            # aligned on the index of the series.
            return table_data[valid.values]

        return table_data[valid]

    @classmethod
    def from_dict(cls, constraint_dict):
        """Build a Constraint object from a dict.

        The ``constraint`` entry can be a class, a fully qualified name
        (containing a dot) that is imported, or the name of one of the
        subclasses of ``cls``.

        Args:
            constraint_dict (dict):
                Dict containing the keyword ``constraint`` alongside
                any additional arguments needed to create the instance.

        Returns:
            Constraint:
                New constraint instance.
        """
        # Work on a copy so that the caller's dict keeps its ``constraint`` key.
        constraint_dict = constraint_dict.copy()
        constraint_class = constraint_dict.pop('constraint')
        subclasses = get_subclasses(cls)
        if isinstance(constraint_class, str):
            if '.' in constraint_class:
                constraint_class = import_object(constraint_class)
            else:
                constraint_class = subclasses[constraint_class]

        return constraint_class(**constraint_dict)

    def to_dict(self):
        """Return a dict representation of this Constraint.

        The dictionary will contain the Qualified Name of the constraint
        class in the key ``constraint``, as well as the contents of
        ``self.__kwargs__`` (presumably the arguments that were passed to the
        constructor, stored by the metaclass -- confirm). Callables accepted
        by ``_module_contains_callable_name`` are stored by qualified name.

        Returns:
            dict:
                Dict representation of this Constraint.
        """
        constraint_dict = {
            'constraint': _get_qualified_name(self.__class__),
        }

        for key, obj in copy.deepcopy(self.__kwargs__).items():
            if callable(obj) and _module_contains_callable_name(obj):
                constraint_dict[key] = _get_qualified_name(obj)
            else:
                constraint_dict[key] = obj

        return constraint_dict