Code Example #1
File: tvae.py Project: Tecnarca/dpgan-ecb
class TVAESynthesizer(BaseSynthesizer):
    """TVAESynthesizer."""
    def __init__(self,
                 embedding_dim=128,
                 compress_dims=(128, 128),
                 decompress_dims=(128, 128),
                 l2scale=1e-5,
                 batch_size=500,
                 epochs=300):

        self.embedding_dim = embedding_dim
        self.compress_dims = compress_dims
        self.decompress_dims = decompress_dims

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.loss_factor = 2
        self.epochs = epochs

        self._device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

    def fit(self, train_data, discrete_columns=tuple()):
        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)
        dataset = TensorDataset(
            torch.from_numpy(train_data.astype('float32')).to(self._device))
        loader = DataLoader(dataset,
                            batch_size=self.batch_size,
                            shuffle=True,
                            drop_last=True)

        data_dim = self.transformer.output_dimensions
        encoder = Encoder(data_dim, self.compress_dims,
                          self.embedding_dim).to(self._device)
        self.decoder = Decoder(self.embedding_dim, self.compress_dims,
                               data_dim).to(self._device)
        optimizerAE = Adam(list(encoder.parameters()) +
                           list(self.decoder.parameters()),
                           weight_decay=self.l2scale)

        for i in range(self.epochs):
            for id_, data in enumerate(loader):
                optimizerAE.zero_grad()
                real = data[0].to(self._device)
                mu, std, logvar = encoder(real)
                eps = torch.randn_like(std)
                emb = eps * std + mu
                rec, sigmas = self.decoder(emb)
                loss_1, loss_2 = loss_function(
                    rec, real, sigmas, mu, logvar,
                    self.transformer.output_info_list, self.loss_factor)
                loss = loss_1 + loss_2
                loss.backward()
                optimizerAE.step()
                self.decoder.sigma.data.clamp_(0.01, 1.0)

    def sample(self, samples):
        self.decoder.eval()

        steps = samples // self.batch_size + 1
        data = []
        for _ in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            noise = torch.normal(mean=mean, std=std).to(self._device)
            fake, sigmas = self.decoder(noise)
            fake = torch.tanh(fake)
            data.append(fake.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:samples]
        return self.transformer.inverse_transform(
            data,
            sigmas.detach().cpu().numpy())

    def set_device(self, device):
        self._device = device
        self.decoder.to(self._device)
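
A minimal usage sketch for the class above (not part of the original file). The import path and the toy DataFrame are assumptions for illustration; only the constructor, fit() and sample() calls mirror the methods defined in the excerpt.

import pandas as pd

from tvae import TVAESynthesizer  # assumed module path, based on the file name above

# Hypothetical toy table with one continuous and one discrete column.
df = pd.DataFrame({
    'age': [23, 45, 31, 52, 38, 27, 60, 19],
    'job': ['a', 'b', 'a', 'c', 'b', 'a', 'c', 'b'],
})

synth = TVAESynthesizer(embedding_dim=128, batch_size=4, epochs=5)
synth.fit(df, discrete_columns=('job',))  # fits the DataTransformer, encoder and decoder
synthetic = synth.sample(16)              # decodes standard-normal noise into 16 rows
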
Code Example #2
class CTGANSynthesizer(BaseSynthesizer):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.
    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.
    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        generator_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        discriminator_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        generator_lr (float):
            Learning rate for the generator. Defaults to 2e-4.
        generator_decay (float):
            Generator weight decay for the Adam Optimizer. Defaults to 1e-6.
        discriminator_lr (float):
            Learning rate for the discriminator. Defaults to 2e-4.
        discriminator_decay (float):
            Discriminator weight decay for the Adam Optimizer. Defaults to 0.
        batch_size (int):
            Number of data samples to process in each step.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper
            default is 5. Default used is 1 to match original CTGAN implementation.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        verbose (boolean):
            Whether to have print statements for progress results. Defaults to ``False``.
        epochs (int):
            Number of training epochs. Defaults to 300.
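        pack (int):
            Number of samples grouped together when applying the discriminator.
            Defaults to 1.
        epsilon (float):
            Target differential privacy budget; training stops once the spent
            budget reaches this value. Defaults to 10.
        delta (float):
            Delta used when computing the spent privacy budget. Defaults to 1e-5.
        noise_multiplier (float):
            Noise multiplier passed to the Opacus ``PrivacyEngine``. Defaults to 2.
        max_grad_norm (float):
            Per-sample gradient clipping norm passed to the Opacus
            ``PrivacyEngine``. Defaults to 1.
        dp (boolean):
            Whether to train the discriminator with differential privacy.
            Defaults to ``True``.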
    """
    def __init__(self,
                 embedding_dim=128,
                 generator_dim=(256, 256),
                 discriminator_dim=(256, 256),
                 generator_lr=2e-4,
                 generator_decay=1e-6,
                 discriminator_lr=2e-4,
                 discriminator_decay=0,
                 pack=1,
                 batch_size=500,
                 discriminator_steps=1,
                 log_frequency=True,
                 verbose=False,
                 epochs=300,
                 epsilon=10,
                 delta=1e-5,
                 noise_multiplier=2,
                 max_grad_norm=1,
                 dp=True):

        assert batch_size % 2 == 0

        self._embedding_dim = embedding_dim
        self._generator_dim = generator_dim
        self._discriminator_dim = discriminator_dim

        self._generator_lr = generator_lr
        self._generator_decay = generator_decay
        self._discriminator_lr = discriminator_lr
        self._discriminator_decay = discriminator_decay

        self._pack = pack  # option added on top of the original CTGAN to control discriminator sample packing
        self._batch_size = batch_size
        self._discriminator_steps = discriminator_steps
        self._log_frequency = log_frequency
        self._verbose = verbose
        self._epochs = epochs
        self._epsilon = epsilon
        self._device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.trained_epochs = 0
        self.trained_epsilon = 0
        self._delta = delta
        self._noise_multiplier = noise_multiplier
        self.max_grad_norm = max_grad_norm
        self._dp = dp
        opacus.supported_layers_grad_samplers._create_or_extend_grad_sample = _custom_create_or_extend_grad_sample

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Deals with the instability of the gumbel_softmax for older versions of torch.

        For more details about the issue:
        https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing
        Args:
            logits:
                […, num_features] unnormalized log probabilities
            tau:
                non-negative scalar temperature
            hard:
                if True, the returned samples will be discretized as one-hot vectors,
                but will be differentiated as if it is the soft sample in autograd
            dim (int):
                a dimension along which softmax will be computed. Default: -1.
        Returns:
            Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
        """
        if version.parse(torch.__version__) < version.parse("1.2.0"):
            for i in range(10):
                transformed = functional.gumbel_softmax(logits,
                                                        tau=tau,
                                                        hard=hard,
                                                        eps=eps,
                                                        dim=dim)
                if not torch.isnan(transformed).any():
                    return transformed
            raise ValueError("gumbel_softmax returning NaN.")

        return functional.gumbel_softmax(logits,
                                         tau=tau,
                                         hard=hard,
                                         eps=eps,
                                         dim=dim)

    def _apply_activate(self, data):
        """Apply proper activation function to the output of the generator."""
        data_t = []
        st = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if span_info.activation_fn == 'tanh':
                    ed = st + span_info.dim
                    data_t.append(torch.tanh(data[:, st:ed]))
                    st = ed
                elif span_info.activation_fn == 'softmax':
                    ed = st + span_info.dim
                    transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                    data_t.append(transformed)
                    st = ed
                else:
                    raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        """Compute the cross entropy loss on the fixed discrete column."""
        loss = []
        st = 0
        st_c = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if len(column_info
                       ) != 1 or span_info.activation_fn != "softmax":
                    # not discrete column
                    st += span_info.dim
                else:
                    ed = st + span_info.dim
                    ed_c = st_c + span_info.dim
                    tmp = functional.cross_entropy(data[:, st:ed],
                                                   torch.argmax(c[:,
                                                                  st_c:ed_c],
                                                                dim=1),
                                                   reduction='none')
                    loss.append(tmp)
                    st = ed
                    st_c = ed_c

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def _validate_discrete_columns(self, train_data, discrete_columns):
        """Check whether ``discrete_columns`` exists in ``train_data``.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        if isinstance(train_data, pd.DataFrame):
            invalid_columns = set(discrete_columns) - set(train_data.columns)
        elif isinstance(train_data, np.ndarray):
            invalid_columns = []
            for column in discrete_columns:
                if column < 0 or column >= train_data.shape[1]:
                    invalid_columns.append(column)
        else:
            raise TypeError(
                '``train_data`` should be either pd.DataFrame or np.array.')

        if invalid_columns:
            raise ValueError(
                'Invalid columns found: {}'.format(invalid_columns))

    def fit(self,
            train_data,
            discrete_columns=tuple(),
            epochs=None,
            epsilon=None):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        self._validate_discrete_columns(train_data, discrete_columns)

        if epochs is None:
            epochs = self._epochs
        if epsilon is None:
            epsilon = self._epsilon
        if not self._dp:
            self.trained_epsilon = float("inf")

        self._transformer = DataTransformer()
        self._transformer.fit(train_data, discrete_columns)

        train_data = self._transformer.transform(train_data)

        self._data_sampler = DataSampler(train_data,
                                         self._transformer.output_info_list,
                                         self._log_frequency)

        data_dim = self._transformer.output_dimensions

        self._generator = Generator(
            self._embedding_dim + self._data_sampler.dim_cond_vec(),
            self._generator_dim, data_dim).to(self._device)

        self._discriminator = Discriminator(
            data_dim + self._data_sampler.dim_cond_vec(),
            self._discriminator_dim, self._pack).to(self._device)

        self._optimizerG = optim.Adam(self._generator.parameters(),
                                      lr=self._generator_lr,
                                      betas=(0.5, 0.9),
                                      weight_decay=self._generator_decay)

        self._optimizerD = optim.Adam(self._discriminator.parameters(),
                                      lr=self._discriminator_lr,
                                      betas=(0.5, 0.9),
                                      weight_decay=self._discriminator_decay)

        if self._dp:
            self._privacy_engine = PrivacyEngine(
                self._discriminator,
                self._batch_size / self._pack,
                len(train_data),
                alphas=[1 + x / 10.0
                        for x in range(1, 100)] + list(range(12, 64)),
                noise_multiplier=self._noise_multiplier,
                max_grad_norm=self.max_grad_norm,
                clip_per_layer=True,
                loss_reduction="sum",
            )
            self._privacy_engine.attach(self._optimizerD)

        mean = torch.zeros(self._batch_size,
                           self._embedding_dim,
                           device=self._device)
        std = mean + 1
        one = torch.tensor(1, dtype=torch.float).to(self._device)
        mone = one * -1

        steps_per_epoch = max(len(train_data) // self._batch_size, 1)
        for i in range(epochs):
            self.trained_epochs += 1

            if self._dp:
                if self.trained_epsilon >= epsilon:
                    print(
                        "Privacy budget of {:.2f} exhausted. Please specify a higher one in fit() to train more or disable differential privacy."
                        .format(epsilon))
                    return

            for id_ in range(steps_per_epoch):

                for n in range(self._discriminator_steps):
                    fakez = torch.normal(mean=mean, std=std)

                    condvec = self._data_sampler.sample_condvec(
                        self._batch_size)
                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = self._data_sampler.sample_data(
                            self._batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self._device)
                        m1 = torch.from_numpy(m1).to(self._device)
                        fakez = torch.cat([fakez, c1], dim=1)

                        perm = np.arange(self._batch_size)
                        np.random.shuffle(perm)
                        real = self._data_sampler.sample_data(
                            self._batch_size, col[perm], opt[perm])
                        c2 = c1[perm]

                    fake = self._generator(fakez)
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype('float32')).to(
                        self._device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fakeact  # use the activated generator output, matching the conditional branch

                    self._optimizerD.zero_grad()

                    y_fake = self._discriminator(fake_cat)
                    y_real = self._discriminator(real_cat)

                    if not self._dp:
                        pen = self._discriminator.calc_gradient_penalty(
                            real_cat, fake_cat, self._device)
                        pen.backward(retain_graph=True)
                    loss_d = -torch.mean(y_real) + torch.mean(y_fake)

                    loss_d.backward()
                    self._optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self._data_sampler.sample_condvec(self._batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self._device)
                    m1 = torch.from_numpy(m1).to(self._device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self._generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = self._discriminator(
                        torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = self._discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -torch.mean(y_fake) + cross_entropy

                self._optimizerG.zero_grad()
                loss_g.backward()
                self._optimizerG.step()

                if self._dp:
                    for p in self._discriminator.parameters():
                        if hasattr(p, "grad_sample"):
                            del p.grad_sample

                    self.trained_epsilon, best_alpha = self._optimizerD.privacy_engine.get_privacy_spent(
                        self._delta)
                    if self.trained_epsilon >= epsilon:
                        print(
                            "Privacy budget of {:.2f} exausthed, training halted. Best alpha: {:.2f}"
                            .format(epsilon, best_alpha))
                        return

            if self._verbose:
                print(
                    f"Epoch {i+1}, epslion {self.trained_epsilon: .2f}, Loss G: {loss_g.detach().cpu(): .4f}, "
                    f"Loss D: {loss_d.detach().cpu(): .4f}",
                    flush=True)

        if self._dp:
            self._privacy_engine.detach()

    def sample(self, n, condition_column=None, condition_value=None):
        """Sample data similar to the training data.

        Choosing a condition_column and condition_value will increase the probability of the
        discrete condition_value happening in the condition_column.
        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to increase the
                probability of happening.
        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self._transformer.convert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info(
                condition_info, self._batch_size)
        else:
            global_condition_vec = None

        steps = n // self._batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self._batch_size, self._embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self._device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self._data_sampler.sample_original_condvec(
                    self._batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self._device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self._generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self._transformer.inverse_transform(data)

    def set_device(self, device):
        self._device = device
        if hasattr(self, '_generator'):
            self._generator.to(self._device)
        if hasattr(self, '_discriminator'):
            self._discriminator.to(self._device)
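
A minimal usage sketch for the differentially private variant above (not part of the original source). The import path and the toy DataFrame are assumptions; the constructor arguments, the fit() call and the trained_epsilon attribute come from the class defined in the excerpt.

import pandas as pd

from ctgan import CTGANSynthesizer  # assumed import path for the DP variant above

# Hypothetical toy table with one continuous and one discrete column.
df = pd.DataFrame({
    'amount': [10.5, 3.2, 7.7, 1.1, 9.4, 2.8],
    'label':  ['x', 'y', 'x', 'y', 'x', 'y'],
})

# Train the discriminator with DP-SGD (via Opacus) until either the epsilon budget
# is spent or the requested number of epochs is reached.
synth = CTGANSynthesizer(epochs=20, batch_size=6, dp=True,
                         epsilon=5, delta=1e-5,
                         noise_multiplier=2, max_grad_norm=1)
synth.fit(df, discrete_columns=['label'])

print(synth.trained_epsilon)    # privacy budget actually consumed during training
synthetic = synth.sample(100)   # decode noise (plus conditional vectors) into rows
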
Code Example #3
class CTGANSynthesizer(BaseSynthesizer):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.
    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        generator_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        discriminator_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        generator_lr (float):
            Learning rate for the generator. Defaults to 2e-4.
        generator_decay (float):
            Generator weight decay for the Adam Optimizer. Defaults to 1e-6.
        discriminator_lr (float):
            Learning rate for the discriminator. Defaults to 2e-4.
        discriminator_decay (float):
            Discriminator weight decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper
            default is 5. Default used is 1 to match original CTGAN implementation.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        verbose (boolean):
            Whether to have print statements for progress results. Defaults to ``False``.
        epochs (int):
            Number of training epochs. Defaults to 300.
        pac (int):
            Number of samples to group together when applying the discriminator.
            Defaults to 10.
        cuda (bool):
            Whether to attempt to use cuda for GPU computation.
            If this is False or CUDA is not available, CPU will be used.
            Defaults to ``True``.
    """

    def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256),
                 generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4,
                 discriminator_decay=1e-6, batch_size=500, discriminator_steps=1,
                 log_frequency=True, verbose=False, epochs=300, pac=10, cuda=True):

        assert batch_size % 2 == 0

        self._embedding_dim = embedding_dim
        self._generator_dim = generator_dim
        self._discriminator_dim = discriminator_dim

        self._generator_lr = generator_lr
        self._generator_decay = generator_decay
        self._discriminator_lr = discriminator_lr
        self._discriminator_decay = discriminator_decay

        self._batch_size = batch_size
        self._discriminator_steps = discriminator_steps
        self._log_frequency = log_frequency
        self._verbose = verbose
        self._epochs = epochs
        self.pac = pac

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'

        self._device = torch.device(device)

        self._transformer = None
        self._data_sampler = None
        self._generator = None

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Deals with the instability of the gumbel_softmax for older versions of torch.

        For more details about the issue:
        https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing

        Args:
            logits […, num_features]:
                Unnormalized log probabilities
            tau:
                Non-negative scalar temperature
            hard (bool):
                If True, the returned samples will be discretized as one-hot vectors,
                but will be differentiated as if it is the soft sample in autograd
            dim (int):
                A dimension along which softmax will be computed. Default: -1.

        Returns:
            Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
        """
        if version.parse(torch.__version__) < version.parse('1.2.0'):
            for i in range(10):
                transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard,
                                                        eps=eps, dim=dim)
                if not torch.isnan(transformed).any():
                    return transformed
            raise ValueError('gumbel_softmax returning NaN.')

        return functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)

    def _apply_activate(self, data):
        """Apply proper activation function to the output of the generator."""
        data_t = []
        st = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if span_info.activation_fn == 'tanh':
                    ed = st + span_info.dim
                    data_t.append(torch.tanh(data[:, st:ed]))
                    st = ed
                elif span_info.activation_fn == 'softmax':
                    ed = st + span_info.dim
                    transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                    data_t.append(transformed)
                    st = ed
                else:
                    raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        """Compute the cross entropy loss on the fixed discrete column."""
        loss = []
        st = 0
        st_c = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if len(column_info) != 1 or span_info.activation_fn != 'softmax':
                    # not discrete column
                    st += span_info.dim
                else:
                    ed = st + span_info.dim
                    ed_c = st_c + span_info.dim
                    tmp = functional.cross_entropy(
                        data[:, st:ed],
                        torch.argmax(c[:, st_c:ed_c], dim=1),
                        reduction='none'
                    )
                    loss.append(tmp)
                    st = ed
                    st_c = ed_c

        loss = torch.stack(loss, dim=1)  # noqa: PD013

        return (loss * m).sum() / data.size()[0]

    def _validate_discrete_columns(self, train_data, discrete_columns):
        """Check whether ``discrete_columns`` exists in ``train_data``.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        if isinstance(train_data, pd.DataFrame):
            invalid_columns = set(discrete_columns) - set(train_data.columns)
        elif isinstance(train_data, np.ndarray):
            invalid_columns = []
            for column in discrete_columns:
                if column < 0 or column >= train_data.shape[1]:
                    invalid_columns.append(column)
        else:
            raise TypeError('``train_data`` should be either pd.DataFrame or np.array.')

        if invalid_columns:
            raise ValueError(f'Invalid columns found: {invalid_columns}')

    @random_state
    def fit(self, train_data, discrete_columns=(), epochs=None):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        self._validate_discrete_columns(train_data, discrete_columns)

        if epochs is None:
            epochs = self._epochs
        else:
            warnings.warn(
                ('`epochs` argument in `fit` method has been deprecated and will be removed '
                 'in a future version. Please pass `epochs` to the constructor instead'),
                DeprecationWarning
            )

        self._transformer = DataTransformer()
        self._transformer.fit(train_data, discrete_columns)

        train_data = self._transformer.transform(train_data)

        self._data_sampler = DataSampler(
            train_data,
            self._transformer.output_info_list,
            self._log_frequency)

        data_dim = self._transformer.output_dimensions

        self._generator = Generator(
            self._embedding_dim + self._data_sampler.dim_cond_vec(),
            self._generator_dim,
            data_dim
        ).to(self._device)

        discriminator = Discriminator(
            data_dim + self._data_sampler.dim_cond_vec(),
            self._discriminator_dim,
            pac=self.pac
        ).to(self._device)

        optimizerG = optim.Adam(
            self._generator.parameters(), lr=self._generator_lr, betas=(0.5, 0.9),
            weight_decay=self._generator_decay
        )

        optimizerD = optim.Adam(
            discriminator.parameters(), lr=self._discriminator_lr,
            betas=(0.5, 0.9), weight_decay=self._discriminator_decay
        )

        mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self._batch_size, 1)
        for i in range(epochs):
            for id_ in range(steps_per_epoch):

                for n in range(self._discriminator_steps):
                    fakez = torch.normal(mean=mean, std=std)

                    condvec = self._data_sampler.sample_condvec(self._batch_size)
                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = self._data_sampler.sample_data(self._batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self._device)
                        m1 = torch.from_numpy(m1).to(self._device)
                        fakez = torch.cat([fakez, c1], dim=1)

                        perm = np.arange(self._batch_size)
                        np.random.shuffle(perm)
                        real = self._data_sampler.sample_data(
                            self._batch_size, col[perm], opt[perm])
                        c2 = c1[perm]

                    fake = self._generator(fakez)
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype('float32')).to(self._device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fakeact

                    y_fake = discriminator(fake_cat)
                    y_real = discriminator(real_cat)

                    pen = discriminator.calc_gradient_penalty(
                        real_cat, fake_cat, self._device, self.pac)
                    loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                    optimizerD.zero_grad()
                    pen.backward(retain_graph=True)
                    loss_d.backward()
                    optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self._data_sampler.sample_condvec(self._batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self._device)
                    m1 = torch.from_numpy(m1).to(self._device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self._generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -torch.mean(y_fake) + cross_entropy

                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()

            if self._verbose:
                print(f'Epoch {i+1}, Loss G: {loss_g.detach().cpu(): .4f}, '  # noqa: T001
                      f'Loss D: {loss_d.detach().cpu(): .4f}',
                      flush=True)

    @random_state
    def sample(self, n, condition_column=None, condition_value=None):
        """Sample data similar to the training data.

        Choosing a condition_column and condition_value will increase the probability of the
        discrete condition_value happening in the condition_column.

        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to increase the
                probability of happening.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self._transformer.convert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info(
                condition_info, self._batch_size)
        else:
            global_condition_vec = None

        steps = n // self._batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self._batch_size, self._embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self._device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self._data_sampler.sample_original_condvec(self._batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self._device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self._generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self._transformer.inverse_transform(data)

    def set_device(self, device):
        """Set the `device` to be used ('GPU' or 'CPU)."""
        self._device = device
        if self._generator is not None:
            self._generator.to(self._device)
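
A minimal usage sketch for the class above. The import path and the toy DataFrame are assumptions for this excerpt; fit(), conditional sample() and set_device() are the methods defined above.

import pandas as pd
import torch

from ctgan import CTGANSynthesizer  # assumed import path for the class above

# Hypothetical toy table with one continuous and one discrete column.
df = pd.DataFrame({
    'income': [52.0, 31.5, 78.2, 45.1, 60.3, 29.9],
    'status': ['single', 'married', 'single', 'married', 'single', 'married'],
})

synth = CTGANSynthesizer(epochs=10, batch_size=6, pac=2, cuda=False)
synth.fit(df, discrete_columns=['status'])

synthetic = synth.sample(100)  # unconditional sampling
married = synth.sample(        # bias sampling towards one category of a discrete column
    100, condition_column='status', condition_value='married')

synth.set_device(torch.device('cpu'))  # move the generator to another device if needed
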
Code Example #4
File: ctgan.py Project: MLjungg/CTGAN_M-Thesis
class CTGANSynthesizer(BaseSynthesizer):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.
    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.
    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        generator_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        discriminator_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        generator_lr (float):
            Learning rate for the generator. Defaults to 2e-4.
        generator_decay (float):
            Generator weight decay for the Adam Optimizer. Defaults to 1e-6.
        discriminator_lr (float):
            Learning rate for the discriminator. Defaults to 2e-4.
        discriminator_decay (float):
            Discriminator weight decay for the Adam Optimizer. Defaults to 0.
        batch_size (int):
            Number of data samples to process in each step.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper
            default is 5. Default used is 1 to match original CTGAN implementation.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        verbose (boolean):
            Whether to have print statements for progress results. Defaults to ``False``.
        epochs (int):
            Number of training epochs. Defaults to 300.
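        external_eval_target (string or boolean):
            Name of the target column used for the periodic external evaluation
            (correlation, detection and ML efficacy scores) during training, or
            ``False`` to disable the evaluation. Defaults to ``False``.
        adaptive_training (boolean):
            Whether to adaptively alternate discriminator and generator updates
            based on the relative change of their losses instead of the fixed
            ``discriminator_steps`` schedule. Defaults to ``True``.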
    """
    def __init__(self,
                 embedding_dim=128,
                 generator_dim=(256, 256),
                 discriminator_dim=(256, 256),
                 generator_lr=2e-4,
                 generator_decay=1e-6,
                 discriminator_lr=2e-4,
                 discriminator_decay=0,
                 batch_size=500,
                 discriminator_steps=1,
                 log_frequency=True,
                 verbose=False,
                 epochs=300,
                 external_eval_target=False,
                 adaptive_training=True):

        assert batch_size % 2 == 0

        self._embedding_dim = embedding_dim
        self._generator_dim = generator_dim
        self._discriminator_dim = discriminator_dim

        self._generator_lr = generator_lr
        self._generator_decay = generator_decay
        self._discriminator_lr = discriminator_lr
        self._discriminator_decay = discriminator_decay

        self._batch_size = batch_size
        self._discriminator_steps = discriminator_steps
        self._log_frequency = log_frequency
        self._verbose = verbose
        self._epochs = epochs
        self._device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self._external_eval = False if external_eval_target == False else {
            "best_score": -np.inf,
            "correlation_scores": [],
            "detection_scores": [],
            "ml_efficacy_scores": {
                "tree": [],
                "adaboost": [],
                "regression": [],
                "mlp": [],
                "ml_efficacy": []
            },
            "target": external_eval_target
        }
        self._adaptive_training = False if adaptive_training == False else {
            "r_d": np.random.random(),
            "r_g": np.random.random(),
            "prev_loss_g": np.random.random(),
            "prev_loss_d": np.random.random(),
            "lambda": 1 / 3,
        }

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Deals with the instability of the gumbel_softmax for older versions of torch.

        For more details about the issue:
        https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing
        Args:
            logits:
                […, num_features] unnormalized log probabilities
            tau:
                non-negative scalar temperature
            hard:
                if True, the returned samples will be discretized as one-hot vectors,
                but will be differentiated as if it is the soft sample in autograd
            dim (int):
                a dimension along which softmax will be computed. Default: -1.
        Returns:
            Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
        """
        if version.parse(torch.__version__) < version.parse("1.2.0"):
            for i in range(10):
                transformed = functional.gumbel_softmax(logits,
                                                        tau=tau,
                                                        hard=hard,
                                                        eps=eps,
                                                        dim=dim)
                if not torch.isnan(transformed).any():
                    return transformed
            raise ValueError("gumbel_softmax returning NaN.")

        return functional.gumbel_softmax(logits,
                                         tau=tau,
                                         hard=hard,
                                         eps=eps,
                                         dim=dim)

    def _apply_activate(self, data):
        """Apply proper activation function to the output of the generator."""
        data_t = []
        st = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if span_info.activation_fn == 'tanh':
                    ed = st + span_info.dim
                    data_t.append(torch.tanh(data[:, st:ed]))
                    st = ed
                elif span_info.activation_fn == 'softmax':
                    ed = st + span_info.dim
                    transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                    data_t.append(transformed)
                    st = ed
                else:
                    raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        """Compute the cross entropy loss on the fixed discrete column."""
        loss = []
        st = 0
        st_c = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if len(column_info
                       ) != 1 or span_info.activation_fn != "softmax":
                    # not discrete column
                    st += span_info.dim
                else:
                    ed = st + span_info.dim
                    ed_c = st_c + span_info.dim
                    tmp = functional.cross_entropy(data[:, st:ed],
                                                   torch.argmax(c[:,
                                                                  st_c:ed_c],
                                                                dim=1),
                                                   reduction='none')
                    loss.append(tmp)
                    st = ed
                    st_c = ed_c

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def _validate_discrete_columns(self, train_data, discrete_columns):
        """Check whether ``discrete_columns`` exists in ``train_data``.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        if isinstance(train_data, pd.DataFrame):
            invalid_columns = set(discrete_columns) - set(train_data.columns)
        elif isinstance(train_data, np.ndarray):
            invalid_columns = []
            for column in discrete_columns:
                if column < 0 or column >= train_data.shape[1]:
                    invalid_columns.append(column)
        else:
            raise TypeError(
                '``train_data`` should be either pd.DataFrame or np.array.')

        if invalid_columns:
            raise ValueError(
                'Invalid columns found: {}'.format(invalid_columns))

    def fit(self,
            train_data,
            discrete_columns=tuple(),
            epochs=None,
            metadata_top_layer=None):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        #self._validate_discrete_columns(train_data, discrete_columns)

        if epochs is None:
            epochs = self._epochs
        else:
            warnings.warn((
                '`epochs` argument in `fit` method has been deprecated and will be removed '
                'in a future version. Please pass `epochs` to the constructor instead'
            ), DeprecationWarning)

        self._transformer = DataTransformer()
        self._transformer.fit(train_data, discrete_columns)

        # Data structures for the intermediate eval function
        if self._external_eval:
            original_training_data = train_data.copy()
            self._external_eval["eval_size"] = min(len(train_data), 10000)

        train_data = self._transformer.transform(train_data)
        self._data_sampler = DataSampler(train_data,
                                         self._transformer.output_info_list,
                                         self._log_frequency)

        data_dim = self._transformer.output_dimensions

        self._generator = Generator(
            self._embedding_dim + self._data_sampler.dim_cond_vec(),
            self._generator_dim, data_dim).to(self._device)

        self._discriminator = Discriminator(
            data_dim + self._data_sampler.dim_cond_vec(),
            self._discriminator_dim).to(self._device)

        self._optimizerG = optim.Adam(self._generator.parameters(),
                                      lr=self._generator_lr,
                                      betas=(0.5, 0.9),
                                      weight_decay=self._generator_decay)

        self._optimizerD = optim.Adam(self._discriminator.parameters(),
                                      lr=self._discriminator_lr,
                                      betas=(0.5, 0.9),
                                      weight_decay=self._discriminator_decay)

        mean = torch.zeros(self._batch_size,
                           self._embedding_dim,
                           device=self._device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self._batch_size, 1)
        for i in range(epochs):
            for id_ in range(steps_per_epoch):
                if self._adaptive_training:
                    loss_g = self.calc_loss_g(mean, std)
                    pen, loss_d = self.calc_loss_d(mean, std)
                    if (self._adaptive_training["r_d"] >=
                        (self._adaptive_training["lambda"] *
                         self._adaptive_training["r_g"])):
                        self._optimizerD.zero_grad()
                        pen.backward(retain_graph=True)
                        loss_d.backward()
                        self._optimizerD.step()
                    else:
                        self._optimizerG.zero_grad()
                        loss_g.backward()
                        self._optimizerG.step()

                    loss_d = loss_d.detach().item()
                    loss_g = loss_g.detach().item()
                    self._adaptive_training["r_d"] = np.abs(
                        (loss_d - self._adaptive_training["prev_loss_d"]) /
                        self._adaptive_training["prev_loss_d"])
                    self._adaptive_training["r_g"] = np.abs(
                        (loss_g - self._adaptive_training["prev_loss_g"]) /
                        self._adaptive_training["prev_loss_g"])
                    self._adaptive_training["prev_loss_g"] = loss_g
                    self._adaptive_training["prev_loss_d"] = loss_d
                else:
                    for n in range(self._discriminator_steps):
                        pen, loss_d = self.calc_loss_d(mean, std)
                        self._optimizerD.zero_grad()
                        pen.backward(retain_graph=True)
                        loss_d.backward()
                        self._optimizerD.step()

                    loss_g = self.calc_loss_g(mean, std)
                    self._optimizerG.zero_grad()
                    loss_g.backward()
                    self._optimizerG.step()

            if self._verbose:
                print("Epoch " + str(i + 1))

            if self._external_eval != False:
                if i % 10 == 0:
                    # Reverse data back to its original format to compute external eval scores
                    real_data = original_training_data.sample(
                        self._external_eval["eval_size"]).reset_index()
                    real_data = metadata_top_layer.reverse_transform(real_data)
                    synthetic_data = self.sample(
                        self._external_eval["eval_size"])
                    synthetic_data = metadata_top_layer.reverse_transform(
                        synthetic_data)
                    self.evaluate(synthetic_data, real_data, i + 1)

        if self._external_eval != False:
            self._generator = self._external_eval["best_generator"]

    def calc_loss_d(self, mean, std):
        fakez = torch.normal(mean=mean, std=std)

        condvec = self._data_sampler.sample_condvec(self._batch_size)
        if condvec is None:
            c1, m1, col, opt = None, None, None, None
            real = self._data_sampler.sample_data(self._batch_size, col, opt)
        else:
            c1, m1, col, opt = condvec
            c1 = torch.from_numpy(c1).to(self._device)
            m1 = torch.from_numpy(m1).to(self._device)
            fakez = torch.cat([fakez, c1], dim=1)

            perm = np.arange(self._batch_size)
            np.random.shuffle(perm)
            real = self._data_sampler.sample_data(self._batch_size, col[perm],
                                                  opt[perm])
            c2 = c1[perm]

        fake = self._generator(fakez)
        fakeact = self._apply_activate(fake)

        real = torch.from_numpy(real.astype('float32')).to(self._device)

        if c1 is not None:
            fake_cat = torch.cat([fakeact, c1], dim=1)
            real_cat = torch.cat([real, c2], dim=1)
        else:
            real_cat = real
            fake_cat = fakeact  # use the activated generator output, matching the conditional branch

        y_fake = self._discriminator(fake_cat)
        y_real = self._discriminator(real_cat)

        pen = self._discriminator.calc_gradient_penalty(
            real_cat, fake_cat, self._device)
        loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

        return pen, loss_d

    def calc_loss_g(self, mean, std):
        fakez = torch.normal(mean=mean, std=std)
        condvec = self._data_sampler.sample_condvec(self._batch_size)

        if condvec is None:
            c1, m1, col, opt = None, None, None, None
        else:
            c1, m1, col, opt = condvec
            c1 = torch.from_numpy(c1).to(self._device)
            m1 = torch.from_numpy(m1).to(self._device)
            fakez = torch.cat([fakez, c1], dim=1)

        fake = self._generator(fakez)
        fakeact = self._apply_activate(fake)

        if c1 is not None:
            y_fake = self._discriminator(torch.cat([fakeact, c1], dim=1))
        else:
            y_fake = self._discriminator(fakeact)

        if condvec is None:
            cross_entropy = 0
        else:
            cross_entropy = self._cond_loss(fake, c1, m1)

        loss_g = -torch.mean(y_fake) + cross_entropy

        return loss_g

    def sample(self, n, condition_column=None, condition_value=None):
        """Sample data similar to the training data.

        Choosing a condition_column and condition_value will increase the probability of the
        discrete condition_value happening in the condition_column.
        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to increase the
                probability of happening.
        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self._transformer.convert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info(
                condition_info, self._batch_size)
        else:
            global_condition_vec = None

        steps = n // self._batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self._batch_size, self._embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self._device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self._data_sampler.sample_original_condvec(
                    self._batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self._device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self._generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self._transformer.inverse_transform(data)

    def evaluate(self, synthetic_data, real_data, epoch):
        categorical_cols = real_data.select_dtypes(
            "object").columns.values.tolist()

        correlation_score = self.compute_correlation_score(
            synthetic_data, real_data, categorical_cols)
        self._external_eval["correlation_scores"].append(correlation_score)

        ml_efficacy_score = self.compute_ml_efficacy(
            real_data, synthetic_data, self._external_eval["target"])

        detection_score = self.compute_detection(real_data, synthetic_data)
        self._external_eval["detection_scores"].append(detection_score)

        overall_score = (detection_score * 0.5 + ml_efficacy_score * 0.5)

        if self._external_eval["best_score"] < overall_score:
            print("New max score!")
            print(str(overall_score))
            self._external_eval["best_model_epoch"] = epoch
            self._external_eval["best_score"] = overall_score
            self._external_eval["best_generator"] = copy.deepcopy(
                self._generator)

    def get_problem_type(self, data, target):
        target_column = data[target]

        if target_column.dtypes == "object":
            unique_labels = np.unique(target_column)

            if len(unique_labels) == 2:
                problem_type = "binary_classification"

            else:
                problem_type = "multi_classification"
        else:
            raise AttributeError("Regression ml efficacy not yet implemented")

        return problem_type

    def compute_detection(self, real_data, synthetic_data, verbose=True):
        real_data = real_data.dropna()
        synthetic_data = synthetic_data.dropna()
        detection_score = LogisticDetection.compute(real_data, synthetic_data)

        if verbose:
            print("Detection score: " + str(detection_score))

        return detection_score

    def compute_ml_efficacy(self,
                            real_data,
                            synthetic_data,
                            target,
                            verbose=True):
        dtypes = real_data.dtypes.tolist()
        problem_type = self.get_problem_type(real_data, target)
        scores = []

        if problem_type == "binary_classification":
            (tree_score,
             _), _ = BinaryDecisionTreeClassifier.compute(real_data,
                                                          synthetic_data,
                                                          dtypes=dtypes.copy(),
                                                          target=target)
            (adaboost_score,
             _), _ = BinaryAdaBoostClassifier.compute(real_data,
                                                      synthetic_data,
                                                      dtypes=dtypes.copy(),
                                                      target=target)
            (regression_score,
             _), _ = BinaryLogisticRegression.compute(real_data,
                                                      synthetic_data,
                                                      dtypes=dtypes.copy(),
                                                      target=target)
            (mlp_score,
             _), _ = BinaryMLPClassifier.compute(real_data,
                                                 synthetic_data,
                                                 dtypes=dtypes.copy(),
                                                 target=target)

            scores.extend(
                [tree_score, adaboost_score, regression_score, mlp_score])
            ml_efficacy_score = sum(scores) / len(scores)

            self._external_eval["ml_efficacy_scores"]["tree"].append(
                tree_score)
            self._external_eval["ml_efficacy_scores"]["adaboost"].append(
                adaboost_score)
            self._external_eval["ml_efficacy_scores"]["regression"].append(
                regression_score)
            self._external_eval["ml_efficacy_scores"]["mlp"].append(mlp_score)
            self._external_eval["ml_efficacy_scores"]["ml_efficacy"].append(
                ml_efficacy_score)

            if verbose:
                print("Tree score: " + str(tree_score))
                print("Adaboost score: " + str(adaboost_score))
                print("Regression score: " + str(regression_score))
                print("Mlp score: " + str(mlp_score))
                print("ML efficay score: " + str(ml_efficacy_score))

        elif problem_type == "multi_classification":
            tree_score, _ = MulticlassDecisionTreeClassifier.compute(
                real_data, synthetic_data, dtypes=dtypes.copy(), target=target)
            mlp_score, _ = MulticlassMLPClassifier.compute(
                real_data, synthetic_data, dtypes=dtypes.copy(), target=target)

            scores.extend([tree_score, mlp_score])
            ml_efficacy_score = sum(scores) / len(scores)

            self._external_eval["ml_efficacy_scores"]["tree"].append(
                tree_score)
            self._external_eval["ml_efficacy_scores"]["mlp"].append(mlp_score)
            self._external_eval["ml_efficacy_scores"]["ml_efficacy"].append(
                ml_efficacy_score)

            if verbose:
                print("Tree score: " + str(tree_score))
                print("Mlp score: " + str(mlp_score))
                print("ML efficay score: " + str(ml_efficacy_score))

        return ml_efficacy_score

    def compute_correlation_score(self,
                                  synthetic_data,
                                  real_data,
                                  categorical_cols,
                                  verbose=True):
        table_eval = TableEvaluator(real_data,
                                    synthetic_data,
                                    cat_cols=categorical_cols,
                                    verbose=False)
        correlation_score = table_eval.correlation_distance(how='rmse')

        if verbose:
            print("Rmse correlation: " + str(correlation_score))

        return correlation_score

    def get_metadata(self):
        meta_data = {}
        data_info = {}
        categorical_cols = []
        dtypes = []
        dtypes_mapping = {}
        for index, column in enumerate(
                self._transformer._column_transform_info_list):
            name = column[0]
            data_type = column[1]
            if data_type == "discrete":
                data_info[name] = {"type": "categorical"}
                categorical_cols.append(name)
                dtypes.append("object")
                dtypes_mapping[name] = "object"
            else:
                if self._transformer._column_raw_dtypes._selected_obj[
                        index] == "int64":
                    data_info[name] = {
                        "type": "numerical",
                        "subtype": "integer"
                    }
                    dtypes.append(int)
                    dtypes_mapping[name] = int
                else:
                    data_info[name] = {"type": "numerical", "subtype": "float"}
                    dtypes.append(np.float64)
                    dtypes_mapping[name] = np.float64
        meta_data["tables"] = {None: {"fields": data_info}}

        return meta_data, categorical_cols, dtypes, dtypes_mapping

    def set_device(self, device):
        self._device = device
        if hasattr(self, '_generator'):
            self._generator.to(self._device)
        if hasattr(self, '_discriminator'):
            self._discriminator.to(self._device)
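A minimal usage sketch for the synthesizer above (not part of the original listing): it assumes the standard CTGAN fit(train_data, discrete_columns) interface and a pandas DataFrame; the file name, column names, and category value ('adult.csv', 'workclass', 'Private') are placeholders.

import pandas as pd

# Hypothetical training data containing some discrete columns.
df = pd.read_csv('adult.csv')
discrete_columns = ['workclass', 'education', 'income']

ctgan = CTGANSynthesizer(epochs=300)
ctgan.fit(df, discrete_columns)

# Unconditional sampling, then sampling biased toward one category.
unconditional = ctgan.sample(1000)
conditioned = ctgan.sample(1000,
                           condition_column='workclass',
                           condition_value='Private')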
Code Example #5
File: tvae.py Project: sdv-dev/CTGAN
class TVAESynthesizer(BaseSynthesizer):
    """TVAESynthesizer."""
    def __init__(self,
                 embedding_dim=128,
                 compress_dims=(128, 128),
                 decompress_dims=(128, 128),
                 l2scale=1e-5,
                 batch_size=500,
                 epochs=300,
                 loss_factor=2,
                 cuda=True):

        self.embedding_dim = embedding_dim
        self.compress_dims = compress_dims
        self.decompress_dims = decompress_dims

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.loss_factor = loss_factor
        self.epochs = epochs

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'

        self._device = torch.device(device)

    @random_state
    def fit(self, train_data, discrete_columns=()):
        """Fit the TVAE Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)
        dataset = TensorDataset(
            torch.from_numpy(train_data.astype('float32')).to(self._device))
        loader = DataLoader(dataset,
                            batch_size=self.batch_size,
                            shuffle=True,
                            drop_last=False)

        data_dim = self.transformer.output_dimensions
        encoder = Encoder(data_dim, self.compress_dims,
                          self.embedding_dim).to(self._device)
        self.decoder = Decoder(self.embedding_dim, self.decompress_dims,
                               data_dim).to(self._device)
        optimizerAE = Adam(list(encoder.parameters()) +
                           list(self.decoder.parameters()),
                           weight_decay=self.l2scale)

        for i in range(self.epochs):
            for id_, data in enumerate(loader):
                optimizerAE.zero_grad()
                real = data[0].to(self._device)
                mu, std, logvar = encoder(real)
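                # Reparameterization trick: z = mu + sigma * eps keeps sampling differentiable.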
                eps = torch.randn_like(std)
                emb = eps * std + mu
                rec, sigmas = self.decoder(emb)
                loss_1, loss_2 = _loss_function(
                    rec, real, sigmas, mu, logvar,
                    self.transformer.output_info_list, self.loss_factor)
                loss = loss_1 + loss_2
                loss.backward()
                optimizerAE.step()
                self.decoder.sigma.data.clamp_(0.01, 1.0)

    @random_state
    def sample(self, samples):
        """Sample data similar to the training data.

        Args:
            samples (int):
                Number of rows to sample.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        self.decoder.eval()

        steps = samples // self.batch_size + 1
        data = []
        for _ in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            noise = torch.normal(mean=mean, std=std).to(self._device)
            fake, sigmas = self.decoder(noise)
            fake = torch.tanh(fake)
            data.append(fake.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:samples]
        return self.transformer.inverse_transform(
            data,
            sigmas.detach().cpu().numpy())

    def set_device(self, device):
        """Set the `device` to be used ('GPU' or 'CPU)."""
        self._device = device
        self.decoder.to(self._device)
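A minimal usage sketch for TVAESynthesizer, following the fit/sample docstrings above; the file and column names are placeholders.

import pandas as pd

df = pd.read_csv('census.csv')  # hypothetical input file
tvae = TVAESynthesizer(epochs=300, cuda=False)
tvae.fit(df, discrete_columns=['sex', 'education'])

synthetic = tvae.sample(500)    # rows are inverse-transformed by the DataTransformer
tvae.set_device('cpu')          # move the decoder if needed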
Code Example #6
File: patectgan.py Project: opendp/smartnoise-sdk
class PATECTGAN(CTGANSynthesizer):
    def __init__(self,
                 embedding_dim=128,
                 generator_dim=(256, 256),
                 discriminator_dim=(256, 256),
                 generator_lr=2e-4,
                 generator_decay=1e-6,
                 discriminator_lr=2e-4,
                 discriminator_decay=1e-6,
                 batch_size=500,
                 discriminator_steps=1,
                 log_frequency=False,
                 verbose=False,
                 epochs=300,
                 pac=1,
                 cuda=True,
                 epsilon=1,
                 binary=False,
                 regularization=None,
                 loss="cross_entropy",
                 teacher_iters=5,
                 student_iters=5,
                 sample_per_teacher=1000,
                 delta=None,
                 noise_multiplier=1e-3,
                 moments_order=100,
                 category_epsilon_pct=0.1):

        assert batch_size % 2 == 0

        self._embedding_dim = embedding_dim
        self._generator_dim = generator_dim
        self._discriminator_dim = discriminator_dim

        self._generator_lr = generator_lr
        self._generator_decay = generator_decay
        self._discriminator_lr = discriminator_lr
        self._discriminator_decay = discriminator_decay

        self._batch_size = batch_size
        self._discriminator_steps = discriminator_steps
        self._log_frequency = log_frequency
        self._verbose = verbose
        self._epochs = epochs
        self.pac = pac
        self.epsilon = epsilon

        self._category_epsilon_pct = category_epsilon_pct

        self.verbose = verbose
        self.loss = loss

        # PATE params
        self.regularization = regularization if self.loss != "wasserstein" else "dragan"
        self.teacher_iters = teacher_iters
        self.student_iters = student_iters
        self.pd_cols = None
        self.pd_index = None
        self.binary = binary
        self.sample_per_teacher = sample_per_teacher
        self.noise_multiplier = noise_multiplier
        self.moments_order = moments_order
        self.delta = delta

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'

        self._device = torch.device(device)

        if self._log_frequency:
            warnings.warn(
                "log_frequency is selected.  This may result in oversampling frequent "
                "categories, which could cause privacy leaks.")

    def train(self,
              data,
              categorical_columns=None,
              ordinal_columns=None,
              update_epsilon=None):
        if update_epsilon:
            self.epsilon = update_epsilon

        for col in categorical_columns:
            if str(data[col].dtype).startswith('float'):
                raise ValueError(
                    "It looks like you are passing in a vector of continuous values"
                    f"to a categorical column at [{col}]."
                    "Please discretize and pass in categorical columns with"
                    "unsigned integer or string category names.")

        sample_per_teacher = (self.sample_per_teacher
                              if self.sample_per_teacher < len(data) else 1000)
        self.num_teachers = int(len(data) / sample_per_teacher) + 1

        self._transformer = DataTransformer()
        self._transformer.fit(data, discrete_columns=categorical_columns)
        for tinfo in self._transformer._column_transform_info_list:
            if tinfo.column_type == "continuous":
                raise ValueError(
                    "We don't support continuous values on this synthesizer.  Please discretize values."
                )

        train_data = self._transformer.transform(data)

        data_partitions = np.array_split(train_data, self.num_teachers)

        data_dim = self._transformer.output_dimensions

        sampler_eps = 0.0
        if categorical_columns and self._category_epsilon_pct:
            sampler_eps = self.epsilon * self._category_epsilon_pct
            per_col_sampler_eps = sampler_eps / len(categorical_columns)
            self.epsilon = self.epsilon - sampler_eps
        else:
            per_col_sampler_eps = None

        self.cond_generator = DataSampler(
            train_data,
            self._transformer.output_info_list,
            self._log_frequency,
            per_column_epsilon=per_col_sampler_eps)

        spent = self.cond_generator.total_spent
        if (spent > sampler_eps and not np.isclose(spent, sampler_eps)):
            raise AssertionError(
                f"The data sampler used {spent} epsilon and was budgeted for {sampler_eps}"
            )
        # create conditional generator for each teacher model

        # Note: Previously, there existed a ConditionalGenerator object in CTGAN
        # - that functionality has been subsumed by DataSampler, but switch is
        # essentially 1 for 1
        # don't need to count eps for each teacher, because these are disjoint partitions
        cached_probs = self.cond_generator.discrete_column_category_prob

        cond_generator = [
            DataSampler(d,
                        self._transformer.output_info_list,
                        self._log_frequency,
                        per_column_epsilon=None,
                        discrete_column_category_prob=cached_probs)
            for d in data_partitions
        ]

        self._generator = Generator(
            self._embedding_dim + self.cond_generator.dim_cond_vec(),
            self._generator_dim, data_dim).to(self._device)

        discriminator = Discriminator(
            data_dim + self.cond_generator.dim_cond_vec(),
            self._discriminator_dim, self.loss, self.pac).to(self._device)

        student_disc = discriminator
        student_disc.apply(weights_init)

        teacher_disc = [discriminator for i in range(self.num_teachers)]
        for i in range(self.num_teachers):
            teacher_disc[i].apply(weights_init)

        optimizerG = optim.Adam(self._generator.parameters(),
                                lr=self._generator_lr,
                                betas=(0.5, 0.9),
                                weight_decay=self._generator_decay)

        optimizer_s = optim.Adam(student_disc.parameters(),
                                 lr=2e-4,
                                 betas=(0.5, 0.9))
        optimizer_t = [
            optim.Adam(teacher_disc[i].parameters(),
                       lr=self._discriminator_lr,
                       betas=(0.5, 0.9),
                       weight_decay=self._discriminator_decay)
            for i in range(self.num_teachers)
        ]

        noise_multiplier = self.noise_multiplier
        alphas = torch.tensor([0.0 for i in range(self.moments_order)],
                              device=self._device)
        l_list = 1 + torch.tensor(range(self.moments_order),
                                  device=self._device)
        eps = torch.zeros(1)

        mean = torch.zeros(self._batch_size,
                           self._embedding_dim,
                           device=self._device)
        std = mean + 1

        real_label = 1
        fake_label = 0

        criterion = nn.BCELoss() if (self.loss
                                     == "cross_entropy") else self.w_loss

        if self.verbose:
            print("using loss {} and regularization {}".format(
                self.loss, self.regularization))

        iteration = 0

        if self.delta is None:
            self.delta = 1 / (train_data.shape[0] *
                              np.sqrt(train_data.shape[0]))

        while eps.item() < self.epsilon:
            iteration += 1

            eps = min((alphas - math.log(self.delta)) / l_list)
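            # Moments accountant conversion: eps = min over orders l of
            # (alpha_l + log(1/delta)) / l, using the accumulated log-moments alphas.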

            if eps.item() > self.epsilon:
                if iteration == 1:
                    raise ValueError(
                        "Inputted epsilon parameter is too small to" +
                        " create a private dataset. Try increasing epsilon and rerunning."
                    )
                break

            # train teacher discriminators
            for t_2 in range(self.teacher_iters):
                for i in range(self.num_teachers):
                    partition_data = data_partitions[i]
                    data_sampler = DataSampler(
                        partition_data,
                        self._transformer.output_info_list,
                        self._log_frequency,
                        per_column_epsilon=None,
                        discrete_column_category_prob=cached_probs)
                    fakez = torch.normal(mean, std=std).to(self._device)

                    condvec = cond_generator[i].sample_condvec(
                        self._batch_size)

                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = data_sampler.sample_data(
                            self._batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self._device)
                        m1 = torch.from_numpy(m1).to(self._device)
                        fakez = torch.cat([fakez, c1], dim=1)
                        perm = np.arange(self._batch_size)
                        np.random.shuffle(perm)
                        real = data_sampler.sample_data(
                            self._batch_size, col[perm], opt[perm])
                        c2 = c1[perm]

                    fake = self._generator(fakez)
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype("float32")).to(
                        self._device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fakeact  # use the activated output, as in the conditional branch

                    optimizer_t[i].zero_grad()

                    y_all = torch.cat(
                        [teacher_disc[i](fake_cat), teacher_disc[i](real_cat)])
                    label_fake = torch.full(
                        (int(self._batch_size / self.pac), 1),
                        fake_label,
                        dtype=torch.float,
                        device=self._device,
                    )
                    label_true = torch.full(
                        (int(self._batch_size / self.pac), 1),
                        real_label,
                        dtype=torch.float,
                        device=self._device,
                    )
                    labels = torch.cat([label_fake, label_true])

                    error_d = criterion(y_all.squeeze(), labels.squeeze())
                    error_d.backward()

                    if self.regularization == "dragan":
                        pen = teacher_disc[i].dragan_penalty(
                            real_cat, device=self._device)
                        pen.backward(retain_graph=True)

                    optimizer_t[i].step()
            ###
            # train student discriminator
            for t_3 in range(self.student_iters):
                data_sampler = DataSampler(
                    train_data,
                    self._transformer.output_info_list,
                    self._log_frequency,
                    per_column_epsilon=None,
                    discrete_column_category_prob=cached_probs)
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample_condvec(self._batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample_data(self._batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self._device)
                    m1 = torch.from_numpy(m1).to(self._device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self._batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample_data(self._batch_size,
                                                    col[perm], opt[perm])
                    c2 = c1[perm]

                fake = self._generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                else:
                    fake_cat = fakeact

                fake_data = fake_cat

                ###
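                # PATE step: the teacher discriminators vote on each fake sample and the
                # noise-perturbed majority vote supplies labels for the student discriminator.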
                predictions, votes = pate(fake_data,
                                          teacher_disc,
                                          noise_multiplier,
                                          device=self._device)

                output = student_disc(fake_data.detach())

                # update moments accountant
                alphas = alphas + moments_acc(self.num_teachers,
                                              votes,
                                              noise_multiplier,
                                              l_list,
                                              device=self._device)

                loss_s = criterion(
                    output.squeeze(),
                    predictions.float().to(self._device).squeeze())

                optimizer_s.zero_grad()
                loss_s.backward()

                if self.regularization == "dragan":
                    vals = torch.cat([predictions, fake_data], axis=1)
                    ordered = vals[vals[:, 0].sort()[1]]
                    data_list = torch.split(
                        ordered,
                        predictions.shape[0] - int(predictions.sum().item()))
                    synth_cat = torch.cat(data_list[1:], axis=0)[:, 1:]
                    pen = student_disc.dragan_penalty(synth_cat,
                                                      device=self._device)
                    pen.backward(retain_graph=True)

                optimizer_s.step()

                # print ('iterator {i}, student discriminator loss is {j}'.format(i=t_3, j=loss_s))

            # train generator
            fakez = torch.normal(mean=mean, std=std)
            condvec = self.cond_generator.sample_condvec(self._batch_size)

            if condvec is None:
                c1, m1, col, opt = None, None, None, None
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self._device)
                m1 = torch.from_numpy(m1).to(self._device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self._generator(fakez)
            fakeact = self._apply_activate(fake)

            if c1 is not None:
                y_fake = student_disc(torch.cat([fakeact, c1], dim=1))
            else:
                y_fake = student_disc(fakeact)

            if condvec is None:
                cross_entropy = 0
            else:
                cross_entropy = self._cond_loss(fake, c1, m1)

            if self.loss == "cross_entropy":
                label_g = torch.full(
                    (int(self._batch_size / self.pac), 1),
                    real_label,
                    dtype=torch.float,
                    device=self._device,
                )
                loss_g = criterion(y_fake.squeeze(), label_g.float().squeeze())
                loss_g = loss_g + cross_entropy
            else:
                loss_g = -torch.mean(y_fake) + cross_entropy

            optimizerG.zero_grad()
            loss_g.backward()
            optimizerG.step()

            if self.verbose:
                print("eps: {:f} \t G: {:f} \t D: {:f}".format(
                    eps,
                    loss_g.detach().cpu(),
                    loss_s.detach().cpu()))

    def w_loss(self, output, labels):
        vals = torch.cat([labels[None, :], output[None, :]], axis=1)
        ordered = vals[vals[:, 0].sort()[1]]
        data_list = torch.split(ordered,
                                labels.shape[0] - int(labels.sum().item()))
        fake_score = data_list[0][:, 1]
        true_score = torch.cat(data_list[1:], axis=0)[:, 1]
        w_loss = -(torch.mean(true_score) - torch.mean(fake_score))
        return w_loss

    def generate(self, n, condition_column=None, condition_value=None):
        """
        TODO: Add condition_column support from CTGAN
        """
        self._generator.eval()

        # output_info = self._transformer.output_info
        steps = n // self._batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self._batch_size, self._embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self._device)

            condvec = self.cond_generator.sample_original_condvec(
                self._batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self._device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self._generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self._transformer.inverse_transform(data)
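
A minimal usage sketch for PATECTGAN, assuming an already-discretized pandas DataFrame (train() rejects continuous values); the file and column names are placeholders.

import pandas as pd

df = pd.read_csv('survey_discrete.csv')   # hypothetical, discretized data
categorical_columns = list(df.columns)

synth = PATECTGAN(epsilon=1.0, batch_size=500, verbose=True)
synth.train(df, categorical_columns=categorical_columns)

private_rows = synth.generate(1000)       # the epsilon budget is spent during train()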