Ejemplo n.º 1
0
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.

    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Resiudal Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        l2scale (float):
            Wheight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
    """
    def __init__(self,
                 demand_col,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.demand_column = demand_col

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(
                    functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True

            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(data[:, st:ed],
                                               torch.argmax(c[:, st_c:ed_c],
                                                            dim=1),
                                               reduction='none')
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def plot_grad_flow(self, named_parameters, axis, plot_title):
        '''Plots the gradients flowing through different layers in the net during training.
        Can be used for checking for possible gradient vanishing / exploding problems.

        Usage: Plug this function in Trainer class after loss.backwards() as
        "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow'''
        ave_grads = []
        max_grads = []
        layers = []
        for n, p in named_parameters:
            if (p.requires_grad) and ("bias" not in n):
                layers.append(n)
                ave_grads.append(p.grad.abs().mean())
                max_grads.append(p.grad.abs().max())
        axis.bar(np.arange(len(max_grads)),
                 max_grads,
                 alpha=0.1,
                 lw=1,
                 color="c")
        axis.bar(np.arange(len(max_grads)),
                 ave_grads,
                 alpha=0.1,
                 lw=1,
                 color="b")
        axis.hlines(0, 0, len(ave_grads) + 1, lw=2, color="k")
        axis.set_xticklabels(layers, rotation="vertical")
        axis.set_xlim(left=0, right=len(ave_grads))
        axis.set_ylim(bottom=-0.001,
                      top=1.)  # zoom in on the lower gradient regions
        axis.set_xlabel("Layers")
        axis.set_ylabel("average gradient")
        axis.set_title(plot_title)
        plt.grid(True)
        axis.legend([
            Line2D([0], [0], color="c", lw=4),
            Line2D([0], [0], color="b", lw=4),
            Line2D([0], [0], color="k", lw=4)
        ], ['max-gradient', 'mean-gradient', 'zero-gradient'],
                    loc='upper left')

    def fit(self,
            train_data,
            eval_interval,
            eval,
            discrete_columns=tuple(),
            epochs=300,
            log_frequency=True):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in conditional
                sampling. Defaults to ``True``.
        """

        train = train_data
        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, log_frequency)

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
            data_dim).to(self.device)

        self.discriminator = Discriminator(
            data_dim + self.cond_generator.n_opt, self.dis_dim).to(self.device)

        optimizerG = optim.Adam(self.generator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9),
                                weight_decay=self.l2scale)
        optimizerD = optim.Adam(self.discriminator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        # figure for plotting gradients; ax1 for generator and ax2 for discriminator
        #fig, (ax1, ax2) = plt.subplots(1, 2)

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        for i in range(epochs):
            for id_ in range(steps_per_epoch):
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = self.discriminator(fake_cat)
                y_real = self.discriminator(real_cat)

                pen = self.discriminator.calc_gradient_penalty(
                    real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward()

                #plot gradients
                #self.plot_grad_flow(self.discriminator.named_parameters(), ax2, 'Gradient flow for D')

                optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = self.discriminator(torch.cat([fakeact, c1],
                                                          dim=1))
                else:
                    y_fake = self.discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -torch.mean(y_fake) + cross_entropy

                optimizerG.zero_grad()
                loss_g.backward()

                # plot gradients
                #self.plot_grad_flow(self.generator.named_parameters(), ax1, 'Gradient flow for G')

                optimizerG.step()

            print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
                  (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
                  flush=True)

            #check model results every x epochs
            #fig2, ax3 = plt.subplots(1)

            if eval:
                stats_real = train[self.demand_column].describe()
                stats_real_week = train.groupby('Weekday')[
                    self.demand_column].describe()
                stats_real_month = train.groupby('Month')[
                    self.demand_column].describe()

                if ((i + 1) % eval_interval == 0):
                    eval_sample = self.sample(1000)
                    sample = pd.DataFrame(eval_sample,
                                          columns=eval_sample.columns)
                    #sample.loc[:, self.demand_column].hist(bins=50, alpha=0.4, label='fake')
                    #ax3.hist(pd.DataFrame(train, columns=train.columns).loc[:, self.demand_column], bins=50, alpha=0.4, label='real')
                    #fig2.legend()
                    #fig2.show()

                    print(
                        (sample[self.demand_column].describe() - stats_real) /
                        stats_real)
                    print(' ')
                    print(((sample.groupby('Weekday')[
                        self.demand_column].describe() - stats_real_week) /
                           stats_real_week).T)
                    print(' ')
                    print(((sample.groupby('Month')[
                        self.demand_column].describe() - stats_real_month) /
                           stats_real_month).T)

        plt.show()

    def sample(self, n, condition=None, seed=0):
        """Sample data similar to the training data.

        Args:
            n (int):
                Number of rows to sample.
            condvec (Tuple):
                Number of column and value of condition to sample
            seed (int):
                Seed to create result reproducibility

        Returns:
            numpy.ndarray or pandas.DataFrame
        """

        self.generator.eval()
        steps = n // self.batch_size + 1
        data = []
        np.random.seed(seed)
        torch.manual_seed(seed)
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            if condition is None:
                #select random condition
                condvec = self.cond_generator.sample_zero(self.batch_size)

            else:

                condvec = self.cond_generator.sample_condition(
                    self.batch_size, condition, self.transformer)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        # sets model back to training mode
        self.generator.train()

        return self.transformer.inverse_transform(data, None)
Ejemplo n.º 2
0
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.

    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Resiudal Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        l2scale (float):
            Wheight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
    """
    def __init__(self,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500,
                 gen_lr=2e-4,
                 dis_lr=2e-4,
                 verbose=False):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.gen_lr = gen_lr
        self.dis_lr = dis_lr
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.verbose = verbose

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                weights = functional.gumbel_softmax(data[:, st:ed], tau=0.2)
                while np.isnan(torch.sum(weights).cpu().detach().numpy()):
                    weights = functional.gumbel_softmax(data[:, st:ed],
                                                        tau=0.2)
                data_t.append(weights)
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True

            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(data[:, st:ed],
                                               torch.argmax(c[:, st_c:ed_c],
                                                            dim=1),
                                               reduction='none')
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def fit(self,
            train_data,
            discrete_columns=tuple(),
            epochs=300,
            log_frequency=True):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in conditional
                sampling. Defaults to ``True``.
        """
        self.discrete_columns = discrete_columns
        # Fit OH encoder
        self.oe = OneHotEncoder(sparse=False)
        if discrete_columns:
            if type(train_data) is np.ndarray:
                self.oe.fit(train_data[:, discrete_columns])
                self.cat_column_idxes = discrete_columns
            else:
                self.oe.fit(train_data[discrete_columns])
                features = train_data.columns
                self.cat_column_idxes = [
                    features.index(c) for c in discrete_columns
                ]
        else:
            self.cat_column_idxes = []

        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, log_frequency)

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
            data_dim).to(self.device)

        discriminator = Discriminator(data_dim + self.cond_generator.n_opt,
                                      self.dis_dim).to(self.device)

        optimizerG = optim.Adam(self.generator.parameters(), lr=self.gen_lr)
        optimizerD = optim.Adam(
            discriminator.parameters(),
            lr=self.dis_lr,
        )

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        for i in tqdm(range(epochs)):
            for id_ in range(steps_per_epoch):
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = discriminator(fake_cat)
                y_real = discriminator(real_cat)

                pen = discriminator.calc_gradient_penalty(
                    real_cat, fake_cat, self.device)
                loss_d = -(nanmean(y_real) - nanmean(y_fake))

                optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward()
                optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(torch.cat([fakeact, c1], dim=1))

                else:
                    y_fake = discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -nanmean(y_fake) + cross_entropy

                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()

        if self.verbose:
            print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
                  (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
                  flush=True)

        self.discriminator = discriminator

    def sample(self, n, test_instance=None):
        """Sample data similar to the training data.

        Args:
            n (int):
                Number of rows to sample.
            test_instance (Optional[np.ndarray]):
                Test sample to condition sampler with

        Returns:
            numpy.ndarray or pandas.DataFrame
        """

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            if test_instance is not None and len(self.cat_column_idxes) > 0:
                if type(test_instance) is pd.DataFrame:
                    test_instance = test_instance.values
                test_cat_onehot = self.oe.transform(
                    test_instance[:, self.cat_column_idxes]).ravel()
                nonzeroes = np.nonzero(test_cat_onehot)[0]
                samples = np.random.choice(
                    nonzeroes, size=self.batch_size).astype(np.int8)
                condvec = np.zeros(
                    (self.batch_size, self.cond_generator.n_opt))
                condvec[list(range(self.batch_size)), samples] = 1
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).type(torch.FloatTensor).to(
                    self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self.transformer.inverse_transform(data, None)
Ejemplo n.º 3
0
class DPCTGAN(CTGANSynthesizer):
    """Differential Private Conditional Table GAN Synthesizer
    This code adds Differential Privacy to CTGANSynthesizer from https://github.com/sdv-dev/CTGAN
    """
    def __init__(self,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500,
                 epochs=300,
                 pack=1,
                 log_frequency=True,
                 disabled_dp=False,
                 target_delta=None,
                 sigma=5,
                 max_per_sample_grad_norm=1.0,
                 epsilon=1,
                 verbose=True,
                 loss='cross_entropy'):

        # CTGAN model specific parameters
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
        self.pack = pack
        self.log_frequency = log_frequency
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        # opacus parameters
        self.sigma = sigma
        self.disabled_dp = disabled_dp
        self.target_delta = target_delta
        self.max_per_sample_grad_norm = max_per_sample_grad_norm
        self.epsilon = epsilon
        self.epsilon_list = []
        self.alpha_list = []
        self.loss_d_list = []
        self.loss_g_list = []
        self.verbose = verbose
        self.loss = loss

        if self.loss != "cross_entropy":
            # Monkeypatches the _create_or_extend_grad_sample function when calling opacus
            opacus.supported_layers_grad_samplers._create_or_extend_grad_sample = _custom_create_or_extend_grad_sample

    def train(self,
              data,
              categorical_columns=None,
              ordinal_columns=None,
              update_epsilon=None):
        if update_epsilon:
            self.epsilon = update_epsilon

        self.transformer = DataTransformer()
        self.transformer.fit(data, discrete_columns=categorical_columns)
        train_data = self.transformer.transform(data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions
        self.cond_generator = ConditionalGenerator(
            train_data, self.transformer.output_info, self.log_frequency)

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
            data_dim).to(self.device)

        discriminator = Discriminator(data_dim + self.cond_generator.n_opt,
                                      self.dis_dim, self.loss,
                                      self.pack).to(self.device)

        optimizerG = optim.Adam(self.generator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9),
                                weight_decay=self.l2scale)
        optimizerD = optim.Adam(discriminator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9))

        privacy_engine = opacus.PrivacyEngine(
            discriminator,
            batch_size=self.batch_size,
            sample_size=train_data.shape[0],
            alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
            noise_multiplier=self.sigma,
            max_grad_norm=self.max_per_sample_grad_norm,
            clip_per_layer=True)

        if not self.disabled_dp:
            privacy_engine.attach(optimizerD)

        one = torch.tensor(1, dtype=torch.float).to(self.device)
        mone = one * -1

        REAL_LABEL = 1
        FAKE_LABEL = 0
        criterion = nn.BCELoss()

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        steps_per_epoch = len(train_data) // self.batch_size
        for i in range(self.epochs):
            for id_ in range(steps_per_epoch):
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                optimizerD.zero_grad()

                if self.loss == 'cross_entropy':
                    y_fake = discriminator(fake_cat)

                    #   print ('y_fake is {}'.format(y_fake))
                    label_fake = torch.full(
                        (int(self.batch_size / self.pack), ),
                        FAKE_LABEL,
                        dtype=torch.float,
                        device=self.device)

                    #    print ('label_fake is {}'.format(label_fake))

                    errD_fake = criterion(y_fake, label_fake)
                    errD_fake.backward()
                    optimizerD.step()

                    # train with real
                    label_true = torch.full(
                        (int(self.batch_size / self.pack), ),
                        REAL_LABEL,
                        dtype=torch.float,
                        device=self.device)
                    y_real = discriminator(real_cat)
                    errD_real = criterion(y_real, label_true)
                    errD_real.backward()
                    optimizerD.step()

                    loss_d = errD_real + errD_fake

                else:

                    y_fake = discriminator(fake_cat)
                    mean_fake = torch.mean(y_fake)
                    mean_fake.backward(one)

                    y_real = discriminator(real_cat)
                    mean_real = torch.mean(y_real)
                    mean_real.backward(mone)

                    optimizerD.step()

                    loss_d = -(mean_real - mean_fake)

                max_grad_norm = []
                for p in discriminator.parameters():
                    param_norm = p.grad.data.norm(2).item()
                    max_grad_norm.append(param_norm)
                #pen = calc_gradient_penalty(discriminator, real_cat, fake_cat, self.device)

                #pen.backward(retain_graph=True)
                #loss_d.backward()
                #optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake = discriminator(fakeact)

                #if condvec is None:
                cross_entropy = 0
                #else:
                #    cross_entropy = self._cond_loss(fake, c1, m1)

                if self.loss == 'cross_entropy':
                    label_g = torch.full((int(self.batch_size / self.pack), ),
                                         REAL_LABEL,
                                         dtype=torch.float,
                                         device=self.device)
                    #label_g = torch.full(int(self.batch_size/self.pack,),1,device=self.device)
                    loss_g = criterion(y_fake, label_g)
                    loss_g = loss_g + cross_entropy
                else:
                    loss_g = -torch.mean(y_fake) + cross_entropy

                optimizerG.zero_grad()
                loss_g.backward()
                optimizerG.step()

                if not self.disabled_dp:
                    #if self.loss == 'cross_entropy':
                    #    autograd_grad_sample.clear_backprops(discriminator)
                    #else:
                    for p in discriminator.parameters():
                        if hasattr(p, "grad_sample"):
                            del p.grad_sample

                    if self.target_delta is None:
                        self.target_delta = 1 / train_data.shape[0]

                    epsilon, best_alpha = optimizerD.privacy_engine.get_privacy_spent(
                        self.target_delta)

                    self.epsilon_list.append(epsilon)
                    self.alpha_list.append(best_alpha)
                    #if self.verbose:

            if not self.disabled_dp:
                if self.epsilon < epsilon:
                    break
            self.loss_d_list.append(loss_d)
            self.loss_g_list.append(loss_g)
            if self.verbose:
                print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
                      (i + 1, loss_g.detach().cpu(), loss_d.detach().cpu()),
                      flush=True)
                print('epsilon is {e}, alpha is {a}'.format(e=epsilon,
                                                            a=best_alpha))

        return self.loss_d_list, self.loss_g_list, self.epsilon_list, self.alpha_list

    def generate(self, n):
        self.generator.eval()

        #output_info = self.transformer.output_info
        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]
        return self.transformer.inverse_transform(data, None)
Ejemplo n.º 4
0
class PATECTGAN(CTGANSynthesizer):
    def __init__(self,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 epochs=300,
                 pack=1,
                 log_frequency=True,
                 disabled_dp=False,
                 target_delta=None,
                 sigma=5,
                 max_per_sample_grad_norm=1.0,
                 verbose=True,
                 loss='cross_entropy',
                 binary=False,
                 batch_size=500,
                 teacher_iters=5,
                 student_iters=5,
                 epsilon=1.0,
                 delta=1e-5):

        # CTGAN model specifi3c parameters
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
        self.pack = pack
        self.log_frequency = log_frequency
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.verbose = verbose
        self.loss = loss

        self.binary = binary
        self.batch_size = batch_size
        self.teacher_iters = teacher_iters
        self.student_iters = student_iters
        self.epsilon = epsilon
        self.delta = delta
        self.pd_cols = None
        self.pd_index = None

    def train(self,
              data,
              categorical_columns=None,
              ordinal_columns=None,
              update_epsilon=None):
        if update_epsilon:
            self.epsilon = update_epsilon

        # this is to make sure data has at least 1000 points, may need it become flexible
        self.num_teachers = int(len(data) / 5000) + 1
        self.transformer = DataTransformer()
        self.transformer.fit(data, discrete_columns=categorical_columns)
        data = self.transformer.transform(data)
        data_partitions = np.array_split(data, self.num_teachers)

        data_dim = self.transformer.output_dimensions

        self.cond_generator = ConditionalGenerator(
            data, self.transformer.output_info, self.log_frequency)

        # create conditional generator for each teacher model
        cond_generator = [
            ConditionalGenerator(d, self.transformer.output_info,
                                 self.log_frequency) for d in data_partitions
        ]

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
            data_dim).to(self.device)

        discriminator = Discriminator(data_dim + self.cond_generator.n_opt,
                                      self.dis_dim, self.loss,
                                      self.pack).to(self.device)

        student_disc = discriminator
        student_disc.apply(weights_init)

        teacher_disc = [discriminator for i in range(self.num_teachers)]
        for i in range(self.num_teachers):
            teacher_disc[i].apply(weights_init)

        optimizerG = optim.Adam(self.generator.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9),
                                weight_decay=self.l2scale)
        optimizerS = optim.Adam(student_disc.parameters(),
                                lr=2e-4,
                                betas=(0.5, 0.9))
        optimizerT = [
            optim.Adam(teacher_disc[i].parameters(), lr=2e-4, betas=(0.5, 0.9))
            for i in range(self.num_teachers)
        ]

        criterion = nn.BCELoss()

        noise_multiplier = 1e-3
        alphas = torch.tensor([0.0 for i in range(100)])
        l_list = 1 + torch.tensor(range(100))
        eps = 0

        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        REAL_LABEL = 1
        FAKE_LABEL = 0
        criterion = nn.BCELoss()

        while eps < self.epsilon:
            # train teacher discriminators
            for t_2 in range(self.teacher_iters):
                for i in range(self.num_teachers):

                    # print ('i is {}'.format(i))

                    partition_data = data_partitions[i]
                    data_sampler = Sampler(partition_data,
                                           self.transformer.output_info)
                    fakez = torch.normal(mean, std=std)

                    condvec = cond_generator[i].sample(self.batch_size)

                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = data_sampler.sample(self.batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self.device)
                        m1 = torch.from_numpy(m1).to(self.device)
                        fakez = torch.cat([fakez, c1], dim=1)
                        perm = np.arange(self.batch_size)
                        np.random.shuffle(perm)
                        real = data_sampler.sample(self.batch_size, col[perm],
                                                   opt[perm])
                        c2 = c1[perm]

                    fake = self.generator(fakez)
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype('float32')).to(
                        self.device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fake

                    optimizerT[i].zero_grad()

                    if self.loss == 'cross_entropy':
                        y_fake = teacher_disc[i](fake_cat)

                        label_fake = torch.full(
                            (int(self.batch_size / self.pack), ),
                            FAKE_LABEL,
                            dtype=torch.float,
                            device=self.device)
                        errD_fake = criterion(y_fake, label_fake.float())
                        errD_fake.backward()
                        optimizerT[i].step()

                        # train with real
                        label_true = torch.full(
                            (int(self.batch_size / self.pack), ),
                            REAL_LABEL,
                            dtype=torch.float,
                            device=self.device)
                        y_real = teacher_disc[i](real_cat)
                        errD_real = criterion(y_real, label_true.float())
                        errD_real.backward()
                        optimizerT[i].step()

                        loss_d = errD_real + errD_fake

                    # print("Iterator is {i},  Loss D for teacher {n} is :{j}".format(i=t_2 + 1, n=i+1,  j=loss_d.detach().cpu()))

            # train student discriminator
            for t_3 in range(self.student_iters):
                data_sampler = Sampler(data, self.transformer.output_info)
                fakez = torch.normal(mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                else:
                    fake_cat = fake

                fake_data = fake_cat
                predictions, votes = pate(fake_data, teacher_disc,
                                          noise_multiplier)

                output = student_disc(fake_data.detach())

                # update moments accountant
                alphas = alphas + moments_acc(self.num_teachers, votes,
                                              noise_multiplier, l_list)

                loss_s = criterion(output, predictions.float().to(self.device))

                optimizerS.zero_grad()
                loss_s.backward()
                optimizerS.step()

                # print ('iterator {i}, student discriminator loss is {j}'.format(i=t_3, j=loss_s))

            # train generator
            fakez = torch.normal(mean=mean, std=std)
            condvec = self.cond_generator.sample(self.batch_size)

            if condvec is None:
                c1, m1, col, opt = None, None, None, None
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                m1 = torch.from_numpy(m1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)

            if c1 is not None:
                y_fake = student_disc(torch.cat([fakeact, c1], dim=1))
            else:
                y_fake = student_disc(fakeact)

        # if condvec is None:
            cross_entropy = 0
            #else:
            #    cross_entropy = self._cond_loss(fake, c1, m1)

            if self.loss == 'cross_entropy':
                label_g = torch.full((int(self.batch_size / self.pack), ),
                                     REAL_LABEL,
                                     dtype=torch.float,
                                     device=self.device)
                #label_g = torch.full(int(self.batch_size/self.pack,),1,device=self.device)
                loss_g = criterion(y_fake, label_g.float())
                loss_g = loss_g + cross_entropy
            else:
                loss_g = -torch.mean(y_fake) + cross_entropy

            optimizerG.zero_grad()
            loss_g.backward()
            optimizerG.step()

            print('generator is {}'.format(loss_g.detach().cpu()))

            eps = min((alphas - math.log(self.delta)) / l_list)

    def generate(self, n):
        self.generator.eval()

        steps = n // self.batch_size + 1

        data = []

        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        generated_data = self.transformer.inverse_transform(data, None)

        return generated_data
Ejemplo n.º 5
0
class CTGAN(object):
    """Conditional Table GAN Synthesizer.
    This is the core class of the CTGAN project, where the different components
    are orchestrated together.
    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.
    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        l2scale (float):
            Wheight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
    """
    def __init__(self,
                 embedding_dim=128,
                 gen_dim=(256, 256),
                 dis_dim=(256, 256),
                 l2scale=1e-6,
                 batch_size=500,
                 epochs=100):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.trained_epoches = 0
        self.epochs = epochs

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(
                    functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True

            elif item[1] == 'softmax':
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(data[:, st:ed],
                                               torch.argmax(c[:, st_c:ed_c],
                                                            dim=1),
                                               reduction='none')
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def train(self,
              train_data,
              categorical_columns=tuple(),
              epochs=-1,
              log_frequency=True,
              ordinal_columns=None,
              update_epsilon=None,
              verbose=False,
              mlflow=False):
        """Fit the CTGAN Synthesizer models to the training data.
        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to init param.
            log_frequency (boolean):
                Whether to use log frequency of categorical levels in conditional
                sampling. Defaults to ``True``.
        """

        if (epochs == -1):
            epochs = self.epochs
        if not hasattr(self, "transformer"):
            self.transformer = DataTransformer()
            self.transformer.fit(train_data, categorical_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions

        if not hasattr(self, "cond_generator"):
            self.cond_generator = ConditionalGenerator(
                train_data, self.transformer.output_info, log_frequency)

        if not hasattr(self, "generator"):
            self.generator = Generator(
                self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
                data_dim).to(self.device)

        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(
                data_dim + self.cond_generator.n_opt,
                self.dis_dim).to(self.device)

        if not hasattr(self, "optimizerG"):
            self.optimizerG = optim.Adam(self.generator.parameters(),
                                         lr=2e-4,
                                         betas=(0.5, 0.9),
                                         weight_decay=self.l2scale)

        if not hasattr(self, "optimizerD"):
            self.optimizerD = optim.Adam(self.discriminator.parameters(),
                                         lr=2e-4,
                                         betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        for i in range(epochs):
            self.trained_epoches += 1
            for id_ in range(steps_per_epoch):
                fakez = torch.normal(mean=mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)
                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    fake_cat = fake

                y_fake = self.discriminator(fake_cat)
                y_real = self.discriminator(real_cat)

                pen = self.discriminator.calc_gradient_penalty(
                    real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                self.optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward()
                self.optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = self.discriminator(torch.cat([fakeact, c1],
                                                          dim=1))
                else:
                    y_fake = self.discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                loss_g = -torch.mean(y_fake) + cross_entropy

                self.optimizerG.zero_grad()
                loss_g.backward()
                self.optimizerG.step()

            if (verbose):
                if (i % 50 == 0):
                    print("Epoch %d, Loss G: %.4f, Loss D: %.4f" %
                          (self.trained_epoches - 1, loss_g.detach().cpu(),
                           loss_d.detach().cpu()),
                          flush=True)

            if (mlflow):
                import mlflow
                mlflow.log_metric("loss_d",
                                  float(loss_d.detach().cpu()),
                                  step=self.trained_epoches - 1)
                mlflow.log_metric("loss_g",
                                  float(loss_g.detach().cpu()),
                                  step=self.trained_epoches - 1)

    def generate(self, n, condition_column=None, condition_value=None):
        """Sample data similar to the training data.
        Args:
            n (int):
                Number of rows to sample.
        Returns:
            numpy.ndarray or pandas.DataFrame
        """

        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = self.cond_generator.generate_cond_from_condition_column_info(
                condition_info, self.batch_size)
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self.transformer.inverse_transform(data, None)

    def save(self, path):
        assert hasattr(self, "generator")
        assert hasattr(self, "discriminator")
        assert hasattr(self, "transformer")

        # always save a cpu model.
        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)

        torch.save(self, path)

        self.device = device_bak
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    @classmethod
    def load(cls, path):
        model = torch.load(path)
        model.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.discriminator.to(model.device)
        return model
Ejemplo n.º 6
0
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.
    This is the core class of the CTGAN project, where the different components
    are orchestrated together.
    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.
    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        l2scale (float):
            Weight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper
            default is 5. Default used is 1 to match original CTGAN implementation.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        blackbox_model:
            Model that implements fit, predict, predict_proba
    """
    def __init__(
            self,
            embedding_dim=128,
            gen_dim=(256, 256),
            dis_dim=(256, 256),
            l2scale=1e-6,
            batch_size=500,
            discriminator_steps=1,
            log_frequency=True,
            blackbox_model=None,
            preprocessing_pipeline=None,
            bb_loss="logloss",
            confidence_levels=[],  #default value if no confidence given
    ):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.log_frequency = log_frequency
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.trained_epoches = 0
        self.discriminator_steps = discriminator_steps
        self.blackbox_model = blackbox_model
        self.preprocessing_pipeline = preprocessing_pipeline
        self.confidence_levels = confidence_levels  #set here not in fit
        self.bb_loss = bb_loss
        #print("self.confidence_levels")

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Deals with the instability of the gumbel_softmax for older versions of torch.
        For more details about the issue:
        https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing
        Args:
            logits:
                […, num_features] unnormalized log probabilities
            tau:
                non-negative scalar temperature
            hard:
                if True, the returned samples will be discretized as one-hot vectors,
                but will be differentiated as if it is the soft sample in autograd
            dim (int):
                a dimension along which softmax will be computed. Default: -1.
        Returns:
            Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
        """

        if version.parse(torch.__version__) < version.parse("1.2.0"):
            for i in range(10):
                transformed = functional.gumbel_softmax(logits,
                                                        tau=tau,
                                                        hard=hard,
                                                        eps=eps,
                                                        dim=dim)
                if not torch.isnan(transformed).any():
                    return transformed
            raise ValueError("gumbel_softmax returning NaN.")

        return functional.gumbel_softmax(logits,
                                         tau=tau,
                                         hard=hard,
                                         eps=eps,
                                         dim=dim)

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == "tanh":
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == "softmax":
                ed = st + item[0]
                transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                data_t.append(transformed)
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == "tanh":
                st += item[0]
                skip = True

            elif item[1] == "softmax":
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(
                    data[:, st:ed],
                    torch.argmax(c[:, st_c:ed_c], dim=1),
                    reduction="none",
                )
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def fit(
            self,
            train_data,
            discrete_columns=tuple(),
            epochs=300,
            verbose=True,
            gen_lr=2e-4,
    ):
        """Fit the CTGAN Synthesizer models to the training data.
        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
        """
        """
        self.confidence_level = confidence_level
        loss_other_name = "loss_bb" if confidence_level != -1 else "loss_d"
        history = {"loss_g": [], loss_other_name: []}
        """
        # Eli: add Mode-specific Normalization
        if not hasattr(self, "transformer"):
            self.transformer = DataTransformer()
            self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions

        if not hasattr(self, "cond_generator"):
            self.cond_generator = ConditionalGenerator(
                train_data, self.transformer.output_info, self.log_frequency)

        if not hasattr(self, "generator"):
            self.generator = Generator(
                self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
                data_dim).to(self.device)
        #print(data_dim)
        #print(self.cond_generator.n_opt)

        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(
                data_dim + self.cond_generator.n_opt,
                self.dis_dim).to(self.device)
        """
        #after sample in fit gen_output is 120 not 80(cause gen allmost twice)
        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(
                24 + self.cond_generator.n_opt, self.dis_dim
            ).to(self.device)
        """
        if not hasattr(self, "optimizerG"):
            self.optimizerG = optim.Adam(
                self.generator.parameters(),
                lr=gen_lr,
                betas=(0.5, 0.9),
                weight_decay=self.l2scale,
            )

        if not hasattr(self, "optimizerD"):
            self.optimizerD = optim.Adam(self.discriminator.parameters(),
                                         lr=2e-4,
                                         betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0

        # init mean to zero and std to one
        #keep one spot to confidence level, which will be add after normal dis will
        mean = torch.zeros(((self.batch_size * self.embedding_dim) - 1),
                           device=self.device)
        std = mean + 1

        # steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        steps_per_epoch = 10  # magic number decided with Gilad. feel free to change it

        loss_other_name = "loss_bb" if self.confidence_levels != [] else "loss_d"
        allhist = {"confidence_levels_history": []}
        # Eli: start training loop
        for current_conf_level in self.confidence_levels:
            #need to change, if non confidence give, aka []) no loop will run
            #so if no conf, need to jump over conf loop
            history = {
                "confidence_level": current_conf_level,
                "loss_g": [],
                loss_other_name: []
            }
            for i in range(epochs):
                self.trained_epoches += 1
                for id_ in range(steps_per_epoch):
                    #we examine confidence lelvel so no need parts which they does not show up
                    """
                    if self.confidence_levels == []:
                        # discriminator loop
                        for n in range(self.discriminator_steps):
                            fakez = torch.normal(mean=mean, std=std)
                            condvec = self.cond_generator.sample(self.batch_size)
                            if condvec is None:
                                c1, m1, col, opt = None, None, None, None
                                real = data_sampler.sample(self.batch_size, col, opt)
                            else:
                                c1, m1, col, opt = condvec
                                c1 = torch.from_numpy(c1).to(self.device)
                                m1 = torch.from_numpy(m1).to(self.device)
                                fakez = torch.cat([fakez, c1], dim=1)
                                perm = np.arange(self.batch_size)
                                np.random.shuffle(perm)
                                real = data_sampler.sample(
                                    self.batch_size, col[perm], opt[perm]
                                )
                                c2 = c1[perm]
                            fake = self.generator(fakez)
                            fakeact = self._apply_activate(fake)
                            real = torch.from_numpy(real.astype("float32")).to(self.device)
                            if c1 is not None:
                                fake_cat = torch.cat([fakeact, c1], dim=1)
                                real_cat = torch.cat([real, c2], dim=1)
                            else:
                                real_cat = real
                                fake_cat = fake
                            y_fake = self.discriminator(fake_cat)
                            y_real = self.discriminator(real_cat)
                            pen = self.discriminator.calc_gradient_penalty(
                                real_cat, fake_cat, self.device
                            )
                            loss_d = -(torch.mean(y_real) - torch.mean(y_fake))
                            if self.confidence_levels == []:  # without bb loss
                                self.optimizerD.zero_grad()
                                pen.backward(retain_graph=True)
                                loss_d.backward()
                                self.optimizerD.step()
                    """
                    # we examine confidence lelvel so no need parts which they does not show up
                    #as above(no confidence and uses discriminator, no need)

                    fakez = torch.normal(mean=mean, std=std)
                    #added here the part of adding conf to sample
                    #not sure why we need this part in the code but debug shows it this usses this lines
                    #ask gilad eli
                    confi = current_conf_level.astype(
                        np.float32)  # generator excpect float
                    conf = torch.tensor([confi]).to(
                        self.device
                    )  # change conf to conf input that will sent!!
                    fakez = torch.cat([fakez, conf], dim=0)
                    fakez = torch.reshape(
                        fakez, (self.batch_size, self.embedding_dim))
                    #end added here the part of adding conf to sample

                    condvec = self.cond_generator.sample(self.batch_size)

                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self.device)
                        m1 = torch.from_numpy(m1).to(self.device)
                        fakez = torch.cat([fakez, c1], dim=1)

                    fake = self.generator(fakez)
                    fakeact = self._apply_activate(fake)

                    if c1 is not None:
                        y_fake = self.discriminator(
                            torch.cat([fakeact, c1], dim=1))
                    else:
                        y_fake = self.discriminator(fakeact)

                    if condvec is None:
                        cross_entropy = 0

                    else:
                        cross_entropy = self._cond_loss(fake, c1, m1)

                    if self.confidence_levels != []:
                        # generate `batch_size` samples
                        #samples of fit using yBB in its input
                        #apply generator twice
                        gen_out, gen_fakeacts = self.sample(
                            self.batch_size, current_conf_level)
                        """
                        gen_out=fakeact.detach().cpu().numpy()
                        gen_out=self.transformer.inverse_transform(gen_out, None)
                        """

                        loss_bb = self._calc_bb_confidence_loss(
                            gen_out, current_conf_level
                        )  #send specific confidence to loss computation
                        #return conf bit vector input and bb_y_vec input bit

                        #send to discriminate to connect gradient again
                        y_fake = self.discriminator(gen_fakeacts)

                        #find mean like in original ctgan
                        #multiple by small number to get realy small value
                        y_fake_mean = torch.mean(y_fake) * 0.00000000000001
                        y_fake_mean = y_fake_mean.to(self.device)

                        #change to float because y_fake is float not double
                        loss_bb = loss_bb.float()

                        #add y_fake_mean to lossg like in origin ctgan
                        #plus loss_bb
                        loss_g = -y_fake_mean + loss_bb + cross_entropy
                        #loss_g = -y_fake_mean + cross_entropy

                    else:  # original loss
                        loss_g = -torch.mean(y_fake) + cross_entropy

                    self.optimizerG.zero_grad()
                    loss_g.backward()
                    """
                    print("gradients\n")
                    
                    for p in self.generator.parameters():
                        print(p.grad)
                    """
                    """
                    for name, param in self.generator.named_parameters():
                        #if param.requires_grad:
                        print(name)
                        print(param.grad)
                    """
                    self.optimizerG.step()

                loss_g_val = loss_g.detach().cpu()
                loss_other_val = locals()[loss_other_name].detach().cpu()
                history["loss_g"].append(loss_g.item())
                history[loss_other_name].append(loss_other_val.item())

                if verbose:
                    print(
                        f"Epoch {self.trained_epoches}, Loss G: {loss_g_val}, {loss_other_name}: {loss_other_val}",
                        flush=True,
                    )
            allhist["confidence_levels_history"].append(history)

        return allhist

    #special sample for fit/train part
    #extra bit added to input as y,apply gen twice
    #in second time with ybb after sent gen_out if first apply generator
    #first time with y zero bit second time with bb conf bit

    #not relevant function for now
    def fit_sample(self,
                   n,
                   confidence_level,
                   condition_column=None,
                   condition_value=None):
        """Sample data similar to the training data.
        Choosing a condition_column and condition_value will increase the probability of the
        discrete condition_value happening in the condition_column.
        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to increase the
                probability of happening.
        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = (
                self.cond_generator.generate_cond_from_condition_column_info(
                    condition_info, self.batch_size))
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        gen_input_layer = self.generator.seq[0].fc  #get linearinpputlayer

        #now first sample part need to be
        #with out not gradients updates
        #so for all first part we do:
        with torch.no_grad():

            for i in range(steps):
                #check if it is okay decrease noise by one for adding conf
                #add conf ass input to sample function, as number vector
                #give one vector place to adding y to gen noise
                mean = torch.zeros(self.batch_size, (self.embedding_dim - 2))
                std = mean + 1
                fakez = torch.normal(mean=mean, std=std).to(self.device)
                #fakez = torch.reshape(fakez,(-1,))
                confidence_level = confidence_level.astype(
                    np.float32)  #generator excpect float
                #create C column vector of confidence level C
                conf = torch.zeros(self.batch_size, 1) + confidence_level
                conf = conf.to(
                    self.device)  #change conf to conf input that will sent!!

                #first sample will geet zero as y column vector
                yzero = torch.zeros(self.batch_size, 1).to(self.device)

                #adding y bit to fakez gen noise,generator input
                fakez = torch.cat([fakez, conf, yzero], dim=1)
                fakez = torch.reshape(fakez,
                                      (self.batch_size, self.embedding_dim))

                if global_condition_vec is not None:
                    condvec = global_condition_vec.copy()
                else:
                    condvec = self.cond_generator.sample_zero(self.batch_size)

                if condvec is None:
                    pass
                else:
                    c1 = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                #conftens = torch.tensor([0.5]);
                #fakez = torch.cat([fakez, conftens], dim=1)#check to delelte!!

                #reset to zero weights of y_zero and his bias
                gen_input_layer.weight[-1] = 0
                #-1 is last line, where I considered yzero weights to be
                #which contains all weights of last bit of input
                gen_input_layer.bias[-1] = 0
                #-1 is last bias,where I considered yzero weights to be
                #in biases vector which contain the bias of last bit

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)
                data.append(fakeact.detach().cpu().numpy())

            data = np.concatenate(data, axis=0)
            data = data[:n]
            # instead of return data
            # we will use it here to give it to BB and get new conf y
            gen_out = self.transformer.inverse_transform(data, None)

            y_prob_for_y_zero = self.blackbox_model.predict_proba(gen_out)
            y_conf_gen_for_y_zero = y_prob_for_y_zero[:, 0].astype(np.float32)

        #now we use this y conf and put it as y bit instead 0 and sample normally

        #######
        #second part of samples with BB y conf bit
        #######

        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = (
                self.cond_generator.generate_cond_from_condition_column_info(
                    condition_info, self.batch_size))
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            #check if it is okay decrease noise by one for adding conf
            #add conf ass input to sample function, as vector cloumn
            #decrease by one more for y vec (-2 for y bit a conf level in input)
            mean = torch.zeros(self.batch_size, (self.embedding_dim - 2))
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)
            #fakez = torch.reshape(fakez,(-1,))
            confidence_level = confidence_level.astype(
                np.float32)  #generator excpect float
            #conf = torch.tensor([confidence_level],requires_grad=True).to(self.device)#change conf to conf input that will sent!!
            conf = torch.zeros(self.batch_size, 1) + confidence_level
            #conf is target in BBCE loss function
            #target ccant have grdient descent True for some reason
            #conf.requires_grad = True
            conf.to(self.device)

            #fix y bb bit
            #y_conf_gen_for_y_zero = y_conf_gen_for_y_zero.astype(np.float32)#generator
            #y_BB_bit=torch.tensor([y_conf_gen_for_y_zero], requires_grad=True).to(self.device)
            #y_BB_column = torch.zeros(self.batch_size,1) + y_conf_gen_for_y_zero
            y_BB_column = torch.tensor([y_conf_gen_for_y_zero],
                                       requires_grad=True).to(self.device)
            #y_BB_column.requires_grad = True
            #take Transpose because shape is 1*50 - need 50*1
            y_BB_column = y_BB_column.T
            y_BB_column.to(self.device)

            #put y_bb bit in fakez gen noise ,generator input
            fakez = torch.cat([fakez, conf, y_BB_column], dim=1)
            fakez = torch.reshape(fakez, (self.batch_size, self.embedding_dim))
            #fakez = fakez.astype(np.float32)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            #conftens = torch.tensor([0.5]);
            #fakez = torch.cat([fakez, conftens], dim=1)#check to delelte!!

            #turn on input gradient
            #fakez.requires_grad = True
            #turn on inputs gradient

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]
        # return data
        #return self.transformer.inverse_transform(data, None)
        gen_out = self.transformer.inverse_transform(data, None)
        #ret conf level vector and bb_y_vector
        #check if return the below written or above written
        return conf, y_BB_column

    #normal sample for creating in the expirements(with no train)
    def sample(self,
               n,
               confidence_level,
               condition_column=None,
               condition_value=None):
        """Sample data similar to the training data.
        Choosing a condition_column and condition_value will increase the probability of the
        discrete condition_value happening in the condition_column.
        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to increase the
                probability of happening.
        Returns:
            numpy.ndarray or pandas.DataFrame
        """

        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = (
                self.cond_generator.generate_cond_from_condition_column_info(
                    condition_info, self.batch_size))
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        our_fackeacts = []
        for i in range(steps):
            #check if it is okay decrease noise by one for adding conf
            #add conf ass input to sample function, as number vector column
            mean = torch.zeros(self.batch_size, (self.embedding_dim - 1))
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)
            #fakez = torch.reshape(fakez,(-1,))
            confidence_level = confidence_level.astype(
                np.float32)  #generator excpect float
            #create C column vector of confidence level C
            conf = torch.zeros(self.batch_size, 1) + confidence_level
            conf = conf.to(
                self.device)  #change conf to conf input that will sent!!
            fakez = torch.cat([fakez, conf], dim=1)
            fakez = torch.reshape(fakez, (self.batch_size, self.embedding_dim))

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)

            fakeact = self._apply_activate(fake)

            our_fackeacts.append(fakeact)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)

        data = data[:n]
        gen_fakeacts = torch.cat(our_fackeacts, 0)

        return self.transformer.inverse_transform(data, None), gen_fakeacts

    def save(self, path):
        assert hasattr(self, "generator")
        assert hasattr(self, "discriminator")
        assert hasattr(self, "transformer")

        # always save a cpu model.
        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)

        torch.save(self, path)

        self.device = device_bak
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    @classmethod
    def load(cls, path):
        model = torch.load(path)
        model.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.discriminator.to(model.device)
        return model

    def _calc_bb_confidence_loss(self, gen_out, conf_level):
        #added specific conf level as input arguement
        y_prob = self.blackbox_model.predict_proba(gen_out)
        y_conf_gen = y_prob[:, 0]  # confidence scores

        # create vector with the same size of y_confidence filled with `confidence_level` values
        if isinstance(conf_level, list):
            conf = np.random.choice(conf_level)
        else:
            conf = conf_level
        y_conf_wanted = np.full(len(y_conf_gen), conf)

        # to tensor
        y_conf_gen = torch.tensor(y_conf_gen,
                                  requires_grad=True).to(self.device)
        y_conf_wanted = torch.tensor(y_conf_wanted).to(self.device)

        # loss
        bb_loss = self._get_loss_by_name(self.bb_loss)
        bb_loss_val = bb_loss(y_conf_gen, y_conf_wanted)

        return bb_loss_val

    @staticmethod
    def _get_loss_by_name(loss_name):
        if loss_name == "log":
            return torch.nn.BCELoss()
        elif loss_name == "l1":
            return torch.nn.L1Loss()
        elif loss_name == "l2":
            return torch.nn.L1Loss()
        elif loss_name == "focal":
            return WeightedFocalLoss()
        else:
            raise ValueError(f"Unknown loss name '{loss_name}'")
Ejemplo n.º 7
0
class PATECTGAN(CTGANSynthesizer):
    def __init__(
            self,
            embedding_dim=128,
            gen_dim=(256, 256),
            dis_dim=(256, 256),
            l2scale=1e-6,
            epochs=300,
            pack=1,
            log_frequency=True,
            disabled_dp=False,
            target_delta=None,
            sigma=5,
            max_per_sample_grad_norm=1.0,
            loss='cross_entropy',
            regularization=None,  #regularizations supported: 'dragan'
            binary=False,
            batch_size=500,
            teacher_iters=5,
            student_iters=5,
            sample_per_teacher=1000,
            epsilon=8.0,
            delta=1e-5,
            noise_multiplier=1e-3,
            moments_order=100):

        # CTGAN model specifi3c parameters
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
        self.pack = pack
        self.log_frequency = log_frequency
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.loss = loss
        self.regularization = regularization
        self.sample_per_teacher = sample_per_teacher
        self.noise_multiplier = noise_multiplier
        self.moments_order = moments_order

        self.binary = binary
        self.batch_size = batch_size
        self.teacher_iters = teacher_iters
        self.student_iters = student_iters
        self.epsilon = epsilon
        self.delta = delta
        self.pd_cols = None
        self.pd_index = None

    def train(self,
              data,
              categorical_columns=None,
              ordinal_columns=None,
              update_epsilon=None,
              verbose=False,
              mlflow=False):
        if update_epsilon:
            self.epsilon = update_epsilon

        if not hasattr(self, "transformer"):
            self.transformer = DataTransformer()
            self.transformer.fit(data, discrete_columns=categorical_columns)

        data = self.transformer.transform(data)

        data_dim = self.transformer.output_dimensions

        if not hasattr(self, "cond_generator"):
            self.cond_generator = ConditionalGenerator(
                data, self.transformer.output_info, self.log_frequency)

        if not hasattr(self, "generator"):
            self.generator = Generator(
                self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
                data_dim).to(self.device)

        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(
                data_dim + self.cond_generator.n_opt, self.dis_dim, self.loss,
                self.pack).to(self.device)

        if not hasattr(self, "student_disc"):
            self.student_disc = self.discriminator
            self.student_disc.apply(weights_init)

        if not hasattr(self, "teacher_disc"):
            # this is to make sure each teacher has some samples
            sample_per_teacher = self.sample_per_teacher if self.sample_per_teacher < len(
                data) else 1000
            self.num_teachers = int(len(data) / sample_per_teacher) + 1
            self.data_partitions = np.array_split(data, self.num_teachers)
            # create conditional generator for each teacher model
            self.cond_generator_t = [
                ConditionalGenerator(d, self.transformer.output_info,
                                     self.log_frequency)
                for d in self.data_partitions
            ]
            self.teacher_disc = [
                self.discriminator for i in range(self.num_teachers)
            ]
            for i in range(self.num_teachers):
                self.teacher_disc[i].apply(weights_init)

        if not hasattr(self, "optimizerG"):
            self.optimizerG = optim.Adam(self.generator.parameters(),
                                         lr=2e-4,
                                         betas=(0.5, 0.9),
                                         weight_decay=self.l2scale)
        if not hasattr(self, "optimizerS"):
            self.optimizerS = optim.Adam(self.student_disc.parameters(),
                                         lr=2e-4,
                                         betas=(0.5, 0.9))
        if not hasattr(self, "optimizerT"):
            self.optimizerT = [
                optim.Adam(self.teacher_disc[i].parameters(),
                           lr=2e-4,
                           betas=(0.5, 0.9)) for i in range(self.num_teachers)
            ]

        if not hasattr(self, "train_eps"):
            self.alphas = torch.tensor(
                [0.0 for i in range(self.moments_order)], device=self.device)
            self.l_list = 1 + torch.tensor(range(self.moments_order),
                                           device=self.device)
            self.train_eps = 0

        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1
        one = torch.tensor(1, dtype=torch.float).to(self.device)
        mone = one * -1

        REAL_LABEL = 1
        FAKE_LABEL = 0

        criterion = nn.BCELoss() if (self.loss
                                     == "cross_entropy") else self.WLoss

        if (verbose):
            print("using loss {} and regularization {}".format(
                self.loss, self.regularization))

        epoch = 0
        while self.train_eps < self.epsilon:
            # train teacher discriminators
            for t_2 in range(self.teacher_iters):
                for i in range(self.num_teachers):

                    partition_data = self.data_partitions[i]
                    data_sampler = Sampler(partition_data,
                                           self.transformer.output_info)
                    fakez = torch.normal(mean, std=std).to(self.device)

                    condvec = self.cond_generator_t[i].sample(self.batch_size)

                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = data_sampler.sample(self.batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self.device)
                        m1 = torch.from_numpy(m1).to(self.device)
                        fakez = torch.cat([fakez, c1], dim=1)
                        perm = np.arange(self.batch_size)
                        np.random.shuffle(perm)
                        real = data_sampler.sample(self.batch_size, col[perm],
                                                   opt[perm])
                        c2 = c1[perm]

                    fake = self.generator(fakez)
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype('float32')).to(
                        self.device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fake

                    self.optimizerT[i].zero_grad()

                    y_all = torch.cat([
                        self.teacher_disc[i](fake_cat),
                        self.teacher_disc[i](real_cat)
                    ])
                    label_fake = torch.full(
                        (int(self.batch_size / self.pack), 1),
                        FAKE_LABEL,
                        dtype=torch.float,
                        device=self.device)
                    label_true = torch.full(
                        (int(self.batch_size / self.pack), 1),
                        REAL_LABEL,
                        dtype=torch.float,
                        device=self.device)
                    labels = torch.cat([label_fake, label_true])

                    errD = criterion(y_all, labels)
                    errD.backward()

                    if (self.regularization == 'dragan'):
                        pen = self.teacher_disc[i].dragan_penalty(
                            real_cat, device=self.device)
                        pen.backward(retain_graph=True)

                    self.optimizerT[i].step()

            # train student discriminator
            for t_3 in range(self.student_iters):
                data_sampler = Sampler(data, self.transformer.output_info)
                fakez = torch.normal(mean, std=std).to(self.device)

                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                else:
                    fake_cat = fake

                fake_data = fake_cat
                predictions, votes = pate(fake_data, self.teacher_disc,
                                          self.noise_multiplier, self.device)

                output = self.student_disc(fake_data.detach())

                # update moments accountant
                self.alphas = self.alphas + moments_acc(
                    self.num_teachers, votes, self.noise_multiplier,
                    self.l_list, self.device)

                loss_s = criterion(output, predictions.float().to(self.device))

                self.optimizerS.zero_grad()
                loss_s.backward()

                if (self.regularization == 'dragan'):
                    vals = torch.cat(
                        [predictions.float().to(self.device), fake_data],
                        axis=1)
                    ordered = vals[vals[:, 0].sort()[1]]
                    data_list = torch.split(
                        ordered,
                        predictions.shape[0] - int(predictions.sum().item()))
                    synth_cat = torch.cat(data_list[1:], axis=0)[:, 1:]
                    pen = self.student_disc.dragan_penalty(synth_cat,
                                                           device=self.device)
                    pen.backward(retain_graph=True)

                self.optimizerS.step()

            self.train_eps = min(
                (self.alphas - math.log(self.delta)) / self.l_list)

            #train generator
            fakez = torch.normal(mean=mean, std=std).to(self.device)
            condvec = self.cond_generator.sample(self.batch_size)

            if condvec is None:
                c1, m1, col, opt = None, None, None, None
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                m1 = torch.from_numpy(m1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)

            if c1 is not None:
                y_fake = self.student_disc(torch.cat([fakeact, c1], dim=1))
            else:
                y_fake = self.student_disc(fakeact)

            if condvec is None:
                cross_entropy = 0
            else:
                cross_entropy = self._cond_loss(fake, c1, m1)

            if self.loss == 'cross_entropy':
                label_g = torch.full((int(self.batch_size / self.pack), 1),
                                     REAL_LABEL,
                                     dtype=torch.float,
                                     device=self.device)
                loss_g = criterion(y_fake, label_g.float())
                loss_g = loss_g + cross_entropy
            else:
                loss_g = -torch.mean(y_fake) + cross_entropy

            self.optimizerG.zero_grad()
            loss_g.backward()
            self.optimizerG.step()

            if (verbose):
                print('eps: {:f} \t G: {:f} \t D: {:f}'.format(
                    self.train_eps,
                    loss_g.detach().cpu(),
                    loss_s.detach().cpu()))

            if (mlflow):
                import mlflow
                mlflow.log_metric("loss_g",
                                  float(loss_g.detach().cpu()),
                                  step=epoch)
                mlflow.log_metric("loss_d",
                                  float(loss_s.detach().cpu()),
                                  step=epoch)
                mlflow.log_metric("epsilon", float(self.train_eps), step=epoch)

            epoch += 1

    def WLoss(self, output, labels):
        vals = torch.cat([labels, output], axis=1)
        ordered = vals[vals[:, 0].sort()[1]]
        data_list = torch.split(ordered,
                                labels.shape[0] - int(labels.sum().item()))
        fake_score = data_list[0][:, 1]
        true_score = torch.cat(data_list[1:], axis=0)[:, 1]
        w_loss = -(torch.mean(true_score) - torch.mean(fake_score))
        return w_loss

    def generate(self, n):
        self.generator.eval()

        steps = n // self.batch_size + 1

        data = []

        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        generated_data = self.transformer.inverse_transform(data, None)

        return generated_data

    def save(self, path):
        assert hasattr(self, "generator")
        assert hasattr(self, "teacher_disc")
        assert hasattr(self, "student_disc")
        assert hasattr(self, "cond_generator")
        assert hasattr(self, "cond_generator_t")

        # always save a cpu model.
        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        for i in range(self.num_teachers):
            self.teacher_disc[i].to(self.device)
        self.student_disc.to(self.device)

        torch.save(self, path)

        self.device = device_bak
        self.generator.to(self.device)
        for i in range(self.num_teachers):
            self.teacher_disc[i].to(self.device)
        self.student_disc.to(self.device)

    @classmethod
    def load(cls, path):
        model = torch.load(path)
        model.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.student_disc.to(model.device)
        for i in range(model.num_teachers):
            model.teacher_disc[i].to(model.device)
        return model
Ejemplo n.º 8
0
class CTGANSynthesizer(object):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.

    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        gen_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        dis_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        l2scale (float):
            Weight Decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper
            default is 5. Default used is 1 to match original CTGAN implementation.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        blackbox_model:
            Model that implements fit, predict, predict_proba
    """
    def __init__(
        self,
        embedding_dim=128,
        gen_dim=(256, 256),
        dis_dim=(256, 256),
        l2scale=1e-6,
        batch_size=500,
        discriminator_steps=1,
        log_frequency=True,
        blackbox_model=None,
        preprocessing_pipeline=None,
        bb_loss="logloss",
    ):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.log_frequency = log_frequency
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.trained_epoches = 0
        self.discriminator_steps = discriminator_steps
        self.blackbox_model = blackbox_model
        self.preprocessing_pipeline = preprocessing_pipeline
        self.confidence_level = -1  # will set in fit
        self.bb_loss = bb_loss

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Deals with the instability of the gumbel_softmax for older versions of torch.

        For more details about the issue:
        https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing

        Args:
            logits:
                […, num_features] unnormalized log probabilities
            tau:
                non-negative scalar temperature
            hard:
                if True, the returned samples will be discretized as one-hot vectors,
                but will be differentiated as if it is the soft sample in autograd
            dim (int):
                a dimension along which softmax will be computed. Default: -1.

        Returns:
            Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
        """

        if version.parse(torch.__version__) < version.parse("1.2.0"):
            for i in range(10):
                transformed = functional.gumbel_softmax(logits,
                                                        tau=tau,
                                                        hard=hard,
                                                        eps=eps,
                                                        dim=dim)
                if not torch.isnan(transformed).any():
                    return transformed
            raise ValueError("gumbel_softmax returning NaN.")

        return functional.gumbel_softmax(logits,
                                         tau=tau,
                                         hard=hard,
                                         eps=eps,
                                         dim=dim)

    def _apply_activate(self, data):
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == "tanh":
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == "softmax":
                ed = st + item[0]
                transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                data_t.append(transformed)
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == "tanh":
                st += item[0]
                skip = True

            elif item[1] == "softmax":
                if skip:
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(
                    data[:, st:ed],
                    torch.argmax(c[:, st_c:ed_c], dim=1),
                    reduction="none",
                )
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def fit(
        self,
        train_data,
        discrete_columns=tuple(),
        epochs=300,
        confidence_level=-1,
        verbose=True,
        gen_lr=2e-4,
    ):
        """Fit the CTGAN Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a
                pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
            epochs (int):
                Number of training epochs. Defaults to 300.
        """
        self.confidence_level = confidence_level
        loss_other_name = "loss_bb" if confidence_level != -1 else "loss_d"
        history = {"loss_g": [], loss_other_name: []}

        # Eli: add Mode-specific Normalization

        if not hasattr(self, "transformer"):
            self.transformer = DataTransformer()
            self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions

        if not hasattr(self, "cond_generator"):
            self.cond_generator = ConditionalGenerator(
                train_data, self.transformer.output_info, self.log_frequency)

        if not hasattr(self, "generator"):
            self.generator = Generator(
                self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
                data_dim).to(self.device)

        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(
                data_dim + self.cond_generator.n_opt,
                self.dis_dim).to(self.device)

        if not hasattr(self, "optimizerG"):
            self.optimizerG = optim.Adam(
                self.generator.parameters(),
                lr=gen_lr,
                betas=(0.5, 0.9),
                weight_decay=self.l2scale,
            )

        if not hasattr(self, "optimizerD"):
            self.optimizerD = optim.Adam(self.discriminator.parameters(),
                                         lr=2e-4,
                                         betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0

        # init mean to zero and std to one
        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        # steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        steps_per_epoch = 10  # magic number decided with Gilad. feel free to change it

        # Eli: start training loop
        for i in range(epochs):
            self.trained_epoches += 1
            for id_ in range(steps_per_epoch):

                if self.confidence_level == -1:
                    # discriminator loop
                    for n in range(self.discriminator_steps):
                        fakez = torch.normal(mean=mean, std=std)

                        condvec = self.cond_generator.sample(self.batch_size)
                        if condvec is None:
                            c1, m1, col, opt = None, None, None, None
                            real = data_sampler.sample(self.batch_size, col,
                                                       opt)
                        else:
                            c1, m1, col, opt = condvec
                            c1 = torch.from_numpy(c1).to(self.device)
                            m1 = torch.from_numpy(m1).to(self.device)
                            fakez = torch.cat([fakez, c1], dim=1)

                            perm = np.arange(self.batch_size)
                            np.random.shuffle(perm)
                            real = data_sampler.sample(self.batch_size,
                                                       col[perm], opt[perm])
                            c2 = c1[perm]

                        fake = self.generator(fakez)
                        fakeact = self._apply_activate(fake)

                        real = torch.from_numpy(real.astype("float32")).to(
                            self.device)

                        if c1 is not None:
                            fake_cat = torch.cat([fakeact, c1], dim=1)
                            real_cat = torch.cat([real, c2], dim=1)
                        else:
                            real_cat = real
                            fake_cat = fake

                        y_fake = self.discriminator(fake_cat)
                        y_real = self.discriminator(real_cat)

                        pen = self.discriminator.calc_gradient_penalty(
                            real_cat, fake_cat, self.device)
                        loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                        if self.confidence_level == -1:  # without bb loss
                            self.optimizerD.zero_grad()
                            pen.backward(retain_graph=True)
                            loss_d.backward()
                            self.optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = self.discriminator(torch.cat([fakeact, c1],
                                                          dim=1))
                else:
                    y_fake = self.discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                if self.confidence_level != -1:
                    # generate `batch_size` samples
                    gen_out = self.sample(self.batch_size)
                    loss_bb = self._calc_bb_confidence_loss(gen_out)
                    loss_g = loss_bb + cross_entropy
                else:  # original loss
                    loss_g = -torch.mean(y_fake) + cross_entropy

                self.optimizerG.zero_grad()
                loss_g.backward()
                self.optimizerG.step()

            loss_g_val = loss_g.detach().cpu()
            loss_other_val = locals()[loss_other_name].detach().cpu()
            history["loss_g"].append(loss_g.item())
            history[loss_other_name].append(loss_other_val.item())

            if verbose:
                print(
                    f"Epoch {self.trained_epoches}, Loss G: {loss_g_val}, {loss_other_name}: {loss_other_val}",
                    flush=True,
                )

        return history

    def sample(self, n, condition_column=None, condition_value=None):
        """Sample data similar to the training data.

        Choosing a condition_column and condition_value will increase the probability of the
        discrete condition_value happening in the condition_column.

        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to increase the
                probability of happening.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """

        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = (
                self.cond_generator.generate_cond_from_condition_column_info(
                    condition_info, self.batch_size))
        else:
            global_condition_vec = None

        steps = n // self.batch_size + 1
        data = []
        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            if global_condition_vec is not None:
                condvec = global_condition_vec.copy()
            else:
                condvec = self.cond_generator.sample_zero(self.batch_size)

            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]
        # return data
        return self.transformer.inverse_transform(data, None)

    def save(self, path):
        assert hasattr(self, "generator")
        assert hasattr(self, "discriminator")
        assert hasattr(self, "transformer")

        # always save a cpu model.
        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)

        torch.save(self, path)

        self.device = device_bak
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    @classmethod
    def load(cls, path):
        model = torch.load(path)
        model.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.discriminator.to(model.device)
        return model

    def _calc_bb_confidence_loss(self, gen_out):

        y_prob = self.blackbox_model.predict_proba(gen_out)
        y_conf_gen = y_prob[:, 0]  # confidence scores

        # create vector with the same size of y_confidence filled with `confidence_level` values
        if isinstance(self.confidence_level, list):
            conf = np.random.choice(self.confidence_level)
        else:
            conf = self.confidence_level
        y_conf_wanted = np.full(len(y_conf_gen), conf)

        # to tensor
        y_conf_gen = torch.tensor(y_conf_gen,
                                  requires_grad=True).to(self.device)
        y_conf_wanted = torch.tensor(y_conf_wanted).to(self.device)

        # loss
        bb_loss = self._get_loss_by_name(self.bb_loss)
        bb_loss_val = bb_loss(y_conf_gen, y_conf_wanted)

        return bb_loss_val

    @staticmethod
    def _get_loss_by_name(loss_name):
        if loss_name == "log":
            return torch.nn.BCELoss()
        elif loss_name == "l1":
            return torch.nn.L1Loss()
        elif loss_name == "l2":
            return torch.nn.L1Loss()
        elif loss_name == "focal":
            return WeightedFocalLoss()
        else:
            raise ValueError(f"Unknown loss name '{loss_name}'")
Ejemplo n.º 9
0
class PATECTGAN(CTGANSynthesizer):
    def __init__(
        self,
        embedding_dim=128,
        gen_dim=(256, 256),
        dis_dim=(256, 256),
        l2scale=1e-6,
        epochs=300,
        pack=1,
        log_frequency=True,
        disabled_dp=False,
        target_delta=None,
        sigma=5,
        max_per_sample_grad_norm=1.0,
        verbose=False,
        loss="cross_entropy",  # losses supported: 'cross_entropy', 'wasserstein'
        regularization=None,  # regularizations supported: 'dragan'
        binary=False,
        batch_size=500,
        teacher_iters=5,
        student_iters=5,
        sample_per_teacher=1000,
        epsilon=8.0,
        delta=1e-5,
        noise_multiplier=1e-3,
        moments_order=100,
    ):

        # CTGAN model specifi3c parameters
        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim
        self.l2scale = l2scale
        self.batch_size = batch_size
        self.epochs = epochs
        self.pack = pack
        self.log_frequency = log_frequency
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.verbose = verbose
        self.loss = loss
        self.regularization = regularization if self.loss != "wasserstein" else "dragan"
        self.sample_per_teacher = sample_per_teacher
        self.noise_multiplier = noise_multiplier
        self.moments_order = moments_order

        self.binary = binary
        self.batch_size = batch_size
        self.teacher_iters = teacher_iters
        self.student_iters = student_iters
        self.epsilon = epsilon
        self.delta = delta
        self.pd_cols = None
        self.pd_index = None

    def train(self,
              data,
              categorical_columns=None,
              ordinal_columns=None,
              update_epsilon=None):
        if update_epsilon:
            self.epsilon = update_epsilon

        sample_per_teacher = (self.sample_per_teacher
                              if self.sample_per_teacher < len(data) else 1000)
        self.num_teachers = int(len(data) / sample_per_teacher) + 1
        self.transformer = DataTransformer()
        self.transformer.fit(data, discrete_columns=categorical_columns)
        data = self.transformer.transform(data)
        data_partitions = np.array_split(data, self.num_teachers)

        data_dim = self.transformer.output_dimensions

        self.cond_generator = ConditionalGenerator(
            data, self.transformer.output_info, self.log_frequency)

        # create conditional generator for each teacher model
        cond_generator = [
            ConditionalGenerator(d, self.transformer.output_info,
                                 self.log_frequency) for d in data_partitions
        ]

        self.generator = Generator(
            self.embedding_dim + self.cond_generator.n_opt, self.gen_dim,
            data_dim).to(self.device)

        discriminator = Discriminator(data_dim + self.cond_generator.n_opt,
                                      self.dis_dim, self.loss,
                                      self.pack).to(self.device)

        student_disc = discriminator
        student_disc.apply(weights_init)

        teacher_disc = [discriminator for i in range(self.num_teachers)]
        for i in range(self.num_teachers):
            teacher_disc[i].apply(weights_init)

        optimizer_g = optim.Adam(self.generator.parameters(),
                                 lr=2e-4,
                                 betas=(0.5, 0.9),
                                 weight_decay=self.l2scale)
        optimizer_s = optim.Adam(student_disc.parameters(),
                                 lr=2e-4,
                                 betas=(0.5, 0.9))
        optimizer_t = [
            optim.Adam(teacher_disc[i].parameters(), lr=2e-4, betas=(0.5, 0.9))
            for i in range(self.num_teachers)
        ]

        noise_multiplier = self.noise_multiplier
        alphas = torch.tensor([0.0 for i in range(self.moments_order)],
                              device=self.device)
        l_list = 1 + torch.tensor(range(self.moments_order),
                                  device=self.device)
        eps = 0

        mean = torch.zeros(self.batch_size,
                           self.embedding_dim,
                           device=self.device)
        std = mean + 1

        real_label = 1
        fake_label = 0

        criterion = nn.BCELoss() if (self.loss
                                     == "cross_entropy") else self.w_loss

        if self.verbose:
            print("using loss {} and regularization {}".format(
                self.loss, self.regularization))

        while eps < self.epsilon:
            # train teacher discriminators
            for t_2 in range(self.teacher_iters):
                for i in range(self.num_teachers):
                    partition_data = data_partitions[i]
                    data_sampler = Sampler(partition_data,
                                           self.transformer.output_info)
                    fakez = torch.normal(mean, std=std).to(self.device)

                    condvec = cond_generator[i].sample(self.batch_size)

                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = data_sampler.sample(self.batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self.device)
                        m1 = torch.from_numpy(m1).to(self.device)
                        fakez = torch.cat([fakez, c1], dim=1)
                        perm = np.arange(self.batch_size)
                        np.random.shuffle(perm)
                        real = data_sampler.sample(self.batch_size, col[perm],
                                                   opt[perm])
                        c2 = c1[perm]

                    fake = self.generator(fakez)
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype("float32")).to(
                        self.device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fake

                    optimizer_t[i].zero_grad()

                    y_all = torch.cat(
                        [teacher_disc[i](fake_cat), teacher_disc[i](real_cat)])
                    label_fake = torch.full(
                        (int(self.batch_size / self.pack), 1),
                        fake_label,
                        dtype=torch.float,
                        device=self.device,
                    )
                    label_true = torch.full(
                        (int(self.batch_size / self.pack), 1),
                        real_label,
                        dtype=torch.float,
                        device=self.device,
                    )
                    labels = torch.cat([label_fake, label_true])

                    error_d = criterion(y_all, labels)
                    error_d.backward()

                    if self.regularization == "dragan":
                        pen = teacher_disc[i].dragan_penalty(
                            real_cat, device=self.device)
                        pen.backward(retain_graph=True)

                    optimizer_t[i].step()

            # train student discriminator
            for t_3 in range(self.student_iters):
                data_sampler = Sampler(data, self.transformer.output_info)
                fakez = torch.normal(mean, std=std)

                condvec = self.cond_generator.sample(self.batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self.device)
                    m1 = torch.from_numpy(m1).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm],
                                               opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                else:
                    fake_cat = fake

                fake_data = fake_cat
                predictions, votes = pate(fake_data,
                                          teacher_disc,
                                          noise_multiplier,
                                          device=self.device)

                output = student_disc(fake_data.detach())

                # update moments accountant
                alphas = alphas + moments_acc(self.num_teachers,
                                              votes,
                                              noise_multiplier,
                                              l_list,
                                              device=self.device)

                loss_s = criterion(output, predictions.float().to(self.device))

                optimizer_s.zero_grad()
                loss_s.backward()

                if self.regularization == "dragan":
                    vals = torch.cat([predictions, fake_data], axis=1)
                    ordered = vals[vals[:, 0].sort()[1]]
                    data_list = torch.split(
                        ordered,
                        predictions.shape[0] - int(predictions.sum().item()))
                    synth_cat = torch.cat(data_list[1:], axis=0)[:, 1:]
                    pen = student_disc.dragan_penalty(synth_cat,
                                                      device=self.device)
                    pen.backward(retain_graph=True)

                optimizer_s.step()

            # print ('iterator {i}, student discriminator loss is {j}'.format(i=t_3, j=loss_s))

            # train generator
            fakez = torch.normal(mean=mean, std=std)
            condvec = self.cond_generator.sample(self.batch_size)

            if condvec is None:
                c1, m1, col, opt = None, None, None, None
            else:
                c1, m1, col, opt = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                m1 = torch.from_numpy(m1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)

            if c1 is not None:
                y_fake = student_disc(torch.cat([fakeact, c1], dim=1))
            else:
                y_fake = student_disc(fakeact)

            if condvec is None:
                cross_entropy = 0
            else:
                cross_entropy = self._cond_loss(fake, c1, m1)

            if self.loss == "cross_entropy":
                label_g = torch.full(
                    (int(self.batch_size / self.pack), 1),
                    real_label,
                    dtype=torch.float,
                    device=self.device,
                )
                loss_g = criterion(y_fake, label_g.float())
                loss_g = loss_g + cross_entropy
            else:
                loss_g = -torch.mean(y_fake) + cross_entropy

            optimizer_g.zero_grad()
            loss_g.backward()
            optimizer_g.step()

            eps = min((alphas - math.log(self.delta)) / l_list)

            if self.verbose:
                print("eps: {:f} \t G: {:f} \t D: {:f}".format(
                    eps,
                    loss_g.detach().cpu(),
                    loss_s.detach().cpu()))

    def w_loss(self, output, labels):
        vals = torch.cat([labels, output], axis=1)
        ordered = vals[vals[:, 0].sort()[1]]
        data_list = torch.split(ordered,
                                labels.shape[0] - int(labels.sum().item()))
        fake_score = data_list[0][:, 1]
        true_score = torch.cat(data_list[1:], axis=0)[:, 1]
        w_loss = -(torch.mean(true_score) - torch.mean(fake_score))
        return w_loss

    def generate(self, n):
        self.generator.eval()

        steps = n // self.batch_size + 1

        data = []

        for i in range(steps):
            mean = torch.zeros(self.batch_size, self.embedding_dim)
            std = mean + 1
            fakez = torch.normal(mean=mean, std=std).to(self.device)

            condvec = self.cond_generator.sample_zero(self.batch_size)
            if condvec is None:
                pass
            else:
                c1 = condvec
                c1 = torch.from_numpy(c1).to(self.device)
                fakez = torch.cat([fakez, c1], dim=1)

            fake = self.generator(fakez)
            fakeact = self._apply_activate(fake)
            data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        generated_data = self.transformer.inverse_transform(data, None)

        return generated_data