Example #1
    def prepare_sampling(self):
        """Prepare model for generate samples."""
        if self.model is None:
            self.model = self.get_model(training=False)
        else:
            self.model.training = False

        predict_config = PredictConfig(
            session_init=SaverRestore(self.restore_path),
            model=self.model,
            input_names=['z'],
            output_names=['gen/gen', 'z'],
        )

        self.simple_dataset_predictor = SimpleDatasetPredictor(
            predict_config, RandomZData((self.batch_size, self.z_dim)))
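
The predictor built above is consumed by iterating over get_result(); each
yielded tuple follows output_names, so the generated batch sits in position 0
(this mirrors the sample() loop in Example #3). A minimal sketch:

# Pull one generated batch out of the prepared predictor.
for o in self.simple_dataset_predictor.get_result():
    generated = o[0]  # output of 'gen/gen', shape (batch_size, feature_dim)
    break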
Example #2
from os.path import join as ospj

from tensorpack import PredictConfig, SimpleDatasetPredictor
from tensorpack.tfutils.sessinit import get_model_loader

# `get_data` is assumed to be a project-local helper that builds the
# validation dataflow.


def get_model(model, ckpt_name, option):
    model_path = ospj('train_log', option.log_dir, ckpt_name)
    ds = get_data('val', option)
    pred_config = PredictConfig(
        model=model,
        session_init=get_model_loader(model_path),
        input_names=['input', 'label', 'bbox'],
        output_names=['wrong-top1', 'top5', 'actmap', 'grad'],
        return_input=True)

    return SimpleDatasetPredictor(pred_config, ds)
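
A minimal sketch of driving this helper; `MyModelDesc` and `opts` are
hypothetical stand-ins for the project's model description and options
object. Because the config sets return_input=True, each result is expected
to pair the input datapoint with the outputs listed in output_names:

predictor = get_model(MyModelDesc(), 'model-10000', opts)
for dp, outputs in predictor.get_result():
    # dp holds the (input, label, bbox) datapoint that produced the outputs.
    wrong_top1, top5, actmap, grad = outputs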
Example #3
import json
import os
import pickle
import tarfile

import numpy as np
from tensorpack import (BatchData, ModelSaver, PredictConfig, QueueInput,
                        SaverRestore, SimpleDatasetPredictor)
from tensorpack.utils import logger

# Project-local imports; the module paths below follow the upstream TGAN
# layout and may differ in this code base. `GraphBuilder` is assumed to be
# defined alongside this class.
from tgan.data import Preprocessor, RandomZData, TGANDataFlow
from tgan.trainer import GANTrainer, SeparateGANTrainer


class TGANModel:
    """Main model from TGAN.

    Args:
        continuous_columns (list[int]): 0-indexed list of column indices to be considered
            continuous.
        sensitive_column: Column to be treated as the sensitive attribute.
        output (str, optional): Path to store the model and its artifacts. Defaults to
            :attr:`output`.
        gpu (str, optional): Comma-separated list of GPU(s) to use. Defaults to :attr:`None`.
        max_epoch (int, optional): Number of epochs to use during training. Defaults to :attr:`5`.
        steps_per_epoch (int, optional): Number of steps to run on each epoch. Defaults to
            :attr:`10000`.
        save_checkpoints (bool, optional): Whether or not to store checkpoints of the model after
            each training epoch. Defaults to :attr:`True`.
        restore_session (bool, optional): Whether or not to continue training from the last
            checkpoint. Defaults to :attr:`True`.
        batch_size (int, optional): Size of the batch to feed the model at each step. Defaults to
            :attr:`200`.
        z_dim (int, optional): Number of dimensions in the noise input for the generator.
            Defaults to :attr:`200`.
        noise (float, optional): Upper bound to the gaussian noise added to categorical columns.
            Defaults to :attr:`0.2`.
        l2norm (float, optional): L2 regularization coefficient when computing losses. Defaults
            to :attr:`0.00001`.
        discrim_learning_rate (float, optional): Learning rate for the discriminator optimizer.
            Defaults to :attr:`0.001`.
        fair_learning_rate (float, optional): Learning rate for the fairness objective. Defaults
            to :attr:`0.0002`.
        num_gen_rnn (int, optional): Defaults to :attr:`100`.
        num_gen_feature (int, optional): Number of features in the generator. Defaults to
            :attr:`100`.
        num_dis_layers (int, optional): Defaults to :attr:`1`.
        num_dis_hidden (int, optional): Defaults to :attr:`100`.
        optimizer (str, optional): Name of the optimizer to use during `fit`. Possible values
            are: [`GradientDescentOptimizer`, `AdamOptimizer`, `AdadeltaOptimizer`]. Defaults to
            :attr:`AdamOptimizer`.
        trainer (str, optional): Name of the trainer to use during `fit`. Possible values are:
            [`GANTrainer`, `SeparateGANTrainer`]. Defaults to :attr:`GANTrainer`.
    """
    def __init__(self,
                 continuous_columns,
                 sensitive_column,
                 output='output',
                 gpu=None,
                 max_epoch=5,
                 steps_per_epoch=10000,
                 save_checkpoints=True,
                 restore_session=True,
                 batch_size=200,
                 z_dim=200,
                 noise=0.2,
                 l2norm=0.00001,
                 discrim_learning_rate=0.001,
                 fair_learning_rate=0.0002,
                 num_gen_rnn=100,
                 num_gen_feature=100,
                 num_dis_layers=1,
                 num_dis_hidden=100,
                 optimizer='AdamOptimizer',
                 trainer='GANTrainer'):
        """Initialize object."""
        # Output
        self.continuous_columns = continuous_columns
        self.sensitive_column = sensitive_column
        self.log_dir = os.path.join(output, 'logs')
        self.model_dir = os.path.join(output, 'model')
        self.output = output

        # Training params
        self.max_epoch = max_epoch
        self.steps_per_epoch = steps_per_epoch
        self.save_checkpoints = save_checkpoints
        self.restore_session = restore_session

        # Model params
        self.model = None
        self.batch_size = batch_size
        self.z_dim = z_dim
        self.noise = noise
        self.l2norm = l2norm
        self.discrim_learning_rate = discrim_learning_rate
        self.fair_learning_rate = fair_learning_rate
        self.num_gen_rnn = num_gen_rnn
        self.num_gen_feature = num_gen_feature
        self.num_dis_layers = num_dis_layers
        self.num_dis_hidden = num_dis_hidden
        self.optimizer = optimizer
        self.trainer = trainer

        if gpu:
            os.environ['CUDA_VISIBLE_DEVICES'] = gpu

        self.gpu = gpu

    def get_model(self, training=True):
        """Return a new instance of the model."""
        return GraphBuilder(metadata=self.metadata,
                            sensitive_column=self.sensitive_column,
                            batch_size=self.batch_size,
                            z_dim=self.z_dim,
                            noise=self.noise,
                            l2norm=self.l2norm,
                            discrim_learning_rate=self.discrim_learning_rate,
                            fair_learning_rate=self.fair_learning_rate,
                            num_gen_rnn=self.num_gen_rnn,
                            num_gen_feature=self.num_gen_feature,
                            num_dis_layers=self.num_dis_layers,
                            num_dis_hidden=self.num_dis_hidden,
                            optimizer=self.optimizer,
                            training=training)

    def prepare_sampling(self):
        """Prepare model for generate samples."""
        if self.model is None:
            self.model = self.get_model(training=False)
        else:
            self.model.training = False

        predict_config = PredictConfig(
            session_init=SaverRestore(self.restore_path),
            model=self.model,
            input_names=['z'],
            output_names=['gen/gen', 'z'],
        )

        self.simple_dataset_predictor = SimpleDatasetPredictor(
            predict_config, RandomZData((self.batch_size, self.z_dim)))

    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data (pandas.DataFrame): Dataset to fit the model.

        Returns:
            None

        """
        self.preprocessor = Preprocessor(
            continuous_columns=self.continuous_columns)

        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata
        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        if self.trainer == 'GANTrainer':
            trainer = GANTrainer(model=self.model, input_queue=input_queue)
        elif self.trainer == 'SeparateGANTrainer':
            trainer = SeparateGANTrainer(model=self.model,
                                         input_queue=input_queue)
        else:
            raise ValueError(
                'Incorrect trainer name. Use GANTrainer or SeparateGANTrainer')

        self.restore_path = os.path.join(self.model_dir, 'checkpoint')

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, 'stats.json')) as f:
                starting_epoch = json.load(f)[-1]['epoch_num'] + 1
        else:
            session_init = None
            starting_epoch = 1

        action = 'k' if self.restore_session else None
        logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

        trainer.train_with_defaults(callbacks=callbacks,
                                    steps_per_epoch=self.steps_per_epoch,
                                    max_epoch=self.max_epoch,
                                    session_init=session_init,
                                    starting_epoch=starting_epoch)

        self.prepare_sampling()

    def sample(self, num_samples):
        """Generate samples from model.

        Args:
            num_samples(int)

        Returns:
            None

        Raises:
            ValueError

        """
        # Round up so at least num_samples rows are generated (the excess is
        # trimmed below); plain floor division would make max_iters zero when
        # num_samples < batch_size, and the loop below would never break.
        max_iters = (num_samples + self.batch_size - 1) // self.batch_size

        results = []
        for idx, o in enumerate(self.simple_dataset_predictor.get_result()):
            results.append(o[0])
            if idx + 1 == max_iters:
                break

        results = np.concatenate(results, axis=0)

        ptr = 0
        features = {}
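        # Regroup the flat generator output column by column: a categorical
        # column occupies a single slot, while a continuous column occupies
        # one value slot followed by `n` Gaussian-mixture probabilities.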
        for col_id, col_info in enumerate(self.metadata['details']):
            if col_info['type'] == 'category':
                features['f%02d' % col_id] = results[:, ptr:ptr + 1]
                ptr += 1

            elif col_info['type'] == 'value':
                gaussian_components = col_info['n']
                val = results[:, ptr:ptr + 1]
                ptr += 1
                pro = results[:, ptr:ptr + gaussian_components]
                ptr += gaussian_components
                features['f%02d' % col_id] = np.concatenate([val, pro], axis=1)

            else:
                raise ValueError(
                    "self.metadata['details'][{}]['type'] must be either `category` or "
                    "`value`. Instead it was {}.".format(
                        col_id, col_info['type']))

        return self.preprocessor.reverse_transform(
            features)[:num_samples].copy()

    def tar_folder(self, tar_name):
        """Generate a tar of :self.output:."""
        with tarfile.open(tar_name, 'w:gz') as tar_handle:
            for root, dirs, files in os.walk(self.output):
                for file_ in files:
                    tar_handle.add(os.path.join(root, file_))

    @classmethod
    def load(cls, path):
        """Load a pretrained model from a given path."""
        with tarfile.open(path, 'r:gz') as tar_handle:
            destination_dir = os.path.dirname(tar_handle.getmembers()[0].name)
            tar_handle.extractall()

        with open('{}/TGANModel'.format(destination_dir), 'rb') as f:
            instance = pickle.load(f)

        instance.prepare_sampling()
        return instance

    def save(self, path, force=False):
        """Save the fitted model in the given path."""
        if os.path.exists(path) and not force:
            logger.info(
                'The indicated path already exists. Use `force=True` to overwrite.'
            )
            return

        base_path = os.path.dirname(path)
        if not os.path.exists(base_path):
            os.makedirs(base_path)

        model = self.model
        dataset_predictor = self.simple_dataset_predictor

        self.model = None
        self.simple_dataset_predictor = None

        with open('{}/TGANModel'.format(self.output), 'wb') as f:
            pickle.dump(self, f)

        self.model = model
        self.simple_dataset_predictor = dataset_predictor

        self.tar_folder(path)

        logger.info('Model saved successfully.')
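
Taken together, the class above suggests the following end-to-end usage; a
minimal sketch in which the CSV path, the continuous column indices, and the
sensitive column are placeholder choices:

import pandas as pd

data = pd.read_csv('data.csv')  # placeholder dataset

tgan = TGANModel(
    continuous_columns=[0, 5],  # assumed continuous columns
    sensitive_column=9,         # assumed sensitive attribute
    max_epoch=5,
    steps_per_epoch=10000,
)
tgan.fit(data)                           # preprocess, build graph, train
samples = tgan.sample(1000)              # synthesize 1000 rows
tgan.save('demo/tgan_model.tar.gz')      # pickle + tar the output folder

restored = TGANModel.load('demo/tgan_model.tar.gz')
more = restored.sample(500)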