def fit(self, data):
    """Fit the model to the given data.

    Args:
        data (pandas.DataFrame): dataset to fit the model.

    Returns:
        None

    """
    self.preprocessor = Preprocessor(continuous_columns=self.continuous_columns)
    data = self.preprocessor.fit_transform(data)
    self.metadata = self.preprocessor.metadata

    dataflow = TGANDataFlow(data, self.metadata)
    batch_data = BatchData(dataflow, self.batch_size)
    input_queue = QueueInput(batch_data)

    self.model = self.get_model(training=True)

    # Local import, as in the original: CometMLMonitor is only needed here.
    from tensorpack.callbacks import CometMLMonitor

    trainer = SeparateGANTrainer(
        model=self.model,
        input_queue=input_queue,
        g_period=6,
    )

    self.restore_path = os.path.join(self.model_dir, 'checkpoint')

    if os.path.isfile(self.restore_path) and self.restore_session:
        session_init = SaverRestore(self.restore_path)
        with open(os.path.join(self.log_dir, 'stats.json')) as f:
            starting_epoch = json.load(f)[-1]['epoch_num'] + 1
    else:
        session_init = None
        starting_epoch = 1

    action = 'k' if self.restore_session else 'd'
    # logger.set_logger_dir(self.log_dir, action=action)

    callbacks = []
    monitors = []
    if self.save_checkpoints:
        callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))
        callbacks.append(MergeAllSummaries(period=10))

    if self.experiment is not None:
        monitors.append(CometMLMonitor(experiment=self.experiment))

    trainer.train_with_defaults(
        callbacks=callbacks,
        monitors=monitors,
        steps_per_epoch=self.steps_per_epoch,
        max_epoch=self.max_epoch,
        session_init=session_init,
        starting_epoch=starting_epoch,
    )

    self.prepare_sampling()
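# A minimal usage sketch for the variant above (illustrative only; the toy
# DataFrame and hyperparameter values are hypothetical). With tensorpack's
# SeparateGANTrainer, `g_period=6` runs one generator update for every six
# steps, while the discriminator is updated at every step.
#
#     import pandas as pd
#
#     data = pd.DataFrame({
#         'age': [23, 45, 31, 52],        # column 0: continuous
#         'label': ['a', 'b', 'a', 'b'],  # column 1: categorical
#     })
#     model = TGANModel(continuous_columns=[0], max_epoch=1, steps_per_epoch=100)
#     model.fit(data)  # preprocess, train, then prepare sampling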
class TGANModel:
    """Main model from TGAN.

    Args:
        continuous_columns (list[int]): 0-indexed list of the column indices to be considered
            continuous.
        output (str, optional): Path to store the model and its artifacts. Defaults to
            :attr:`output`.
        gpu (str, optional): Comma-separated list of GPU(s) to use. Defaults to :attr:`None`.
        max_epoch (int, optional): Number of epochs to use during training. Defaults to :attr:`5`.
        steps_per_epoch (int, optional): Number of steps to run on each epoch. Defaults to
            :attr:`10000`.
        save_checkpoints (bool, optional): Whether or not to store checkpoints of the model after
            each training epoch. Defaults to :attr:`True`.
        restore_session (bool, optional): Whether or not to continue training from the last
            checkpoint. Defaults to :attr:`True`.
        batch_size (int, optional): Size of the batch to feed the model at each step. Defaults to
            :attr:`200`.
        z_dim (int, optional): Number of dimensions in the noise input for the generator.
            Defaults to :attr:`200`.
        noise (float, optional): Upper bound to the gaussian noise added to categorical columns.
            Defaults to :attr:`0.2`.
        l2norm (float, optional): L2 regularization coefficient when computing losses. Defaults
            to :attr:`0.00001`.
        learning_rate (float, optional): Learning rate for the optimizer. Defaults to
            :attr:`0.001`.
        num_gen_rnn (int, optional): Defaults to :attr:`100`.
        num_gen_feature (int, optional): Number of features in the generator. Defaults to
            :attr:`100`.
        num_dis_layers (int, optional): Defaults to :attr:`1`.
        num_dis_hidden (int, optional): Defaults to :attr:`100`.
        optimizer (str, optional): Name of the optimizer to use during `fit`. Possible values
            are: [`GradientDescentOptimizer`, `AdamOptimizer`, `AdadeltaOptimizer`].
            Defaults to :attr:`AdamOptimizer`.
        comet_ml_key (str, optional): API key used to create a comet_ml Experiment when no
            `experiment` is given. Defaults to :attr:`None`.
        experiment (comet_ml.Experiment, optional): Existing comet_ml Experiment to log to.
            Defaults to :attr:`None`.
        ds (str, optional): Name of the dataset, logged to comet_ml. Defaults to :attr:`None`.

    """

    def __init__(self, continuous_columns, output='output', gpu=None, max_epoch=5,
                 steps_per_epoch=10000, save_checkpoints=True, restore_session=True,
                 batch_size=200, z_dim=200, noise=0.2, l2norm=0.00001, learning_rate=0.001,
                 num_gen_rnn=100, num_gen_feature=100, num_dis_layers=1, num_dis_hidden=100,
                 optimizer='AdamOptimizer', comet_ml_key=None, experiment=None, ds=None):
        """Initialize object."""
        # Output
        self.continuous_columns = continuous_columns
        self.log_dir = os.path.join(output, 'logs')
        self.model_dir = os.path.join(output, 'model')
        self.output = output

        # Training params
        self.max_epoch = max_epoch
        self.steps_per_epoch = steps_per_epoch
        self.save_checkpoints = save_checkpoints
        self.restore_session = restore_session

        # Model params
        self.model = None
        self.batch_size = batch_size
        self.z_dim = z_dim
        self.noise = noise
        self.l2norm = l2norm
        self.learning_rate = learning_rate
        self.num_gen_rnn = num_gen_rnn
        self.num_gen_feature = num_gen_feature
        self.num_dis_layers = num_dis_layers
        self.num_dis_hidden = num_dis_hidden
        self.optimizer = optimizer

        if gpu:
            os.environ['CUDA_VISIBLE_DEVICES'] = gpu

        if experiment is not None:
            self.experiment = experiment
        elif comet_ml_key is not None:
            self.comet_ml_key = comet_ml_key
            self.experiment = Experiment(api_key=comet_ml_key, project_name='tgan-wgan-gp',
                                         workspace='baukebrenninkmeijer')
        else:
            # Keep the attribute defined so `fit` can check it safely.
            self.experiment = None

        if ds is not None and self.experiment is not None:
            # The original called `experiment.log_dataset_info`, but `experiment`
            # is None when only `comet_ml_key` is given; use the attribute instead.
            self.experiment.log_dataset_info(name=ds)

        self.gpu = gpu

    def get_model(self, training=True):
        """Return a new instance of the model."""
        return GraphBuilder(
            metadata=self.metadata,
            batch_size=self.batch_size,
            z_dim=self.z_dim,
            noise=self.noise,
            l2norm=self.l2norm,
            learning_rate=self.learning_rate,
            num_gen_rnn=self.num_gen_rnn,
            num_gen_feature=self.num_gen_feature,
            num_dis_layers=self.num_dis_layers,
            num_dis_hidden=self.num_dis_hidden,
            optimizer=self.optimizer,
            training=training,
        )

    def prepare_sampling(self):
        """Prepare the model to generate samples."""
        if self.model is None:
            self.model = self.get_model(training=False)
        else:
            self.model.training = False

        predict_config = PredictConfig(
            session_init=SaverRestore(self.restore_path),
            model=self.model,
            input_names=['z'],
            output_names=['gen/gen', 'z'],
        )

        self.simple_dataset_predictor = SimpleDatasetPredictor(
            predict_config,
            RandomZData((self.batch_size, self.z_dim)))

    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data (pandas.DataFrame): dataset to fit the model.

        Returns:
            None

        """
        self.preprocessor = Preprocessor(continuous_columns=self.continuous_columns)
        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata

        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        trainer = GANTrainer(
            model=self.model,
            input_queue=input_queue,
        )

        self.restore_path = os.path.join(self.model_dir, 'checkpoint')

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, 'stats.json')) as f:
                starting_epoch = json.load(f)[-1]['epoch_num'] + 1
        else:
            session_init = None
            starting_epoch = 1

        action = 'k' if self.restore_session else None
        # logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        monitors = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))
            callbacks.append(MergeAllSummaries(period=10))

        if self.experiment is not None:
            monitors.append(CometMLMonitor(experiment=self.experiment))

        trainer.train_with_defaults(
            callbacks=callbacks,
            monitors=monitors,
            steps_per_epoch=self.steps_per_epoch,
            max_epoch=self.max_epoch,
            session_init=session_init,
            starting_epoch=starting_epoch,
        )

        self.prepare_sampling()

    def sample(self, num_samples):
        """Generate samples from the fitted model.

        Args:
            num_samples (int): Number of rows to generate.

        Returns:
            pandas.DataFrame: Generated samples.

        Raises:
            ValueError: If an unknown column type is found in the metadata.

        """
        max_iters = (num_samples // self.batch_size)

        results = []
        for idx, o in enumerate(self.simple_dataset_predictor.get_result()):
            results.append(o[0])
            if idx + 1 == max_iters:
                break

        results = np.concatenate(results, axis=0)

        ptr = 0
        features = {}
        for col_id, col_info in enumerate(self.metadata['details']):
            if col_info['type'] == 'category':
                features['f%02d' % col_id] = results[:, ptr:ptr + 1]
                ptr += 1

            elif col_info['type'] == 'value':
                gaussian_components = col_info['n']
                val = results[:, ptr:ptr + 1]
                ptr += 1
                pro = results[:, ptr:ptr + gaussian_components]
                ptr += gaussian_components
                features['f%02d' % col_id] = np.concatenate([val, pro], axis=1)

            else:
                raise ValueError(
                    "self.metadata['details'][{}]['type'] must be either `category` or "
                    "`value`. Instead it was {}.".format(col_id, col_info['type']))
        return self.preprocessor.reverse_transform(features)[:num_samples].copy()

    def tar_folder(self, tar_name):
        """Generate a gzipped tar of :attr:`self.output`."""
        with tarfile.open(tar_name, 'w:gz') as tar_handle:
            for root, dirs, files in os.walk(self.output):
                for file_ in files:
                    tar_handle.add(os.path.join(root, file_))

    @classmethod
    def load(cls, path):
        """Load a pretrained model from a given path."""
        with tarfile.open(path, 'r:gz') as tar_handle:
            destination_dir = os.path.dirname(tar_handle.getmembers()[0].name)
            tar_handle.extractall()

        with open('{}/TGANModel'.format(destination_dir), 'rb') as f:
            instance = pickle.load(f)

        instance.prepare_sampling()
        return instance

    def save(self, path, force=False):
        """Save the fitted model in the given path."""
        if os.path.exists(path) and not force:
            logger.info('The indicated path already exists. Use `force=True` to overwrite.')
            return

        base_path = os.path.dirname(path)
        if not os.path.exists(base_path):
            os.makedirs(base_path)

        model = self.model
        dataset_predictor = self.simple_dataset_predictor

        self.model = None
        self.simple_dataset_predictor = None

        with open('{}/TGANModel'.format(self.output), 'wb') as f:
            pickle.dump(self, f)

        self.model = model
        self.simple_dataset_predictor = dataset_predictor

        self.tar_folder(path)

        logger.info('Model saved successfully.')
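# Sketch of a fit/sample/save/load round trip with the class above (the CSV
# path, output path, and column index are hypothetical):
#
#     import pandas as pd
#
#     model = TGANModel(continuous_columns=[0])
#     model.fit(pd.read_csv('example.csv'))
#     samples = model.sample(1000)     # pandas.DataFrame of generated rows
#     model.save('demo/model.tar.gz')  # pickles the model and tars `output`
#     restored = TGANModel.load('demo/model.tar.gz')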
class TGANModel:
    """Main model from TGAN.

    Args:
        continuous_columns (list[int]): 0-indexed list of the column indices to be considered
            continuous.
        output (str, optional): Path to store the model and its artifacts. Defaults to
            :attr:`output`.
        gpus (list[int], optional): List of GPU(s) to use. If not given, the locally available
            GPUs are autodetected. Defaults to :attr:`None`.
        max_epoch (int, optional): Number of epochs to use during training. Defaults to :attr:`5`.
        steps_per_epoch (int, optional): Number of steps to run on each epoch. Defaults to
            :attr:`10000`.
        save_checkpoints (bool, optional): Whether or not to store checkpoints of the model after
            each training epoch. Defaults to :attr:`True`.
        restore_session (bool, optional): Whether or not to continue training from the last
            checkpoint. Defaults to :attr:`True`.
        batch_size (int, optional): Size of the batch to feed the model at each step. Defaults to
            :attr:`200`.
        z_dim (int, optional): Number of dimensions in the noise input for the generator.
            Defaults to :attr:`200`.
        noise (float, optional): Upper bound to the gaussian noise added to categorical columns.
            Defaults to :attr:`0.2`.
        l2norm (float, optional): L2 regularization coefficient when computing losses. Defaults
            to :attr:`0.00001`.
        learning_rate (float, optional): Learning rate for the optimizer. Defaults to
            :attr:`0.001`.
        num_gen_rnn (int, optional): Defaults to :attr:`100`.
        num_gen_feature (int, optional): Number of features in the generator. Defaults to
            :attr:`100`.
        num_dis_layers (int, optional): Defaults to :attr:`1`.
        num_dis_hidden (int, optional): Defaults to :attr:`100`.
        optimizer (str, optional): Name of the optimizer to use during `fit`. Possible values
            are: [`GradientDescentOptimizer`, `AdamOptimizer`, `AdadeltaOptimizer`].
            Defaults to :attr:`AdamOptimizer`.

    """

    def __init__(self, continuous_columns, output="output", gpus=None, max_epoch=5,
                 steps_per_epoch=10000, save_checkpoints=True, restore_session=True,
                 batch_size=200, z_dim=200, noise=0.2, l2norm=0.00001, learning_rate=0.001,
                 num_gen_rnn=100, num_gen_feature=100, num_dis_layers=1, num_dis_hidden=100,
                 optimizer="AdamOptimizer"):
        """Initialize object."""
        # Output
        self.continuous_columns = continuous_columns
        self.log_dir = os.path.join(output, "logs")
        self.model_dir = os.path.join(output, "model")
        self.output = output

        # Training params
        self.max_epoch = max_epoch
        self.steps_per_epoch = steps_per_epoch
        self.save_checkpoints = save_checkpoints
        self.restore_session = restore_session

        # Model params
        self.model = None
        self.batch_size = batch_size
        self.z_dim = z_dim
        self.noise = noise
        self.l2norm = l2norm
        self.learning_rate = learning_rate
        self.num_gen_rnn = num_gen_rnn
        self.num_gen_feature = num_gen_feature
        self.num_dis_layers = num_dis_layers
        self.num_dis_hidden = num_dis_hidden
        self.optimizer = optimizer

        self.gpus = gpus or self.get_gpus()
        if self.gpus:
            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, self.gpus))

    def get_gpus(self):
        """Return the identifiers of the locally available GPUs."""
        return [
            x.locality.bus_id
            for x in device_lib.list_local_devices()
            if x.device_type == "GPU"
        ]

    def get_model(self, training=True):
        """Return a new instance of the model."""
        return GraphBuilder(
            metadata=self.metadata,
            batch_size=self.batch_size,
            z_dim=self.z_dim,
            noise=self.noise,
            l2norm=self.l2norm,
            learning_rate=self.learning_rate,
            num_gen_rnn=self.num_gen_rnn,
            num_gen_feature=self.num_gen_feature,
            num_dis_layers=self.num_dis_layers,
            num_dis_hidden=self.num_dis_hidden,
            optimizer=self.optimizer,
            training=training,
        )

    def prepare_sampling(self):
        """Prepare the model to generate samples."""
        if self.model is None:
            self.model = self.get_model(training=False)
        else:
            self.model.training = False

        predict_config = PredictConfig(
            session_init=SaverRestore(self.restore_path),
            model=self.model,
            input_names=["z"],
            output_names=["gen/gen", "z"],
        )

        self.simple_dataset_predictor = SimpleDatasetPredictor(
            predict_config,
            RandomZData((self.batch_size, self.z_dim)))

    def get_trainer(self, model, input_queue):
        """Return a GAN trainer, using multiple GPUs when more than one is available."""
        if len(self.gpus) > 1:
            # The original passed only `self.gpus`, which discards the model
            # and input queue; assuming a signature of the form
            # MultiGPUGANTrainer(num_gpu, model, input_queue), following the
            # GANTrainer convention below.
            return MultiGPUGANTrainer(len(self.gpus), model, input_queue)

        return GANTrainer(
            model=model,
            input_queue=input_queue,
        )

    def fit(self, data):
        """Fit the model to the given data.

        Args:
            data (pandas.DataFrame): dataset to fit the model.

        Returns:
            None

        """
        self.preprocessor = Preprocessor(continuous_columns=self.continuous_columns)
        data = self.preprocessor.fit_transform(data)
        self.metadata = self.preprocessor.metadata

        dataflow = TGANDataFlow(data, self.metadata)
        batch_data = BatchData(dataflow, self.batch_size)
        input_queue = QueueInput(batch_data)

        self.model = self.get_model(training=True)

        trainer = self.get_trainer(
            model=self.model,
            input_queue=input_queue,
        )

        self.restore_path = os.path.join(self.model_dir, "checkpoint")

        if os.path.isfile(self.restore_path) and self.restore_session:
            session_init = SaverRestore(self.restore_path)
            with open(os.path.join(self.log_dir, "stats.json")) as f:
                starting_epoch = json.load(f)[-1]["epoch_num"] + 1
        else:
            session_init = None
            starting_epoch = 1

        action = "k" if self.restore_session else None
        logger.set_logger_dir(self.log_dir, action=action)

        callbacks = []
        if self.save_checkpoints:
            callbacks.append(ModelSaver(checkpoint_dir=self.model_dir))

        trainer.train_with_defaults(
            callbacks=callbacks,
            steps_per_epoch=self.steps_per_epoch,
            max_epoch=self.max_epoch,
            session_init=session_init,
            starting_epoch=starting_epoch,
        )

        self.prepare_sampling()

    def sample(self, num_samples):
        """Generate samples from the fitted model.

        Args:
            num_samples (int): Number of rows to generate.

        Returns:
            pandas.DataFrame: Generated samples.

        Raises:
            ValueError: If an unknown column type is found in the metadata.

        """
        max_iters = (num_samples // self.batch_size)

        results = []
        for idx, o in enumerate(self.simple_dataset_predictor.get_result()):
            results.append(o[0])
            if idx + 1 == max_iters:
                break

        results = np.concatenate(results, axis=0)

        ptr = 0
        features = {}
        for col_id, col_info in enumerate(self.metadata["details"]):
            if col_info["type"] == "category":
                features["f%02d" % col_id] = results[:, ptr:ptr + 1]
                ptr += 1

            elif col_info["type"] == "value":
                gaussian_components = col_info["n"]
                val = results[:, ptr:ptr + 1]
                ptr += 1
                pro = results[:, ptr:ptr + gaussian_components]
                ptr += gaussian_components
                features["f%02d" % col_id] = np.concatenate([val, pro], axis=1)

            else:
                raise ValueError(
                    "self.metadata['details'][{}]['type'] must be either `category` or "
                    "`value`. Instead it was {}.".format(col_id, col_info["type"]))
        return self.preprocessor.reverse_transform(features)[:num_samples].copy()

    def tar_folder(self, tar_name):
        """Generate a gzipped tar of :attr:`self.output`."""
        with tarfile.open(tar_name, "w:gz") as tar_handle:
            for root, dirs, files in os.walk(self.output):
                for file_ in files:
                    tar_handle.add(os.path.join(root, file_))

    @classmethod
    def load(cls, path):
        """Load a pretrained model from a given path."""
        with tarfile.open(path, "r:gz") as tar_handle:
            destination_dir = os.path.dirname(tar_handle.getmembers()[0].name)
            tar_handle.extractall()

        with open("{}/TGANModel".format(destination_dir), "rb") as f:
            instance = pickle.load(f)

        instance.prepare_sampling()
        return instance

    def save(self, path, force=False):
        """Save the fitted model in the given path."""
        if os.path.exists(path) and not force:
            logger.info("The indicated path already exists. Use `force=True` to overwrite.")
            return

        base_path = os.path.dirname(path)
        if not os.path.exists(base_path):
            os.makedirs(base_path)

        model = self.model
        dataset_predictor = self.simple_dataset_predictor

        self.model = None
        self.simple_dataset_predictor = None

        with open("{}/TGANModel".format(self.output), "wb") as f:
            pickle.dump(self, f)

        self.model = model
        self.simple_dataset_predictor = dataset_predictor

        self.tar_folder(path)

        logger.info("Model saved successfully.")
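# GPU selection sketch for the class above (assumes `device_lib` is imported
# from tensorflow.python.client; the GPU indices are hypothetical):
#
#     model = TGANModel(continuous_columns=[0], gpus=[0, 1])
#     # CUDA_VISIBLE_DEVICES is set to "0,1" and get_trainer() returns the
#     # multi-GPU trainer.
#
#     model = TGANModel(continuous_columns=[0])
#     # get_gpus() autodetects local GPUs; with zero or one GPU,
#     # get_trainer() falls back to the single-GPU GANTrainer.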