def test_data_manager_copy(self):
    """A deep-copied DataManager must compare equal to its source and keep the same transform callables."""
    def train_data_xform(x):
        return x * x

    def train_label_xform(y):
        return y * y * y

    def test_data_xform(x):
        return x ** 2

    def test_label_xform(y):
        return y + 2

    original = tpmd.DataManager(self.experiment_path, self.train_file, self.clean_test_file,
                                triggered_test_file=self.triggered_file,
                                train_data_transform=train_data_xform,
                                train_label_transform=train_label_xform,
                                test_data_transform=test_data_xform,
                                test_label_transform=test_label_xform,
                                file_loader='image',
                                shuffle_train=True,
                                shuffle_clean_test=False,
                                shuffle_triggered_test=False)
    duplicate = copy.deepcopy(original)

    self.assertEqual(original, duplicate)
    # both the source and its copy must reference the very same transform callables
    for mgr in (original, duplicate):
        self.assertEqual(train_data_xform, mgr.train_data_transform)
        self.assertEqual(train_label_xform, mgr.train_label_transform)
        self.assertEqual(test_data_xform, mgr.test_data_transform)
        self.assertEqual(test_label_xform, mgr.test_label_transform)
def test_model_generator_config_copy(self):
    """A deep-copied ModelGeneratorConfig must compare equal to its source."""
    class MyArchFactory(tpmaf.ArchitectureFactory):
        def new_architecture(self):
            return tpma.ModdedLeNet5Net(channels=1)

    arch = MyArchFactory()

    # setup the xforms to ensure we can test the callables
    def data_xform(x):
        return x * x

    def label_xform(y):
        return y * y * y

    data = tpmd.DataManager(self.experiment_path, self.train_file, self.clean_test_file,
                            triggered_test_file=self.triggered_file,
                            data_transform=data_xform,
                            label_transform=label_xform,
                            file_loader='image',
                            shuffle_train=True,
                            shuffle_clean_test=False,
                            shuffle_triggered_test=False)

    num_models = 1
    cfg_a = tpmc.ModelGeneratorConfig(arch, data, self.model_save_dir,
                                      self.stats_save_dir, num_models)
    cfg_b = copy.deepcopy(cfg_a)
    self.assertEqual(cfg_a, cfg_b)
def train_and_save_mnist_model(experiment_path, clean_train, triggered_train, clean_test,
                               triggered_test, model_save_dir, parallel, use_gpu):
    """Sequentially train a clean BadNets MNIST model and then a triggered one, saving the result.

    :param experiment_path: root path of the experiment data
    :param clean_train: clean training data file
    :param triggered_train: triggered training data file
    :param clean_test: clean test data file
    :param triggered_test: triggered test data file
    :param model_save_dir: directory where the model and stats are written
    :param parallel: (bool) forwarded to the RunnerConfig
    :param use_gpu: (bool) train on CUDA when True, otherwise CPU
    """
    logger.info("Training Model...")

    def img_transform(x):
        # add a channel dimension: HxW -> 1xHxW
        return x.unsqueeze(0)

    logging_params = {
        'num_batches_per_logmsg': 500,
        'tensorboard_output_dir': 'tensorboard_dir/',
        'experiment_name': 'badnets',
        'num_batches_per_metrics': 500,
        'num_batches_ver_val_dataset_metrics': None,
        'num_epochs_per_metric': 10,
    }
    logging_cfg = tpmc.ReportingConfig(
        num_batches_per_logmsg=logging_params['num_batches_per_logmsg'],
        tensorboard_output_dir=logging_params['tensorboard_output_dir'],
        experiment_name=logging_params['experiment_name'],
        num_batches_per_metrics=logging_params['num_batches_per_metrics'],
        num_batches_ver_val_dataset_metrics=logging_params['num_batches_ver_val_dataset_metrics'],
        num_epochs_per_metric=logging_params['num_epochs_per_metric'])

    device = torch.device('cuda' if use_gpu else 'cpu')
    data_obj = tpm_tdm.DataManager(experiment_path,
                                   [clean_train, triggered_train],
                                   clean_test,
                                   triggered_test_file=triggered_test,
                                   data_transform=img_transform,
                                   shuffle_train=True)

    class MyArchFactory(tpm_af.ArchitectureFactory):
        def new_architecture(self):
            return tpma.BadNetExample()

    # Train clean model to use as a base for triggered model
    clean_training_cfg = tpmc.TrainingConfig(device=device, epochs=10, batch_size=100, lr=1e-4)
    clean_optim = tpm_do.DefaultOptimizer(
        tpmc.DefaultOptimizerConfig(clean_training_cfg, logging_cfg))
    triggered_training_cfg = tpmc.TrainingConfig(device=device, epochs=200, batch_size=15, lr=1e-4)
    triggered_optim = tpm_do.DefaultOptimizer(
        tpmc.DefaultOptimizerConfig(triggered_training_cfg, logging_cfg))

    cfg = tpmc.RunnerConfig(MyArchFactory(), data_obj,
                            optimizer=[clean_optim, triggered_optim],
                            model_save_dir=model_save_dir,
                            stats_save_dir=model_save_dir,
                            filename='BadNets_0.2_poison_sequential.pt',
                            parallel=parallel)
    runner = tpmr.Runner(cfg, {'script': 'gen_and_train_mnist_sequential.py'})
    runner.run()
def train_and_save_mnist_model(experiment_path, triggered_train, clean_test, triggered_test,
                               model_save_dir, parallel, use_gpu):
    """Train a ModdedLeNet5 MNIST model on triggered data (with early stopping) and save it.

    :param experiment_path: root path of the experiment data
    :param triggered_train: triggered training data file
    :param clean_test: clean test data file
    :param triggered_test: triggered test data file
    :param model_save_dir: directory where the model and stats are written
    :param parallel: (bool) forwarded to the RunnerConfig
    :param use_gpu: (bool) train on CUDA when True, otherwise CPU
    """
    logger.info("Training Model...")

    def img_transform(x):
        # add a channel dimension: HxW -> 1xHxW
        return x.unsqueeze(0)

    logging_params = {
        'num_batches_per_logmsg': 500,
        'tensorboard_output_dir': 'tensorboard_dir/',
        'experiment_name': 'badnets',
        'num_batches_per_metrics': 500,
        'num_epochs_per_metric': 10,
    }
    logging_cfg = tpmc.ReportingConfig(
        num_batches_per_logmsg=logging_params['num_batches_per_logmsg'],
        tensorboard_output_dir=logging_params['tensorboard_output_dir'],
        experiment_name=logging_params['experiment_name'],
        num_batches_per_metrics=logging_params['num_batches_per_metrics'],
        num_epochs_per_metric=logging_params['num_epochs_per_metric'])

    # Train clean model to use as a base for triggered model
    device = torch.device('cuda' if use_gpu else 'cpu')
    # dedicate 80% of the available cores to dataloader workers
    worker_count = int(.8 * multiprocessing.cpu_count())
    data_obj = tpm_tdm.DataManager(experiment_path,
                                   triggered_train,
                                   clean_test,
                                   triggered_test_file=triggered_test,
                                   train_data_transform=img_transform,
                                   test_data_transform=img_transform,
                                   shuffle_train=True,
                                   train_dataloader_kwargs={'num_workers': worker_count})

    class MyArchFactory(tpm_af.ArchitectureFactory):
        def new_architecture(self):
            return tpma.ModdedLeNet5Net()

    training_cfg = tpmc.TrainingConfig(device=device, epochs=300, batch_size=20, lr=1e-4,
                                       early_stopping=tpmc.EarlyStoppingConfig())
    optim = tpm_do.DefaultOptimizer(tpmc.DefaultOptimizerConfig(training_cfg, logging_cfg))

    cfg = tpmc.RunnerConfig(MyArchFactory(), data_obj,
                            optimizer=optim,
                            model_save_dir=model_save_dir,
                            stats_save_dir=model_save_dir,
                            filename='ModdedLeNet5_0.2_poison.pt',
                            parallel=parallel)
    runner = tpmr.Runner(cfg, {'script': 'gen_and_train_mnist.py'})
    runner.run()
def test_data_manager_eq(self, f):
    """DataManager equality: a differing string field or a differing callable breaks equality."""
    exp1 = '/tmp/experiment1'
    train1 = '/tmp/train1'
    clean1 = '/tmp/clean1'
    trig1 = '/tmp/trig1'
    data_type = 'image'
    data_loader = 'image'
    shuffle_train = shuffle_clean_test = shuffle_triggered_test = True

    identity_data = (lambda x: x)
    identity_label = (lambda y: y)
    squared_data = (lambda x: x * x)
    doubled_label = (lambda y: y + y)
    # functionally identical to identity_data, but a distinct function object
    identity_data_clone = (lambda x: x)

    def make_dm(train=train1, dxf=identity_data, lxf=identity_label):
        # all other constructor arguments are held fixed so each instance varies in one field
        return tpmdm.DataManager(exp1, train, clean1, trig1, data_type, dxf, lxf,
                                 data_loader, shuffle_train, shuffle_clean_test,
                                 shuffle_triggered_test)

    dm1 = make_dm()
    dm2 = make_dm()
    dm3 = make_dm(train='/tmp/train2')       # test string comparison difference
    dm4 = make_dm(dxf=squared_data)          # test callable comparison difference
    dm5 = make_dm(lxf=doubled_label)         # test callable comparison difference
    # NOTE: dm6 is constructed but not asserted against below
    dm6 = make_dm(dxf=identity_data_clone)

    self.assertEqual(dm1, dm2)
    self.assertNotEqual(dm1, dm3)
    self.assertNotEqual(dm1, dm4)
    self.assertNotEqual(dm1, dm5)
# NOTE(review): script fragment — the for-loop body appears to continue beyond this chunk,
# so only this visible portion is documented here.
num_avail_cpus = multiprocessing.cpu_count()
# use 80% of the available cores for the dataloader workers
num_cpus_to_use = int(.8 * num_avail_cpus)
modelgen_cfgs = []
for i in range(len(experiment_list)):
    experiment_cfg = experiment_list[i]
    experiment_name = experiment_name_list[i]
    logger.debug(experiment_name)
    # one DataManager per experiment; the same image transform is applied to both the
    # training and test data
    data_obj = tpm_tdm.DataManager(my_experiment_path,
                                   experiment_cfg['train_file'],
                                   experiment_cfg['clean_test_file'],
                                   triggered_test_file=experiment_cfg['triggered_test_file'],
                                   train_data_transform=img_transform,
                                   test_data_transform=img_transform,
                                   shuffle_train=True,
                                   train_dataloader_kwargs={'num_workers': num_cpus_to_use})
    model_save_dir = os.path.join(model_save_root_dir, experiment_cfg['model_save_dir'])
    stats_save_dir = os.path.join(model_save_root_dir, experiment_cfg['stats_save_dir'])
    num_models = 1
    # fall back to CPU unless CUDA is available AND the user asked for the GPU
    device = torch.device('cuda' if torch.cuda.is_available() and a.gpu else 'cpu')
    # on CPU, skip periodic validation-dataset metrics entirely; on GPU compute them every
    # 500 batches
    default_nbpvdm = None if device.type == 'cpu' else 500
def train_models(top_dir, data_folder, experiment_folder, experiment_list, model_save_folder,
                 stats_save_folder, early_stopping, train_val_split, tensorboard_dir, gpu,
                 uge, uge_dir):
    """
    Given paths to the experiments and specifications to where models and model statistics should
    be saved, create triggered models for each experiment in the experiment directory.
    :param top_dir: (str) path to top level directory for text classification data and models are to be stored
    :param data_folder: (str) name of folder containing the experiments folder
    :param experiment_folder: (str) name of folder containing the experiments used to generate models
    :param experiment_list: (list) per-experiment config dicts; one ModelGeneratorConfig is built per entry
    :param model_save_folder: (str) name of folder under which models are to be saved
    :param stats_save_folder: (str) name of folder under which model training information is to be saved
    :param early_stopping: (bool) enable an EarlyStoppingConfig in the TrainingConfig when True
    :param train_val_split: train/validation split forwarded to the TrainingConfig
    :param tensorboard_dir: (str) name of folder under which tensorboard information is to be saved
    :param gpu: (bool) use a gpu in training
    :param uge: (bool) use a Univa Grid Engine (UGE) to generate models
    :param uge_dir: (str) working directory for UGE models
    :return: None
    """
    class MyArchFactory(tpm_af.ArchitectureFactory):
        def new_architecture(self, input_dim=25000, embedding_dim=100, hidden_dim=256,
                             output_dim=1, n_layers=2, bidirectional=True, dropout=0.5,
                             pad_idx=-999):
            return tpta.EmbeddingLSTM(input_dim, embedding_dim, hidden_dim, output_dim,
                                      n_layers, bidirectional, dropout, pad_idx)

    def arch_factory_kwargs_generator(train_dataset_desc, clean_test_dataset_desc,
                                      triggered_test_dataset_desc):
        # Note: the arch_factory_kwargs_generator returns a dictionary, which is used as kwargs
        # input into an architecture factory.  Here, we allow the input-dimension and the pad-idx
        # to be set when the model gets instantiated.  This is useful because these indices and
        # the vocabulary size are not known until the vocabulary is built.
        output_dict = dict(input_dim=train_dataset_desc.vocab_size,
                           pad_idx=train_dataset_desc.pad_idx)
        return output_dict

    # get all available experiments from the experiment root directory
    experiment_path = os.path.join(top_dir, data_folder, experiment_folder)

    modelgen_cfgs = []
    arch_factory_kwargs = dict(
        input_dim=25000,
        embedding_dim=100,
        hidden_dim=256,
        output_dim=1,
        n_layers=2,
        bidirectional=True,
        dropout=0.5
    )

    for i in range(len(experiment_list)):
        experiment_cfg = experiment_list[i]

        data_obj = dm.DataManager(experiment_path,
                                  experiment_cfg['train_file'],
                                  experiment_cfg['clean_test_file'],
                                  data_type='text',
                                  triggered_test_file=experiment_cfg['triggered_test_file'],
                                  shuffle_train=True,
                                  data_configuration=dc.TextDataConfiguration(
                                      max_vocab_size=arch_factory_kwargs['input_dim'],
                                      embedding_dim=arch_factory_kwargs['embedding_dim']))
        num_models = 5

        if uge:
            # on a grid engine the device is chosen by the requested queue, not by local probing
            device = torch.device('cuda') if gpu else torch.device('cpu')
        else:
            device = torch.device('cuda' if torch.cuda.is_available() and gpu else 'cpu')
        # on CPU, skip periodic validation-dataset metrics; on GPU compute them every 500 batches
        default_nbpvdm = None if device.type == 'cpu' else 500

        early_stopping_argin = tpmc.EarlyStoppingConfig() if early_stopping else None
        training_params = tpmc.TrainingConfig(device=device, epochs=10, batch_size=64, lr=1e-3,
                                              optim='adam', objective='BCEWithLogitsLoss',
                                              early_stopping=early_stopping_argin,
                                              train_val_split=train_val_split)
        reporting_params = tpmc.ReportingConfig(num_batches_per_logmsg=100,
                                                num_epochs_per_metric=1,
                                                num_batches_per_metrics=default_nbpvdm,
                                                tensorboard_output_dir=tensorboard_dir,
                                                experiment_name=experiment_cfg['name'])
        lstm_optimizer_config = tpmc.TorchTextOptimizerConfig(training_cfg=training_params,
                                                              reporting_cfg=reporting_params,
                                                              copy_pretrained_embeddings=True)
        optimizer = tptto.TorchTextOptimizer(lstm_optimizer_config)

        # There seem to be some issues w/ using the DataParallel w/ RNN's (hence, parallel=False).
        # See here:
        #  - https://discuss.pytorch.org/t/pack-padded-sequence-with-multiple-gpus/33458
        #  - https://pytorch.org/docs/master/notes/faq.html#pack-rnn-unpack-with-data-parallelism
        #  - https://github.com/pytorch/pytorch/issues/10537
        # Although these issues are "old," the solutions provided in these forums haven't yet
        # worked for me to try to resolve the data batching error.  For now, we suffice to using
        # the single GPU version.
        cfg = tpmc.ModelGeneratorConfig(MyArchFactory(), data_obj,
                                        model_save_folder, stats_save_folder, num_models,
                                        arch_factory_kwargs=arch_factory_kwargs,
                                        arch_factory_kwargs_generator=arch_factory_kwargs_generator,
                                        optimizer=optimizer,
                                        experiment_cfg=experiment_cfg,
                                        parallel=False,
                                        save_with_hash=True)
        # may also provide lists of run_ids or filenames as arguments to ModelGeneratorConfig to
        # have more control of saved model file names; see RunnerConfig and ModelGeneratorConfig
        # for more information
        modelgen_cfgs.append(cfg)

    if uge:
        if gpu:
            q1 = tpmc.UGEQueueConfig("gpu-k40.q", True)
            q2 = tpmc.UGEQueueConfig("gpu-v100.q", True)
            q_cfg = tpmc.UGEConfig([q1, q2], queue_distribution=None)
        else:
            q1 = tpmc.UGEQueueConfig("htc.q", False)
            q_cfg = tpmc.UGEConfig(q1, queue_distribution=None)
        working_dir = uge_dir
        # start from a clean working directory; ignore the error if it doesn't exist yet
        try:
            shutil.rmtree(working_dir)
        except IOError:
            pass
        model_generator = ugemg.UGEModelGenerator(modelgen_cfgs, q_cfg,
                                                  working_directory=working_dir)
    else:
        model_generator = mg.ModelGenerator(modelgen_cfgs)

    start = time.time()
    model_generator.run()
    # BUGFIX: logger.debug was previously called print-style with multiple positional
    # arguments ("Time to run: ", elapsed, 'hours'); since the message contains no %-style
    # placeholders, the logging module raises a formatting error when the record is emitted.
    # Use lazy %-formatting with a single argument instead.
    logger.debug("Time to run: %s hours", (time.time() - start) / 60 / 60)
def test_data_manager_eq(self, f):
    """DataManager equality across string, callable, and dataloader-kwargs differences."""
    exp1 = '/tmp/experiment1'
    train1 = '/tmp/train1'
    clean1 = '/tmp/clean1'
    trig1 = '/tmp/trig1'
    data_type = 'image'
    data_loader = 'image'
    shuffle_train = shuffle_clean_test = shuffle_triggered_test = True

    identity_data = (lambda x: x)
    identity_label = (lambda y: y)
    squared_data = (lambda x: x * x)
    doubled_label = (lambda y: y + y)
    # functionally identical to identity_data, but a distinct function object
    identity_data_clone = (lambda x: x)

    train_kwargs_a = {"a": 1}
    train_kwargs_b = {"b": 2, "c": 3}
    test_kwargs_a = {"d": 4}
    test_kwargs_b = {"e": 5}

    def make_dm(train=train1, dxf=identity_data, lxf=identity_label,
                train_kwargs=train_kwargs_a, test_kwargs=test_kwargs_a):
        # all other constructor arguments are held fixed so each instance varies in one field
        return tpmdm.DataManager(exp1, train, clean1, trig1, data_type, dxf, lxf,
                                 data_loader, shuffle_train, shuffle_clean_test,
                                 shuffle_triggered_test, train_kwargs, test_kwargs)

    dm1 = make_dm()
    dm2 = make_dm()
    dm3 = make_dm(train='/tmp/train2')       # test string comparison difference
    dm4 = make_dm(dxf=squared_data)          # test callable comparison difference
    dm5 = make_dm(lxf=doubled_label)         # test callable comparison difference
    dm6 = make_dm(dxf=identity_data_clone)
    # test different dataloader kwargs
    dm7 = make_dm(train_kwargs=train_kwargs_b, test_kwargs=test_kwargs_b)
    dm8 = make_dm(test_kwargs=test_kwargs_b)

    self.assertEqual(dm1, dm2)
    self.assertNotEqual(dm1, dm3)
    self.assertNotEqual(dm1, dm4)
    self.assertNotEqual(dm1, dm5)
    # NOTE: this fails because the lambda functions are loaded in different locations in memory,
    # even though they are functionally equivalent. I'm not sure how to resolve this
    # self.assertEqual(dm1, dm6)
    self.assertNotEqual(dm1, dm7)
    self.assertNotEqual(dm1, dm8)