def main():
    """Train a CNN, evaluate on the test split, and save the two scalar results."""
    print()
    # Pick the compute device; additional GPUs would be "cuda:1", "cuda:2", etc.
    if torch.cuda.is_available():
        DEVICE = torch.device("cuda:0")
        print("Running on the GPU")
    else:
        DEVICE = torch.device("cpu")
        print("Running on the CPU")

    model = CNN().to(DEVICE)
    train_loader, valid_loader, test_loader = get_data_loaders()

    # Fit model on the train/validation pair.
    model, train_history, _, best_epoch = fit(model=model,
                                              data=(train_loader, valid_loader),
                                              device=DEVICE)

    # Final evaluation on the held-out test set.
    test_loss, test_acc = eval_model(model, test_loader, DEVICE)
    loss_value = test_loss.item()
    print('\nTest loss: {:.3f} |'.format(loss_value) + ' Test Acc: {:.3f}'.format(test_acc))

    # Persist [loss, accuracy] for later inspection.
    results_test = [loss_value, test_acc]
    np.savetxt('results.txt', results_test, fmt='%.3f', delimiter=',')
    print("\n\nDONE!")
def main(args):
    """Train fear extinction CNN."""
    job_start = time.time()
    rule = '-' * 80

    print(rule)
    log_args(args)
    print(rule)

    # NOTE(review): `device` is not defined in this function — presumably a
    # module-level global; confirm it exists before calling.
    model = load_model(args.model, args.pretrained).to(device)
    print('Model loaded.')

    # Materialize per-mouse train/test splits on disk, then build loaders.
    split_across(args.mouse_num, args.datadir, args.mIDs)
    print('Train / test sets created (if necessary).')
    train_loader, test_loader = get_data_loaders(args.mouse_num, args.augment,
                                                 args.model, args.batch_size,
                                                 args.num_workers,
                                                 args.pin_memory, args.datadir)
    print('Data loaded.')

    print('Training.')
    print(rule)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Metrics CSV path is constant across epochs, so compute it once.
    out_file = args.name + '_pretrained' + str(int(args.pretrained == True)) + '.csv'
    out_path = os.path.join(args.directory, out_file)

    for epoch in range(1, args.n_epochs + 1):
        train_loss = train(model, criterion, optimizer, train_loader)
        test_loss, pct_correct, f0, f1, t0, t1 = test(model, criterion, test_loader)

        # One metrics row per epoch: echo tab-separated, append comma-separated.
        row = [epoch, round(train_loss, 2), round(test_loss, 2),
               pct_correct, f0, f1, t0, t1]
        print('\t'.join(str(v) for v in row))
        with open(out_path, 'a+') as outfile:
            outfile.write(','.join(str(v) for v in row) + '\n')

        # Periodic checkpoint.
        if epoch % SAVE_MODEL_EVERY == 0:
            name = args.name + '_epoch%s' % (str(epoch))
            save_model(args.directory, model, name)

    hours = round((time.time() - job_start) / 3600, 1)
    print('Job complete in %s hrs.' % hours)
    save_model(args.directory, model, args.name)
    print('Model saved.')
def __init__(self, config, args):
    """Build the semi-supervised GAN trainer.

    Merges command-line ``args`` into ``config``, prints the effective
    configuration, constructs the discriminator/generator/encoder with their
    Adam optimizers, and opens a per-run log file.

    Args:
        config: base configuration object (mutated in place with arg values).
        args: parsed command-line namespace; every attribute overrides config.
    """
    self.config = config
    # Command-line arguments override static config values.
    for k, v in args.__dict__.items():
        setattr(self.config, k, v)
    setattr(self.config, 'save_dir', '{}_log'.format(self.config.dataset))

    # Dump the full effective configuration (sorted by attribute-name length).
    disp_str = ''
    for attr in sorted(dir(self.config), key=lambda x: len(x)):
        if not attr.startswith('__'):
            disp_str += '{} : {}\n'.format(attr, getattr(self.config, attr))
    sys.stdout.write(disp_str)
    sys.stdout.flush()

    self.labeled_loader, self.unlabeled_loader, self.dev_loader, self.special_set = \
        data.get_data_loaders(config)

    self.dis = model.Discriminative(config).cuda()
    self.gen = model.Generator(image_side=config.image_side,
                               noise_size=config.noise_size).cuda()
    self.enc = model.Encoder(config.image_side,
                             noise_size=config.noise_size,
                             output_params=True).cuda()

    self.dis_optimizer = optim.Adam(self.dis.parameters(),
                                    lr=config.dis_lr, betas=(0.5, 0.999))
    self.gen_optimizer = optim.Adam(self.gen.parameters(),
                                    lr=config.gen_lr, betas=(0.0, 0.999))
    self.enc_optimizer = optim.Adam(self.enc.parameters(),
                                    lr=config.enc_lr, betas=(0.0, 0.999))

    self.d_criterion = nn.CrossEntropyLoss()

    if not os.path.exists(self.config.save_dir):
        os.makedirs(self.config.save_dir)
    log_path = os.path.join(
        self.config.save_dir,
        '{}.FM+VI.{}.txt'.format(self.config.dataset, self.config.suffix))
    # BUG FIX: the log was opened in binary mode ('wb') but written str
    # payloads (TypeError on Python 3); open in text mode instead.
    self.logger = open(log_path, 'w')
    self.logger.write(disp_str)

    # BUG FIX: `print self.dis` is Python 2 syntax (SyntaxError on Python 3).
    print(self.dis)
def test_BachNet():
    """Smoke-test the pipeline: one debug training epoch, then compose a score."""
    cfg = training.std_config
    data_loaders = data.get_data_loaders(
        batch_size=cfg.batch_size,
        num_workers=cfg.num_workers,
        time_grid=cfg.time_grid,
        context_radius=cfg.context_radius,
        split=cfg.split,
        debug=True,
        overwrite=True)

    # A single epoch keeps the smoke test fast.
    cfg.num_epochs = 1
    training.train(cfg, data_loaders)

    # Pick the most recent checkpoint directory and its first-epoch file.
    cp_dirname = sorted(glob('checkpoints/*/'))[-1]
    last_subdir = os.path.basename(os.path.normpath(cp_dirname))
    cp_path = '{}{}_epoch=0001.pt'.format(cp_dirname, last_subdir)

    soprano_path = 'data/musicxml/001_soprano.xml'
    score = inference.compose_score(cp_path, soprano_path)
def main(args):
    """Run the full train / validate / final-test experiment for `args`."""
    set_cuda(args)
    set_seed(args)

    loader_train, loader_val, loader_test = get_data_loaders(args)
    loss = get_loss(args)
    model = get_model(args)
    optimizer = get_optimizer(args, parameters=model.parameters())
    xp = get_xp(args, model, optimizer)

    for epoch_idx in range(args.epochs):
        xp.Epoch.update(1).log()
        train(model, loss, optimizer, loader_train, xp, args)
        test(model, loader_val, xp, args)
        # Step-decay the learning rate at the milestones listed in args.T.
        if epoch_idx + 1 in args.T:
            decay_optimizer(optimizer, args.decay_factor)

    # Evaluate the best validation checkpoint on the test split.
    load_best_model(model, xp)
    test(model, loader_test, xp, args)
def main(args):
    """Train with per-epoch validation, then re-test the best checkpoint."""
    set_cuda(args)
    set_seed(args)

    loader_train, loader_val, loader_test = get_data_loaders(args)
    loss = get_loss(args)
    model = get_model(args)
    optimizer = get_optimizer(args, model, loss, parameters=model.parameters())
    xp = setup_xp(args, model, optimizer)

    for epoch in range(args.epochs):
        xp.epoch.update(epoch)
        train(model, loss, optimizer, loader_train, args, xp)
        test(model, optimizer, loader_val, args, xp)
        # Learning-rate step decay at the milestones listed in args.T.
        if epoch + 1 in args.T:
            decay_optimizer(optimizer, args.decay_factor)

    # Reload the best validation checkpoint; report val and test metrics.
    load_best_model(model, '{}/best_model.pkl'.format(args.xp_name))
    test(model, optimizer, loader_val, args, xp)
    test(model, optimizer, loader_test, args, xp)
def tune(cfg):
    """Fine-tune a ResNet on the train split, validating after every epoch."""
    train_loader, valid_loader = get_data_loaders(cfg["data"])
    model = ResNet(cfg)

    batch_size = cfg["data"]["batch_size"]

    def to_device(tensor):
        # Move to the configured GPU only when GPU use is enabled.
        if cfg["GPU"]["enable"]:
            return tensor.cuda(cfg["GPU"]["name"])
        return tensor

    for epoch in range(cfg["tune"]["num_epoch"]):
        # ---- training pass ----
        for step, (images, attr) in enumerate(train_loader):
            attr = attr.type(torch.float)
            images = to_device(images)
            attr = to_device(attr)
            ac, f1, loss = model.step(images, attr)
            print("\r Done: {}/{} acc {} f1 {} loss {}".format(
                step * batch_size,
                len(train_loader) * batch_size,
                ac, f1, loss), end='')
        print()
        model.save(epoch)

        # ---- validation pass ----
        val_lbl = []
        val_pred = []
        model.set_mode("eval")
        for images, attr in valid_loader:
            attr = attr.type(torch.float)
            images = to_device(images)
            attr = to_device(attr)
            pred = model.predict(images)
            val_lbl.append(attr.detach().cpu())
            val_pred.append(pred.detach().cpu())
        ac, f1 = model.metrics(torch.cat(val_lbl, dim=0).numpy(),
                               torch.cat(val_pred, dim=0).numpy())
        print("{} test acc {}, f1 {}".format(epoch + 1, ac, f1))
        model.set_mode("train")
def _get_dataloaders(self):
    """Return the (train, validation) dataloader pair built from self.config."""
    train_dl, val_dl = get_data_loaders(self.config)
    return train_dl, val_dl
def run(args):
    """Benchmark an (optionally encrypted) model with PySyft.

    Depending on flags: trains over args.epochs, runs a full evaluation, or
    times inference. Workers are either remote websocket clients or in-process
    virtual workers; the model and data can be secret-shared via SMPC.
    """
    # Announce which of the three modes this invocation runs.
    if args.train:
        print(f"Training over {args.epochs} epochs")
    elif args.test:
        print("Running a full evaluation")
    else:
        print("Running inference speed test")
    print("model:\t\t", args.model)
    print("dataset:\t", args.dataset)
    print("batch_size:\t", args.batch_size)

    hook = sy.TorchHook(torch)

    # Two compute workers plus a crypto provider, remote or in-process.
    if args.websockets:
        alice = DataCentricFLClient(hook, "ws://localhost:7600")
        bob = DataCentricFLClient(hook, "ws://localhost:7601")
        crypto_provider = DataCentricFLClient(hook, "ws://localhost:7602")
        my_grid = sy.PrivateGridNetwork(alice, bob, crypto_provider)  # NOTE(review): unused afterwards
        sy.local_worker.object_store.garbage_delay = 1
    else:
        bob = sy.VirtualWorker(hook, id="bob")
        alice = sy.VirtualWorker(hook, id="alice")
        crypto_provider = sy.VirtualWorker(hook, id="crypto_provider")

    workers = [alice, bob]
    sy.local_worker.clients = workers

    # Keyword bundle used for both data loading and model encryption.
    encryption_kwargs = dict(workers=workers,
                             crypto_provider=crypto_provider,
                             protocol=args.protocol)
    kwargs = dict(
        requires_grad=args.requires_grad,
        precision_fractional=args.precision_fractional,
        dtype=args.dtype,
        **encryption_kwargs,
    )

    if args.preprocess:
        # [sic] project helper name is spelled "prepocessing".
        build_prepocessing(args.model, args.dataset, args.batch_size, workers, args)

    private_train_loader, private_test_loader = get_data_loaders(args, kwargs, private=True)
    public_train_loader, public_test_loader = get_data_loaders(args, kwargs, private=False)

    model = get_model(args.model, args.dataset,
                      out_features=get_number_classes(args.dataset))

    # Evaluation-only runs start from pretrained weights.
    if args.test and not args.train:
        load_state_dict(model, args.model, args.dataset)

    model.eval()

    if torch.cuda.is_available():
        sy.cuda_force = True

    if not args.public:
        model.encrypt(**kwargs)
        if args.fp_only:  # Just keep the (Autograd+) Fixed Precision feature
            model.get()

    if args.train:
        for epoch in range(args.epochs):
            optimizer = optim.SGD(model.parameters(), lr=args.lr,
                                  momentum=args.momentum)
            if not args.public:
                optimizer = optimizer.fix_precision(
                    precision_fractional=args.precision_fractional,
                    dtype=args.dtype)
            train_time = train(args, model, private_train_loader, optimizer, epoch)
            test_time, accuracy = test(args, model, private_test_loader)
    else:
        test_time, accuracy = test(args, model, private_test_loader)
        if not args.test:
            # Inference-speed mode: report amortized per-sample time.
            print(
                f"{ 'Online' if args.preprocess else 'Total' } time (s):\t",
                round(test_time / args.batch_size, 4),
            )
        else:
            # Compare with clear text accuracy
            print("Clear text accuracy is:")
            model = get_model(args.model, args.dataset,
                              out_features=get_number_classes(args.dataset))
            load_state_dict(model, args.model, args.dataset)
            test(args, model, public_test_loader)

    # Warn when pre-generated cryptographic material ran out mid-run.
    if args.preprocess:
        missing_items = [len(v) for k, v in sy.preprocessed_material.items()]
        if sum(missing_items) > 0:
            print("MISSING preprocessed material")
            for key, value in sy.preprocessed_material.items():
                print(f"'{key}':", value, ",")
"""
import os
from options.test_options import TestOptions
from data import get_data_loaders
from models import create_model
from util import html

# Test-time entry point: loads a model per the parsed options and prepares an
# HTML results page. (The leading triple-quote above closes a module docstring
# that begins before this chunk.)
if __name__ == '__main__':
    opt = TestOptions().parse()  # get test options
    # hard-code some parameters for test
    opt.num_threads = 0  # test code only supports num_threads = 1
    opt.batch_size = 1  # test code only supports batch_size = 1
    opt.shuffle_data = False  # disable data shuffling;
    opt.display_id = -1  # no visdom display; the test code optionally saves the results to a HTML file.

    dataloader = get_data_loaders(opt)  # create a dataset given opt.dataset_mode and other options
    model = create_model(opt)  # create a model given opt.model and other options
    model.setup(opt)  # regular setup: load and print networks; create schedulers

    # create a website
    web_dir = os.path.join(opt.results_dir, opt.name,
                           '{}'.format(opt.epoch))  # define the website directory
    print('creating web directory', web_dir)
    webpage = html.HTML(web_dir,
                        'Experiment = %s, Epoch = %s' % (opt.name, opt.epoch))

    # test with eval mode. This only affects layers like batchnorm and dropout.
    if opt.eval:
        model.eval()
if __name__ == '__main__':
    use_cuda = torch.cuda.is_available()

    # Minimal namespace-style parameter holder consumed by seed() and
    # data.get_data_loaders().
    class p:
        pass
    p.seed = 13
    p.batch_size = 100
    p.dset = 'imagenet'

    # get data
    # BUG FIX: time.clock() was deprecated in Python 3.3 and removed in 3.8;
    # time.perf_counter() is its documented replacement.
    t = time.perf_counter()
    seed(p)
    train_loader, val_loader = data.get_data_loaders(p)
    print(len(train_loader), len(val_loader), p.batch_size)

    # set up saving
    out_dir = '/accounts/projects/vision/scratch/yu_dl/raaz.rsk/cnns_preds'
    os.makedirs(out_dir, exist_ok=True)

    # save the labels: pre-allocate one int32 slot per example in each split
    out_file_labs = oj(out_dir, 'labs.h5')
    if os.path.exists(out_file_labs):
        os.remove(out_file_labs)  # overwrite any stale results file
    f2 = h5py.File(out_file_labs, "w")
    f2.create_dataset("labs_train", (len(train_loader) * p.batch_size, ),
                      dtype=np.int32)
    f2.create_dataset("labs_val", (len(val_loader) * p.batch_size, ),
                      dtype=np.int32)
def fit_vision(p):
    """Train a vision model under parameter set ``p`` while recording rich stats.

    Per iteration: losses/accuracies/margins on train and test, weight
    snapshots and norms, optional singular-value spectra of weights and
    activations, and an optional reduced-model evaluation. Supports 'flip' and
    'permute' dataset regime changes mid-training; results are saved via save().

    NOTE(review): ``p`` is a project parameter object; attribute semantics
    (num_iters, its, lambda_reg, saves_per_iter, ...) are inferred from usage
    here — confirm against the params module.
    """
    out_name = p._str(p)  # generate random fname str before saving
    seed(p)
    use_cuda = torch.cuda.is_available()
    device = 'cuda' if use_cuda else 'cpu'

    # pick dataset and model
    print('loading dset...')
    train_loader, test_loader = data.get_data_loaders(p)
    X_train, Y_train_onehot = data.get_XY(train_loader)
    model = data.get_model(p, X_train, Y_train_onehot)
    init.initialize_weights(p, X_train, Y_train_onehot, model)

    # set up optimizer and freeze appropriate layers
    model, optimizer = optimization.freeze_and_set_lr(p, model, it=0)

    def reg_init(p):
        # Build the GAN-discriminator regularizer; None when disabled.
        if p.lambda_reg == 0:
            return None

        # load the gan
        gan_dir = '/accounts/projects/vision/chandan/gan/mnist_dcgan'
        sys.path.insert(1, gan_dir)
        from dcgan import Discriminator
        D = Discriminator(ngpu=1 if torch.cuda.is_available() else 0).to(device)
        D.load_state_dict(torch.load(oj(gan_dir, 'weights/netD_epoch_99.pth'),
                                     map_location=device))
        D = D.eval()
        return D

    def reg(p, it, model, D, device):
        # Regularization: penalize exemplars the discriminator scores as fake.
        if p.lambda_reg == 0:
            return 0
        exs = model.exs.reshape(model.exs.shape[0], 1, 28, 28)  # mnist-specific
        outputs = D(exs)  # discriminator outputs 1 for real, 0 for fake
        loss = p.lambda_reg * torch.sum(1 - outputs)
        return loss

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    if 'linear' in p.dset:
        criterion = nn.MSELoss()  # regression-style datasets use MSE
    reg_model = reg_init(p)

    # things to record
    s = S(p)
    s.weight_names = models.get_weight_names(model)
    if p.siamese:
        s.exs = model.exs.data.cpu().numpy()

    # run
    print('training...')
    for i, it in enumerate(tqdm(range(0, p.num_iters))):

        # calc stats and record
        s.losses_train[it], s.accs_train[it], s.confidence_unn_train[it], s.confidence_norm_train[it], s.margin_unn_train[it], s.margin_norm_train[it] = stats.calc_loss_acc_margins(train_loader, p.batch_size, use_cuda, model, criterion, p.dset)
        s.losses_test[it], s.accs_test[it], s.confidence_unn_test[it], s.confidence_norm_test[it], s.margin_unn_test[it], s.margin_norm_test[it] = stats.calc_loss_acc_margins(test_loader, p.batch_size, use_cuda, model, criterion, p.dset, print_loss=True)

        # record weights
        weight_dict = deepcopy({x[0]: x[1].data.cpu().numpy()
                                for x in model.named_parameters()})
        s.weights_first10[p.its[it]] = deepcopy(
            model.state_dict()[s.weight_names[0]][:20].cpu().numpy())
        s.weight_norms[p.its[it]] = stats.layer_norms(model.state_dict())
        if it % p.save_all_weights_freq == 0 or it == p.num_iters - 1 or it == 0 or (
                it < p.num_iters_small and it % 2 == 0):
            # save first, last, jumps
            s.weights[p.its[it]] = weight_dict
            if not p.use_conv:
                s.mean_max_corrs[p.its[it]] = stats.calc_max_corr_input(
                    X_train, Y_train_onehot, model)

        if p.save_singular_vals:
            # weight singular vals
            s.singular_val_dicts.append(
                get_singular_vals_from_weight_dict(weight_dict))
            s.singular_val_dicts_cosine.append(
                get_singular_vals_kernels(weight_dict, 'cosine'))
            s.singular_val_dicts_rbf.append(
                get_singular_vals_kernels(weight_dict, 'rbf'))
            s.singular_val_dicts_lap.append(
                get_singular_vals_kernels(weight_dict, 'laplacian'))

            # activations singular vals
            act_var_dicts = calc_activation_dims(
                use_cuda, model, train_loader.dataset, test_loader.dataset,
                calc_activations=p.calc_activations)
            s.act_singular_val_dicts_train.append(act_var_dicts['train']['pca'])
            s.act_singular_val_dicts_test.append(act_var_dicts['test']['pca'])
            s.act_singular_val_dicts_train_rbf.append(act_var_dicts['train']['rbf'])
            s.act_singular_val_dicts_test_rbf.append(act_var_dicts['test']['rbf'])

        # reduced model
        if p.save_reduce:
            model_r = reduce_model(model)
            s.losses_train_r[it], s.accs_train_r[it] = stats.calc_loss_acc_margins(
                train_loader, p.batch_size, use_cuda, model_r, criterion, p.dset)[:2]
            s.losses_test_r[it], s.accs_test_r[it] = stats.calc_loss_acc_margins(
                test_loader, p.batch_size, use_cuda, model_r, criterion, p.dset)[:2]

        # training
        for batch_idx, (x, target) in enumerate(train_loader):
            optimizer.zero_grad()
            x = x.to(device)
            target = target.to(device)
            x, target = Variable(x), Variable(target)
            out = model(x)
            loss = criterion(out, target) + reg(p, it, model, reg_model, device)
            loss.backward()
            optimizer.step()

            # don't go through whole dataset
            if batch_idx > len(train_loader) / p.saves_per_iter and it <= p.saves_per_iter * p.saves_per_iter_end + 1:
                break

        # set lr / freeze
        if it - p.num_iters_small in p.lr_ticks:
            model, optimizer = optimization.freeze_and_set_lr(p, model, it)

        if it % p.save_all_freq == 0:
            save(out_name, p, s)

        # check for need to flip dset
        if 'flip' in p.dset and it == p.num_iters // 2:
            print('flipped dset')
            s.flip_iter = p.num_iters // 2  # flip_iter tells when dset flipped
            train_loader, test_loader = data.get_data_loaders(p, it=s.flip_iter)
            X_train, Y_train_onehot = data.get_XY(train_loader)
            if p.flip_freeze:
                p.freeze = 'last'
                model, optimizer = optimization.freeze_and_set_lr(p, model, it)
        elif 'permute' in p.dset and it > 0 and p.its[it] % p.change_freq == 0:
            s.permute_rng.append(int(p.its[it]))
            train_loader, test_loader = data.get_data_loaders(
                p, it=s.permute_rng[-1])
            X_train, Y_train_onehot = data.get_XY(train_loader)

    save(out_name, p, s)
def train():
    """Fine-tune a GPT/GPT-2 double-heads dialogue model with pytorch-ignite.

    Parses CLI arguments, sets up (optionally distributed / fp16) training,
    builds trainer and evaluator engines with LM + multiple-choice losses,
    attaches metrics/logging/checkpointing, and runs for --n_epochs.
    """
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    print("Running process {}".format(args.local_rank))  # This is a logger.warning: it will be printed by all distributed processes
    print("Arguments: {}".format(pformat(args)))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    print("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # cant use Autotokenizer because checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    print("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        # NOTE(review): leftover debug prints — these fire on every batch.
        print('LM:', lm_labels)
        print('MC:', mc_labels)
        (lm_loss), (mc_loss), *_ = model(input_ids,
                                         token_type_ids=token_type_ids,
                                         mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels,
                                         lm_labels=lm_labels)
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # print(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we dont send labels to model, it doesnt return losses
            lm_logits, mc_logits, *_ = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training
    # and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely
    # between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
                    output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save
    # model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint',
                                             save_interval=1, n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(log_dir, CONFIG_NAME))
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last
    # checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
import os
from options.test_options import TestOptions
from data import get_data_loaders
from models import create_model
from util.visualizer import save_images
from util import html

if __name__ == '__main__':
    opt = TestOptions().parse()

    # Force deterministic single-sample test-time settings.
    opt.nThreads = 1  # test code only supports nThreads = 1
    opt.batch_size = 1  # test code only supports batch_size = 1
    opt.serial_batches = True  # no shuffle
    opt.no_flip = True  # no flip
    opt.display_id = -1  # no visdom display

    data_loaders = get_data_loaders(opt)
    dataset = data_loaders['test']
    dataset_size = len(dataset)

    model = create_model(opt)
    model.setup(opt)

    # create website
    web_dir = os.path.join(opt.results_dir, opt.name,
                           '%s_%s' % (opt.phase, opt.which_epoch))
    webpage = html.HTML(
        web_dir,
        'Experiment = %s, Phase = %s, Epoch = %s' %
        (opt.name, opt.phase, opt.which_epoch))

    # Run the model over at most opt.how_many test samples.
    for idx, batch in enumerate(dataset):
        if idx >= opt.how_many:
            break
        model.set_input(batch)
        model.test()
# Standard training hyper-parameters shared across entry points.
std_config = utils.Config({
    'num_epochs': 3000,
    'batch_size': 8192,
    'num_workers': 1,
    'hidden_size': 650,
    'context_radius': 32,
    'time_grid': 0.25,
    'lr': 0.0005,
    'lr_gamma': 0.99,
    'lr_step_size': 30,
    'checkpoint_interval': 1,
    'split': 0.05,
})

if __name__ == '__main__':
    logging.basicConfig(level=logging.ERROR)
    logging.debug('Loading datasets...')

    # Build the dataloaders from the standard configuration.
    loader_kwargs = dict(
        batch_size=std_config.batch_size,
        num_workers=std_config.num_workers,
        time_grid=std_config.time_grid,
        context_radius=std_config.context_radius,
        split=std_config.split,
        debug=False,
    )
    data_loaders = data.get_data_loaders(**loader_kwargs)

    train(std_config, data_loaders)
def main():
    """ Main function to train a model with given cmdline options """
    opt = TrainOptions().parse()  # get training options
    dataloader = get_data_loaders(opt)  # create a dataset given opt.dataset_mode and other options
    dataset_size = len(dataloader)  # get the number of images in the train set.
    model = create_model(opt)  # create a model given opt.model and other options
    model.setup(opt)  # regular setup: load and print networks; create schedulers
    model.log_model_info(opt.verbose)  # log model metadata to log file iff opt.logging is enabled

    model.logger.info('The number of training images = {}'.format(dataset_size))
    model.logger.info('Num val images = {}'.format(dataloader.len_val_set))
    print('The number of training images = {}'.format(dataset_size))
    print('Num val images = {}'.format(dataloader.len_val_set))

    visualizer = Visualizer(opt)  # create a visualizer that display/save images and plots
    start_time = time.monotonic()

    for epoch in range(opt.epoch_count, opt.n_epochs + 1):
        # reset the visualizer: make sure it saves the results to HTML at
        # least once every epoch
        visualizer.reset()
        # model.update_learning_rate()  # update learning rates in the beginning of every epoch.

        epoch_start_time = time.monotonic()
        model.init_epoch()
        model.train_epoch(dataloader.train_loader)
        model.validate(dataloader)
        epoch_end_time = time.monotonic()
        model.log_parameters(epoch)

        if epoch % opt.log_freq == 0:
            # print training losses and save logging information to the disk
            losses = model.get_epoch_losses()
            metrics = model.get_epoch_metrics()
            epoch_time = timedelta(seconds=epoch_end_time - epoch_start_time)
            visualizer.print_current_losses_and_metrics(
                epoch, losses, metrics, epoch_time)
            if opt.display_id > 0:
                for n, y_dict in model.get_plotting_artifacts().items():
                    visualizer.line_plot(n, epoch, y_dict,
                                         xlabel='epochs', ylabel=n)

        if epoch % opt.save_epoch_freq == 0:
            # cache model every <save_epoch_freq> epochs
            model.logger.info('saving the model at the end of epoch %d' % epoch)
            model.save_networks('latest')
            model.save_networks('epoch_%d' % epoch)

        if opt.verbose:
            print('End of epoch {} / {} \t Time Taken: {} sec'.format(
                epoch, opt.n_epochs,
                timedelta(seconds=epoch_end_time - epoch_start_time)))
        model.logger.info('End of epoch {} / {} \t Time Taken: {} sec'.format(
            epoch, opt.n_epochs,
            timedelta(seconds=epoch_end_time - epoch_start_time)))

        # Step LR schedulers at the END of every epoch.
        model.update_learning_rate()

    model.logger.info('Total training time for {} epochs = {}s'.format(
        opt.n_epochs, timedelta(seconds=time.monotonic() - start_time)))
    model.save_networks('epoch_{}_final'.format(opt.n_epochs))
NUM_MATERIALS = 3 TEST_MATERIALS = [0, 1, 2] else: NUM_MATERIALS = 4 TEST_MATERIALS = [0, 1, 2, 3] results = [] for PAI in TEST_MATERIALS: netD = DISCRIMINATOR().to(DEVICE) netG = GENERATOR().to(DEVICE) print("[Dataset] - " + DATASET + " -> Material number " + str(PAI)) train_loader, valid_loader, test_loader = get_data_loaders(IMG_PATH, DATASET, test_material = PAI, img_size = IMG_SIZE, batch_size = BATCH_SIZE, croped=True, unseen_attack=UNSEEN_ATTACK) #netD, train_history = fit((netD, netG), DATASET, PAI, (train_loader, valid_loader), EPOCHS, EPOCHS_WITH_MATCHER, DEVICE, with_generator = USE_GENERATOR) netD, train_history = fit((netD, netG), DATASET, PAI, (train_loader, valid_loader), EPOCHS, EPOCHS_WITH_MATCHER, DEVICE, with_generator = USE_GENERATOR, just_train_classifier = True) test_loss, test_acc, test_apcer, test_bpcer, test_eer, test_bpcer_apcer1, test_bpcer_apcer5, test_bpcer_apcer10, test_apcer1, test_apcer5, test_apcer10 = test_model(netD, test_loader, DEVICE) results.append((test_loss.item(), test_acc, test_apcer, test_bpcer, test_eer, test_bpcer_apcer1, test_bpcer_apcer5, test_bpcer_apcer10, test_apcer1, test_apcer5, test_apcer10)) #PRINTS ------------------------------------------------------------------------------------- # Compute average and std acc_array = np.array([i[1] for i in results]) apcer_array = np.array([i[2] for i in results]) bpcer_array = np.array([i[3] for i in results])
import time
from options.train_options import TrainOptions
from data import get_data_loaders
from models import create_model
from util.visualizer import Visualizer

# Training entry point.
# NOTE(review): the inner batch loop appears truncated at this chunk boundary —
# the original file presumably continues after model.set_input(data).
if __name__ == '__main__':
    opt = TrainOptions().parse()
    visualizer = Visualizer(opt)
    logger = visualizer.logger

    data_loaders = get_data_loaders(opt, modes=['train', 'val'])
    dataset = data_loaders['train']
    dataset_size = len(dataset)
    # A fixed validation batch (e.g. for periodic visual comparison).
    fixed_real_imgs = next(iter(data_loaders['val']))

    model = create_model(opt)
    model.setup(opt)

    total_steps = 0
    # niter epochs at full LR, then niter_decay epochs of LR decay.
    for epoch in range(opt.epoch_count, opt.niter + opt.niter_decay + 1):
        epoch_start_time = time.time()
        iter_data_time = time.time()
        epoch_iter = 0
        for i, data in enumerate(dataset):
            iter_start_time = time.time()
            total_steps += opt.batch_size
            epoch_iter += opt.batch_size
            model.set_input(data)
# Evaluation snippet: load a fine-tuned double-heads model and iterate the
# validation loader. NOTE(review): `args` and the imported classes are defined
# earlier in the original file; the accuracy accumulators below are initialized
# but the counting logic appears to continue past this chunk.
args.distributed = (args.local_rank != -1)

print("Prepare tokenizer, pretrained model and optimizer.")
tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer  # cant use Autotokenizer because checkpoint could be a Path
tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
print('Loading model from checkpoint {}'.format(args.model_checkpoint))
model = model_class.from_pretrained(args.model_checkpoint)
model.to(args.device)
# Add special tokens if they are not already added
add_special_tokens_(model, tokenizer)

print("Prepare datasets")
train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

num_correct = 0.0
num_examples = 0.0
for i, batch in tqdm(enumerate(val_loader), total=len(val_loader)):
    model.eval()
    with torch.no_grad():
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        # print(tokenizer.decode(input_ids[0, -1, :].tolist()))
        # if we dont send labels to model, it doesnt return losses
        lm_logits, mc_logits, *_ = model(
            input_ids,
            token_type_ids=token_type_ids,
            mc_token_ids=mc_token_ids,
        )
        # Shift logits/labels by one position for next-token LM evaluation.
        lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
        lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
default=1e-3) parser.add_argument( '--num-classes', type=int, default=12) parser.add_argument( '--notify', type=int, default=100) args = parser.parse_args() cuda = torch.cuda.is_available() set_seed(seed=1, cuda=cuda) # Data train_loader, train_dataset, test_loader, test_dataset = get_data_loaders( batch_size=args.batch_size) # Model model, loss_fn, optimizer = get_model( num_classes=args.num_classes, learning_rate=args.learning_rate, cuda=cuda) start_epoch, best_accuracy = load_model(model, cuda) for epoch in range(start_epoch, args.epochs): train_model(model=model, optimizer=optimizer, train_loader=train_loader, train_dataset=train_dataset, loss_fn=loss_fn, num_epochs=args.epochs,
def train(args: CommandlineArgs, train_dataset, valid_dataset, test_dataset,
          writer, model: CompModel = None):
    """Train a compositional model and return the best-validation snapshot.

    Optimizes two NLL classification heads (y1 = object labels, y2 = attribute
    labels) plus HSIC-based conditional-independence penalties on the two
    learned representations, and optional pair-embedding / inversion losses
    when the model is not a VisProd model. After every epoch an evaluation
    step updates the early-stopping metric; the model weights from the best
    epoch are restored and re-evaluated on the validation and test loaders.

    Args:
        args: command-line configuration; `args.train` holds the training
            hyper-parameters used throughout.
        train_dataset / valid_dataset / test_dataset: project dataset objects
            wrapped into loaders by `get_data_loaders`.
        writer: metrics logger backed by a DataFrame (`writer.df`); a
            non-empty df means training resumes from its last epoch index.
        model: optional pre-built model; built via `get_model` when None.

    Returns:
        (model, best_metrics): the best-epoch model (in eval mode) and a dict
        of its validation/test metrics (numpy scalars cast to Python types).
    """
    # Init
    train_cfg = args.train
    best_metrics = {}
    epoch = -1  # sentinel in case the epoch loop body never runs
    start_epoch = 0
    device = args.device
    if len(writer.df) > 0:
        # Resume: continue epoch numbering from the last logged epoch.
        start_epoch = writer.df.index.max()

    # Get pytorch data loaders
    test_loader, train_loader, valid_loader = get_data_loaders(
        train_dataset, valid_dataset, test_dataset, train_cfg.batch_size,
        train_cfg.num_workers, test_batchsize=train_cfg.test_batchsize,
        shuffle_eval_set=train_cfg.shuffle_eval_set)

    if model is None:
        model: CompModel = get_model(args, train_dataset)
    best_model = clone_model(model)  # snapshot; replaced whenever a new best epoch is found

    ## NOTE:
    # y1 refer to object labels
    # y2 refer to attribute labels
    num_classes1 = train_dataset.num_objs
    num_classes2 = train_dataset.num_attrs

    class NLLLossFuncs(NamedTuple):
        # Pair of NLL losses, one per classification head.
        y1: nn.NLLLoss
        y2: nn.NLLLoss

    nll_loss_funcs = NLLLossFuncs(y1=nn.NLLLoss(), y2=nn.NLLLoss())
    if train_cfg.balanced_loss:
        # Re-weight each class by its inverse training frequency.
        nll_loss_funcs = NLLLossFuncs(
            y1=nn.NLLLoss(weight=to_torch(1 / train_dataset.y1_freqs, device)),
            y2=nn.NLLLoss(weight=to_torch(1 / train_dataset.y2_freqs, device)))

    itr_per_epoch = len(train_loader)
    # Total iteration budget converted to whole epochs.
    n_epochs = train_cfg.n_iter // itr_per_epoch
    # +inf when the early-stop metric should be minimized, -inf when maximized,
    # so the first evaluation always improves on it.
    best_primary_metric = np.inf * (
        2 * (train_cfg.primary_early_stop_metric.polarity == 'min') - 1)
    optimizer = get_optimizer(train_cfg.optimizer_name, train_cfg.lr,
                              train_cfg.weight_decay, model, args)
    epoch_range = range(start_epoch + 1, start_epoch + n_epochs + 1)
    # Manual iterator so batches keep flowing across epoch boundaries
    # (re-created on StopIteration below).
    data_iterator = iter(train_loader)
    for epoch in epoch_range:
        with profileblock(label='Epoch train step'):
            # Select which tensors to log. Taking an average on all batches per epoch.
            logger = batch_torch_logger(
                num_batches=len(train_loader),
                cs_str_args='y1_loss, y2_loss, y_loss, '
                            'L_rep, '
                            'y1_acc, y2_acc, '
                            'HSIC_cond1, HSIC_cond2, '
                            'pairwise_dist_cond1_repr1, '
                            'pairwise_dist_cond1_repr2, '
                            'pairwise_dist_cond2_repr1, '
                            'pairwise_dist_cond2_repr2, '
                            'HSIC_label_cond1, HSIC_label_cond2',
                nanmean_args_cs_str='pairwise_dist_cond1_repr1, '
                                    'pairwise_dist_cond1_repr2, '
                                    'pairwise_dist_cond2_repr1, '
                                    'pairwise_dist_cond2_repr2, '
                                    'tloss_a, tloss_o, tloss_g_imgfeat, '
                                    'loss_inv_core, loss_inv_g_hidden, loss_inv_g_imgfeat',
                device=device)
            for batch_cnt in range(len(train_loader)):
                logger.new_batch()
                optimizer.zero_grad()
                with ns_profiling_label('fetch batch'):
                    try:
                        batch = next(data_iterator)
                    except StopIteration:
                        # Loader exhausted mid-epoch-budget: restart it.
                        data_iterator = iter(train_loader)
                        batch = next(data_iterator)
                with ns_profiling_label('send to gpu'):
                    X, y2, y1 = batch[0], batch[1], batch[2]
                    # Negative samples for the pair-embedding triplet losses below.
                    neg_attrs, neg_objs = batch[3].to(device), batch[4].to(
                        device)
                    X = X.float().to(device)  # images
                    y1 = y1.long().to(device)  # object labels
                    y2 = y2.long().to(device)  # attribute labels
                with ns_profiling_label('forward pass'):
                    # y1_scores, y2_scores are logits of negative-squared-distances at the embedding space
                    # repr1, repr2 are phi_hat1, phi_hat2 at the paper
                    y1_scores, y2_scores, repr1, repr2, _ = \
                        model(X, freeze_class1=train_cfg.freeze_class1,
                              freeze_class2=train_cfg.freeze_class2)
                y1_loss = nll_loss_funcs.y1(y1_scores, y1)
                y2_loss = nll_loss_funcs.y2(y2_scores, y2)
                # Convex combination of the two head losses.
                y_loss = y1_loss * train_cfg.Y12_balance_coeff + y2_loss * (
                    1 - train_cfg.Y12_balance_coeff)
                L_data = train_cfg.lambda_CE * y_loss
                L_invert = 0.
                if not args.model.VisProd:
                    # pair embedding losses
                    tloss_g_hidden, tloss_g_imgfeat, loss_inv_core, loss_inv_g_hidden, loss_inv_g_imgfeat = \
                        model.eval_pair_embed_losses(args, X,
                                                     model.last_feature_common,
                                                     y2, y1, neg_attrs,
                                                     neg_objs, nll_loss_funcs)
                    # aggregate triplet loss into L_data
                    L_data += train_cfg.lambda_ao_emb * tloss_g_hidden
                    L_data += train_cfg.lambda_feat * tloss_g_imgfeat
                    # aggregate components of L_invert
                    L_invert += train_cfg.lambda_aux_disjoint * loss_inv_core
                    L_invert += train_cfg.lambda_aux * loss_inv_g_hidden
                    L_invert += train_cfg.lambda_aux_img * loss_inv_g_imgfeat
                ys = (y1, y2)
                # HSIC penalty pushing repr1 and repr2 toward conditional
                # independence given each label set.
                L_rep, HSIC_rep_loss_terms, HSIC_mean_of_median_pairwise_dist_terms = \
                    conditional_indep_losses(repr1, repr2, ys,
                                             train_cfg.HSIC_coeff,
                                             indep_coeff2=train_cfg.HSIC_coeff,
                                             num_classes1=num_classes1,
                                             num_classes2=num_classes2,
                                             log_median_pairwise_distance=False,
                                             device=device)
                ohy1 = one_hot(y1, num_classes1)
                ohy2 = one_hot(y2, num_classes2)
                # Cross penalties between one-hot labels and the *other*
                # representation; note the coeff1/coeff2 asymmetry — each call
                # activates only one of its two conditioning terms.
                L_oh1, HSIC_oh_loss_terms1, _ = \
                    conditional_indep_losses(ohy2, repr1, ys, train_cfg.alphaH,
                                             indep_coeff2=0,
                                             num_classes1=num_classes1,
                                             num_classes2=num_classes2,
                                             log_median_pairwise_distance=False,
                                             device=device)
                L_oh2, HSIC_oh_loss_terms2, _ = \
                    conditional_indep_losses(ohy1, repr2, ys, 0,
                                             indep_coeff2=train_cfg.alphaH,
                                             num_classes1=num_classes1,
                                             num_classes2=num_classes2,
                                             log_median_pairwise_distance=False,
                                             device=device)
                L_indep = L_rep + L_oh1 + L_oh2
                loss = L_data + L_indep + L_invert
                with ns_profiling_label('loss and update'):
                    loss.backward()
                    optimizer.step()
                # log the metrics
                with ns_profiling_label('log batch'):
                    # extract indep loss terms from lists for logging
                    HSIC_cond1, HSIC_cond2, pairwise_dist_cond1_repr1, pairwise_dist_cond1_repr2, \
                        pairwise_dist_cond2_repr1, pairwise_dist_cond2_repr2 = \
                        HSIC_logging_terms(HSIC_rep_loss_terms,
                                           HSIC_mean_of_median_pairwise_dist_terms)
                    # Index [0]/[1] matches which coeff was non-zero in the
                    # corresponding conditional_indep_losses call above.
                    HSIC_label_cond1 = HSIC_oh_loss_terms1[0]
                    HSIC_label_cond2 = HSIC_oh_loss_terms2[1]
                    with ns_profiling_label('calc y1 train acc'):
                        y1_acc = acc_from_logits(y1_scores, y1,
                                                 return_tensor=True).detach()
                    with ns_profiling_label('calc y2 train acc'):
                        y2_acc = acc_from_logits(y2_scores, y2,
                                                 return_tensor=True).detach()
                    # CAUTION: logger captures metrics by *local variable
                    # name* via locals() — renaming any local above would
                    # silently drop it from the logs.
                    logger.log(locals_dict=locals())
            curr_epoch_metrics = OrderedDict()
            curr_epoch_metrics.update(logger.get_means())
        with profileblock(label='Evaluation step'):
            best_primary_metric, is_best = evaluation_step(
                model, valid_loader, test_loader, nll_loss_funcs, writer,
                epoch, n_epochs, curr_epoch_metrics,
                early_stop_metric_name=train_cfg.primary_early_stop_metric,
                best_ES_metric_value=best_primary_metric,
                calc_AUC=train_cfg.metrics.calc_AUC)
            # write current epoch metrics to metrics logger
            with ns_profiling_label('write eval step metrics'):
                for metric_key, value in curr_epoch_metrics.items():
                    writer.add_scalar(f'{metric_key}', value, epoch)
            # dump collected metrics to csv
            writer.dump_to_csv()
            # print all columns
            last_results_as_string = writer.last_results_as_string()
            last_results_as_string = '\n '.join(
                last_results_as_string.split('\n'))
            if train_cfg.verbose:
                print('\n[%d/%d]' % (epoch, n_epochs), last_results_as_string)
            if is_best:
                # New best epoch: snapshot weights and the last metrics row.
                best_model = clone_model(model)
                best_metrics = writer.df.iloc[-1, :].to_dict()
                best_metrics['epoch'] = int(writer.df.iloc[-1, :].name)
                if train_cfg.verbose:
                    print(f'Best! (@epoch {epoch})')
    # Restore and finalize the best-validation model.
    model = best_model
    model.eval()
    print('Best epoch was: ', best_metrics['epoch'])
    print(
        f'Primary early stop monitor was {train_cfg.primary_early_stop_metric}'
    )
    val_metrics = eval_model_with_dataloader(model, valid_loader,
                                             nll_loss_funcs,
                                             phase_name='valid')
    best_metrics.update(val_metrics)
    print('Val metrics on best val epoch :')
    pprint([(k, v) for k, v in val_metrics.items()])
    test_metrics = eval_model_with_dataloader(model, test_loader,
                                              nll_loss_funcs,
                                              phase_name='test')
    best_metrics.update(test_metrics)
    print('\n\nTest metrics on best val epoch :')
    pprint([(k, v) for k, v in test_metrics.items()])
    # cast numpy items to their original type
    for k, v in best_metrics.items():
        if isinstance(v, np.number):
            best_metrics[k] = v.item()
    #### two redundant calls to align random-number-generator with original training script
    # check: (to delete?)
    _ = eval_model_with_dataloader(model, valid_loader, nll_loss_funcs,
                                   phase_name='valid')
    _ = eval_model_with_dataloader(model, test_loader, nll_loss_funcs,
                                   phase_name='test')
    return model, best_metrics
# num_epoch=16 # lr = 0.01 # using_vae = True # vis_mode = 'wandb' # dataset = 'tinyImgNet' # param_path = 'models/' # exp_name = 'vae' if __name__ == '__main__': ############ ## Data ## ############ args = parser.parse_args() os.makedirs(args.param_path, exist_ok=True) tr_loader, va_loader = get_data_loaders(args.batch_size, args.dataset, args.img_size) ############# ## Model ## ############# model = get_model(using_vae=args.using_vae).to(device) # model.decoder.net.backbone.requires_grad = False # model.decoder.net.backbone.eval() optim = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, momentum=0.9, weight_decay=1e-4) sched = LinearWarmupScheduler(optim, 1000)