def validate(params, save=False, adversarial=False, adversarial_attack=None, whitebox=True, adversary_model=None, adversary_criterion=None):
    '''
    Performs validation of params['model'] using params['val_dataloader'].

    Keyword arguments:
    > params (dict) -- current state parameters
    > save (bool) -- whether to save val accuracies. Should only be `True` when called from train loop!
    > adversarial (bool) -- whether to test adversarially.
    > adversarial_attack (string) -- name of adversarial attack.
    > whitebox (bool) -- whether to use a whitebox attack.
    > adversary_model (torch.nn.Module) -- pre-trained model to generate black-box attacks with.
    > adversary_criterion (torch.nn.[loss]) -- loss func for the black-box "imitator" model.

    Returns: for classifiers (params['is_generator'] falsy), the top-1
    accuracy of the final validation batch (None if the dataloader is
    empty); otherwise returns None.
    '''
    if params['model'] is None:
        print(
            'No model loaded! Type -n to create a new model, or -l to load an existing one from file.\n'
        )
        return

    # Sets up training statistics to be logged to console (and possibly file -- TODO -- ?) output.
    extension = 'adversarial' if adversarial else 'non-adversarial'
    print(color.PURPLE + '\n--- BEGIN (' + extension + ') VALIDATION PASS ---' + color.END)
    batch_time = train_utils.AverageMeter('Time', ':5.3f')
    losses = train_utils.AverageMeter('Loss', ':.4e')
    if params['is_generator']:
        # Generators have no accuracy metric -- only loss is tracked.
        progress = train_utils.ProgressMeter(len(params['val_dataloader']),
                                             batch_time,
                                             losses,
                                             prefix='Test: ')
    else:
        top1 = train_utils.AverageMeter('Acc@1', ':5.2f')
        progress = train_utils.ProgressMeter(len(params['val_dataloader']),
                                             batch_time,
                                             losses,
                                             top1,
                                             prefix='Test: ')

    # Switch model to evaluate mode; push to GPU
    params['model'].eval()
    setup_cuda(params)

    # FIX: pre-bind acc1 so an empty val_dataloader cannot trigger a
    # NameError at the final `return acc1` below.
    acc1 = None

    end = time.time()
    for i, (data, target) in enumerate(params['val_dataloader']):
        # Pushes data to GPU
        data = data.to(params['device'])
        target = target.to(params['device'])

        # Generate adversarial attack (default whitebox mode). Whitebox
        # attacks the model under test itself; black-box attacks a separate
        # "imitator" adversary model/criterion.
        if adversarial:
            if whitebox:
                data = adversary.attack_batch(data,
                                              target,
                                              params['model'],
                                              params['criterion'],
                                              attack_name=adversarial_attack,
                                              device=params['device'],
                                              epsilon=0.3,
                                              alpha=0.05)
            else:
                data = adversary.attack_batch(data,
                                              target,
                                              adversary_model,
                                              adversary_criterion,
                                              attack_name=adversarial_attack,
                                              device=params['device'],
                                              epsilon=0.3,
                                              alpha=0.05)

        with torch.no_grad():
            # compute output
            output = params['model'](data)
            loss = params['criterion'](output, target)
            losses.update(loss.item(), data.size(0))

            # measure accuracy and record loss
            if not params['is_generator']:
                acc1 = accuracy(output, target)[0]
                top1.update(acc1[0], data.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % params['print_frequency'] == 0:
                # Storing validation losses/accuracies
                if save:
                    params['val_losses'].append(losses.get_avg())
                    if not params['is_generator']:
                        params['val_accuracies'].append(top1.get_avg())
                progress.print(i)

    # Print final accuracy/loss
    progress.print(len(params['val_dataloader']))
    if not params['is_generator']:
        print(color.GREEN + ' * Acc@1 {top1.avg:.3f}'.format(top1=top1) + color.END)

    # Update train/val accuracy/loss plots -- only when called from the
    # train loop (save=True), matching where val stats were recorded.
    if save:
        viz_utils.plot_accuracies(params)
        viz_utils.plot_losses(params)
    print(color.PURPLE + '--- END VALIDATION PASS ---\n' + color.END)
    if not params['is_generator']:
        return acc1
def run_epoch(split, model, opt, train_data, val_data=None, batch_size=100,
              upto=None, epoch_num=None, epochs=1, verbose=False, log_every=10,
              return_losses=False, table_bits=None, warmups=1000, loader=None,
              constant_lr=None, use_meters=True, summary_writer=None,
              lr_scheduler=None, custom_lr_lambda=None, label_smoothing=0.0):
    '''
    Runs one epoch of training or evaluation over a dataset.

    Keyword arguments:
    > split (str) -- 'train' or 'test'; controls grad mode, model mode,
        dataset choice, LR updates, and the optimizer step.
    > model -- model exposing nll(); may expose orderings/update_masks/embed_size.
    > opt -- torch optimizer whose param_groups' lr is set per step.
    > train_data / val_data -- datasets; val_data used when split != 'train'.
    > batch_size (int) -- batch size for the default DataLoader and for the
        tuples/sec estimate.
    > upto (int) -- optional cap on steps per epoch (required for
        IterableDataset global-step math).
    > epoch_num / epochs -- current epoch index and total epoch count.
    > verbose (bool) -- extra console output.
    > log_every (int) -- console/meter logging period, in steps.
    > return_losses (bool) -- if True, return the per-step loss list instead
        of the epoch mean.
    > table_bits (float) -- dataset entropy in bits/tuple, for the
        entropy-gap log line.
    > warmups -- warmup steps; values < 1 are treated as a ratio of
        upto * epochs.
    > loader -- optional pre-built DataLoader (built here if None).
    > constant_lr -- if set, fixes lr to this value every step.
    > use_meters (bool) -- whether to update/display progress meters.
    > summary_writer -- if set, logs to wandb and this writer per step.
    > lr_scheduler -- external scheduler; stepped after opt.step() when set.
    > custom_lr_lambda -- callable global_step -> lr; overrides lr_scheduler.
    > label_smoothing (float) -- passed through to model.nll().

    Returns: list of per-step losses if return_losses, else their mean.
    '''
    torch.set_grad_enabled(split == 'train')
    model.train() if split == 'train' else model.eval()
    dataset = train_data if split == 'train' else val_data
    losses = []
    if loader is None:
        loader = data.DataLoader(dataset,
                                 batch_size=batch_size,
                                 shuffle=(split == 'train'))

    # How many orderings to run for the same batch?
    nsamples = 1
    if hasattr(model, 'orderings'):
        nsamples = len(model.orderings)
        if verbose:
            print('setting nsamples to', nsamples)

    dur_meter = train_utils.AverageMeter(
        'dur', lambda v: '{:.0f}s'.format(v), display_average=False)
    lr_meter = train_utils.AverageMeter('lr', ':.5f', display_average=False)
    tups_meter = train_utils.AverageMeter('tups',
                                          utils.HumanFormat,
                                          display_average=False)
    loss_meter = train_utils.AverageMeter('loss (bits/tup)', ':.2f')
    train_throughput = train_utils.AverageMeter('tups/s',
                                                utils.HumanFormat,
                                                display_average=False)
    batch_time = train_utils.AverageMeter('sgd_ms', ':3.1f')
    data_time = train_utils.AverageMeter('data_ms', ':3.1f')
    progress = train_utils.ProgressMeter(upto, [
        batch_time,
        data_time,
        dur_meter,
        lr_meter,
        tups_meter,
        train_throughput,
        loss_meter,
    ])

    begin_time = t1 = time.time()
    for step, xb in enumerate(loader):
        data_time.update((time.time() - t1) * 1e3)
        if split == 'train':
            if isinstance(dataset, data.IterableDataset):
                # Can't call len(loader).
                # NOTE(review): assumes `upto` and `epoch_num` are not None
                # for IterableDataset training -- confirm at call sites.
                global_steps = upto * epoch_num + step + 1
            else:
                global_steps = len(loader) * epoch_num + step + 1

            # LR policy precedence: constant_lr > custom_lr_lambda >
            # built-in inverse-sqrt warmup > external lr_scheduler.
            if constant_lr:
                lr = constant_lr
                for param_group in opt.param_groups:
                    param_group['lr'] = lr
            elif custom_lr_lambda:
                # Rebinds the local so the scheduler step below is skipped.
                lr_scheduler = None
                lr = custom_lr_lambda(global_steps)
                for param_group in opt.param_groups:
                    param_group['lr'] = lr
            elif lr_scheduler is None:
                t = warmups
                if warmups < 1:  # A ratio.
                    t = int(warmups * upto * epochs)

                # Transformer-style schedule: lr = d^-0.5 * min(step^-0.5,
                # step * warmup^-1.5).
                d_model = model.embed_size
                lr = (d_model**-0.5) * min(
                    (global_steps**-.5), global_steps * (t**-1.5))
                for param_group in opt.param_groups:
                    param_group['lr'] = lr
            else:
                # We'll call lr_scheduler.step() below.
                lr = opt.param_groups[0]['lr']

        if upto and step >= upto:
            break

        if isinstance(xb, list):
            # This happens if using data.TensorDataset.
            assert len(xb) == 1, xb
            xb = xb[0]

        xb = xb.float().to(train_utils.get_device(), non_blocking=True)

        # Forward pass, potentially through several orderings.
        xbhat = None
        model_logits = []
        num_orders_to_forward = 1
        if split == 'test' and nsamples > 1:
            # At test, we want to test the 'true' nll under all orderings.
            num_orders_to_forward = nsamples

        for i in range(num_orders_to_forward):
            if hasattr(model, 'update_masks'):
                # We want to update_masks even for first ever batch.
                model.update_masks()

            model_out = model(xb)
            model_logits.append(model_out)
            if xbhat is None:
                xbhat = torch.zeros_like(model_out)
            xbhat += model_out

        if num_orders_to_forward == 1:
            loss = model.nll(xbhat, xb, label_smoothing=label_smoothing).mean()
        else:
            # Average across orderings & then across minibatch.
            #
            # p(x) = 1/N sum_i p_i(x)
            # log(p(x)) = log(1/N) + log(sum_i p_i(x))
            #           = log(1/N) + logsumexp ( log p_i(x) )
            #           = log(1/N) + logsumexp ( - nll_i (x) )
            #
            # Used only at test time.
            logps = []  # [batch size, num orders]
            assert len(model_logits) == num_orders_to_forward, len(
                model_logits)
            for logits in model_logits:
                # Note the minus.
                logps.append(
                    -model.nll(logits, xb, label_smoothing=label_smoothing))
            logps = torch.stack(logps, dim=1)
            logps = logps.logsumexp(dim=1) + torch.log(
                torch.tensor(1.0 / nsamples, device=logps.device))
            loss = (-logps).mean()

        losses.append(loss.detach().item())

        if split == 'train':
            opt.zero_grad()
            loss.backward()
            l2_grad_norm = TotalGradNorm(model.parameters())
            opt.step()
            if lr_scheduler is not None:
                lr_scheduler.step()

            # Convert nats -> bits per tuple for logging.
            loss_bits = loss.item() / np.log(2)

            # Number of tuples processed in this epoch so far.
            ntuples = (step + 1) * batch_size
            if use_meters:
                dur = time.time() - begin_time
                lr_meter.update(lr)
                tups_meter.update(ntuples)
                loss_meter.update(loss_bits)
                dur_meter.update(dur)
                train_throughput.update(ntuples / dur)

            if summary_writer is not None:
                # NOTE(review): `dur` is only bound when use_meters is True;
                # summary_writer=... with use_meters=False would raise a
                # NameError here -- confirm call sites never combine these.
                wandb.log({
                    'train/lr': lr,
                    'train/tups': ntuples,
                    'train/tups_per_sec': ntuples / dur,
                    'train/nll': loss_bits,
                    'train/global_step': global_steps,
                    'train/l2_grad_norm': l2_grad_norm,
                })
                summary_writer.add_scalar('train/lr',
                                          lr,
                                          global_step=global_steps)
                summary_writer.add_scalar('train/tups',
                                          ntuples,
                                          global_step=global_steps)
                summary_writer.add_scalar('train/tups_per_sec',
                                          ntuples / dur,
                                          global_step=global_steps)
                summary_writer.add_scalar('train/nll',
                                          loss_bits,
                                          global_step=global_steps)

            if step % log_every == 0:
                if table_bits:
                    print(
                        'Epoch {} Iter {}, {} entropy gap {:.4f} bits (loss {:.3f}, data {:.3f}) {:.5f} lr, {} tuples seen ({} tup/s)'
                        .format(
                            epoch_num, step, split,
                            loss.item() / np.log(2) - table_bits,
                            loss.item() / np.log(2), table_bits, lr,
                            utils.HumanFormat(ntuples),
                            utils.HumanFormat(ntuples /
                                              (time.time() - begin_time))))
                elif not use_meters:
                    print(
                        'Epoch {} Iter {}, {} loss {:.3f} bits/tuple, {:.5f} lr'
                        .format(epoch_num, step, split,
                                loss.item() / np.log(2), lr))

        if verbose:
            print('%s epoch average loss: %f' % (split, np.mean(losses)))

        batch_time.update((time.time() - t1) * 1e3)
        t1 = time.time()
        if split == 'train' and step % log_every == 0 and use_meters:
            progress.display(step)

    if return_losses:
        return losses
    return np.mean(losses)
def train_one_epoch(epoch, params, classifier_state=None):
    '''
    Trains model given in params['model'] for a single epoch.

    Keyword arguments:
    > epoch (int) -- current training epoch
    > params (dict) -- current state parameters
    > classifier_state (dict) -- state of a pre-trained classifier used to
        craft adversarial batches when adversarially training a generator
        (required when params['adversarial_train'] and params['is_generator']).

    Returns: for classifiers (params['is_generator'] falsy), the top-1
    accuracy of the final training batch (None if the dataloader is empty);
    otherwise returns None.
    '''
    # Saves statistics about epoch (TODO -- pipe to file?)
    batch_time = train_utils.AverageMeter('Time', ':.3f')
    data_time = train_utils.AverageMeter('Data', ':.3f')
    losses = train_utils.AverageMeter('Loss', ':.3e')
    if params['is_generator']:
        progress = train_utils.ProgressMeter(
            len(params['train_dataloader']),
            batch_time,
            losses,
            prefix='Epoch: [{}]'.format(epoch))
    else:
        top1 = train_utils.AverageMeter('Acc@1', ':4.2f')
        if params['adversarial_train']:
            # Track accuracy on the adversarially perturbed batches too.
            top1_adv = train_utils.AverageMeter('Adv@1', ':4.2f')
            progress = train_utils.ProgressMeter(
                len(params['train_dataloader']),
                batch_time,
                losses,
                top1,
                top1_adv,
                prefix='Epoch: [{}]'.format(epoch))
        else:
            progress = train_utils.ProgressMeter(
                len(params['train_dataloader']),
                batch_time,
                losses,
                top1,
                prefix='Epoch: [{}]'.format(epoch))

    # Switch to train mode. Important for dropout and batchnorm.
    params['model'].train()

    # FIX: pre-bind acc1 so an empty train_dataloader cannot trigger a
    # NameError at the final `return acc1` below.
    acc1 = None

    end = time.time()
    for i, (data, target) in enumerate(params['train_dataloader']):
        # Measure data loading time
        data_time.update(time.time() - end)

        # Sends input/label tensors to GPU
        data = data.to(params['device'])
        target = target.to(params['device'])

        # Generate and separately perform forward pass on adversarial examples
        if params['adversarial_train']:
            if not params['is_generator']:
                # Classifier: craft FGSM examples against the model itself.
                # eval() while attacking, then back to train() for the pass.
                params['model'].eval()
                perturbed_data = adversary.attack_batch(
                    data,
                    target,
                    params['model'],
                    params['criterion'],
                    attack_name='FGSM',
                    device=params['device'])
                perturbed_target = target.clone()
                params['model'].train()
                perturbed_output = params['model'](perturbed_data)
            else:
                # Generator (VAE): accumulate reconstruction loss over a grid
                # of attacks/epsilons crafted against the frozen classifier.
                # Setup
                perturbed_loss = 0
                classifier_state['model'].eval()
                for epsilon in constants.ADV_VAE_EPSILONS:
                    for attack_name in constants.ADV_VAE_ATTACKS:
                        # Get perturbed batch
                        perturbed_data = adversary.attack_batch(
                            data,
                            target,
                            classifier_state['model'],
                            classifier_state['criterion'],
                            attack_name=attack_name,
                            device=classifier_state['device'],
                            epsilon=epsilon)
                        clean_data = data.clone()
                        perturbed_data, recon, mu, logvar = params['model'](
                            perturbed_data)
                        perturbed_output = clean_data, recon, mu, logvar
                        perturbed_loss = perturbed_loss + params['criterion'](
                            perturbed_output, target)

                # CW batch
                # if i % constants.CW_SPLITS == epoch % constants.CW_SPLITS:
                perturbed_data = adversary.attack_batch(
                    data,
                    target,
                    classifier_state['model'],
                    classifier_state['criterion'],
                    attack_name='CW',
                    device=classifier_state['device'])
                clean_data = data.clone()
                perturbed_data, recon, mu, logvar = params['model'](
                    perturbed_data)
                perturbed_output = clean_data, recon, mu, logvar
                # CW is weighted as heavily as a full epsilon sweep.
                perturbed_loss = perturbed_loss + (
                    params['criterion'](perturbed_output, target) *
                    len(constants.ADV_VAE_EPSILONS))

                # Assume that we are not using CW, for if we do, we most certainly will not do four of them.
                perturbed_loss = perturbed_loss / (
                    len(constants.ADV_VAE_EPSILONS) *
                    (len(constants.ADV_VAE_ATTACKS) + 1))

        # Compute output
        output = params['model'](data)

        # Adversarial train uses slightly different criterion: convex
        # combination of clean and adversarial losses weighted by alpha.
        if params['adversarial_train']:
            if params['is_generator']:
                loss = params['alpha'] * params['criterion'](output, target) + \
                    (1 - params['alpha']) * perturbed_loss
            else:
                loss = params['alpha'] * params['criterion'](output, target) + \
                    (1 - params['alpha']) * params['criterion'](perturbed_output, perturbed_target)
        else:
            loss = params['criterion'](output, target)

        # Measure accuracy and record loss
        losses.update(loss.item(), data.size(0))
        if not params['is_generator']:
            acc1 = accuracy(output, target)[0]
            top1.update(acc1[0], data.size(0))
        if params['adversarial_train'] and not params['is_generator']:
            adv_acc1 = accuracy(perturbed_output, perturbed_target)[0]
            top1_adv.update(adv_acc1[0], perturbed_data.size(0))

        # Compute gradient and do SGD step
        params['optimizer'].zero_grad()
        loss.backward()
        params['optimizer'].step()

        # Measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Prints/stores training acc/losses
        if i % params['print_frequency'] == 0:
            params['train_losses'].append(losses.get_avg())
            if not params['is_generator']:
                params['train_accuracies'].append(top1.get_avg())
            progress.print(i)

    # Prints final training accuracy per epoch
    progress.print(len(params['train_dataloader']))

    # Update train/val accuracy/loss plots
    viz_utils.plot_accuracies(params)
    viz_utils.plot_losses(params)

    if not params['is_generator']:
        return acc1