def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) if args["max_steps"] > 0: t_total = args["max_steps"] args["num_train_epochs"] = ( args["max_steps"] // (len(train_dataloader) // args["gradient_accumulation_steps"]) + 1 ) else: t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args["weight_decay"], }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]}, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] # TODO: Use custom optimizer like with BertSum? optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total ) if ( args["model_name"] and os.path.isfile(os.path.join(args["model_name"], "optimizer.pt")) and os.path.isfile(os.path.join(args["model_name"], "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args["model_name"], "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args["model_name"], "scheduler.pt"))) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"], mininterval=0) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args["model_name"] and os.path.exists(args["model_name"]): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args["model_name"].split("/")[-1].split("-") if len(checkpoint_suffix) > 2: checkpoint_suffix = checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args["gradient_accumulation_steps"]) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args["gradient_accumulation_steps"] ) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") if args["evaluate_during_training"]: training_progress_scores = 
self._create_training_progress_scores(**kwargs) if args["wandb_project"]: wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"]) wandb.watch(self.model) model.train() for current_epoch in train_iterator: if epochs_trained > 0: epochs_trained -= 1 continue # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue # batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args["n_gpu"] > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # torch.nn.utils.clip_grad_norm_( # amp.master_params(optimizer), args["max_grad_norm"] # ) else: loss.backward() # torch.nn.utils.clip_grad_norm_( # model.parameters(), args["max_grad_norm"] # ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: if args["fp16"]: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"]) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step) logging_loss = tr_loss if args["wandb_project"]: wandb.log( { "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, } ) if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) self._save_model(output_dir_current, optimizer, scheduler, model=model) if args["evaluate_during_training"] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0 ): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if args["save_eval_checkpoints"]: self._save_model(output_dir_current, optimizer, scheduler, model=model, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False, ) if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args["early_stopping_metric"]] if args["save_best_model"]: self._save_model( args["best_model_dir"], optimizer, scheduler, 
model=model, results=results ) if best_eval_metric and args["early_stopping_metric_minimize"]: if ( results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"] ): best_eval_metric = results[args["early_stopping_metric"]] if args["save_best_model"]: self._save_model( args["best_model_dir"], optimizer, scheduler, model=model, results=results ) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args["early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args['early_stopping_metric']}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args['early_stopping_patience']}") else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if ( results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"] ): best_eval_metric = results[args["early_stopping_metric"]] if args["save_best_model"]: self._save_model( args["best_model_dir"], optimizer, scheduler, model=model, results=results ) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args["early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args['early_stopping_metric']}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args['early_stopping_patience']}") else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args["save_model_every_epoch"] or args["evaluate_during_training"]: os.makedirs(output_dir_current, exist_ok=True) if args["save_model_every_epoch"]: self._save_model(output_dir_current, optimizer, scheduler, model=model) if args["evaluate_during_training"]: results = self.eval_model( eval_data, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs ) if args["save_eval_checkpoints"]: self._save_model(output_dir_current, optimizer, scheduler, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False) if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args["early_stopping_metric"]] if args["save_best_model"]: self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) if best_eval_metric and args["early_stopping_metric_minimize"]: if results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]: best_eval_metric = results[args["early_stopping_metric"]] if args["save_best_model"]: self._save_model( args["best_model_dir"], optimizer, scheduler, model=model, results=results ) early_stopping_counter = 0 else: if args["use_early_stopping"] and args["early_stopping_consider_epochs"]: if early_stopping_counter < args["early_stopping_patience"]: early_stopping_counter 
+= 1 if verbose: logger.info(f" No improvement in {args['early_stopping_metric']}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args['early_stopping_patience']}") else: if verbose: logger.info(f" Patience of {args['early_stopping_patience']} steps reached") logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]: best_eval_metric = results[args["early_stopping_metric"]] if args["save_best_model"]: self._save_model( args["best_model_dir"], optimizer, scheduler, model=model, results=results ) early_stopping_counter = 0 else: if args["use_early_stopping"] and args["early_stopping_consider_epochs"]: if early_stopping_counter < args["early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args['early_stopping_metric']}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args['early_stopping_patience']}") else: if verbose: logger.info(f" Patience of {args['early_stopping_patience']} steps reached") logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step return global_step, tr_loss / global_step
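# Hedged, self-contained sketch of the optimizer/scheduler setup used by train() above
# (parameter grouping that exempts biases and LayerNorm weights from weight decay, plus
# linear warmup then decay). The toy model, hyperparameters, and step counts are
# assumptions; the real method derives t_total and warmup_steps from the dataloader and args.
from collections import OrderedDict

import torch

toy_model = torch.nn.Sequential(OrderedDict([
    ("linear", torch.nn.Linear(8, 8)),
    ("LayerNorm", torch.nn.LayerNorm(8)),
]))
no_decay = ["bias", "LayerNorm.weight"]
grouped_parameters = [
    {"params": [p for n, p in toy_model.named_parameters() if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},
    {"params": [p for n, p in toy_model.named_parameters() if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped_parameters, lr=4e-5, eps=1e-8)

t_total, warmup_steps = 1000, 100  # assumed; train() computes these from the dataloader length

def linear_warmup_then_decay(step: int) -> float:
    # Mirrors the shape of get_linear_schedule_with_warmup: ramp up, then decay to zero.
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1, t_total - warmup_steps))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, linear_warmup_then_decay)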
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. If `self.targeted` is true, then `y_val` represents the target labels. Otherwise, the targets are the original class labels. :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes) x_adv = x.astype(ART_NUMPY_DTYPE) if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values else: clip_min, clip_max = np.amin(x), np.amax(x) # Assert that, if attack is targeted, y_val is provided: if self.targeted and y is None: raise ValueError("Target labels `y` need to be provided for a targeted attack.") # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size)) # Compute perturbation with implicit batching nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size))) for batch_id in trange(nb_batches, desc="C&W L_2", disable=not self.verbose): batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size x_batch = x_adv[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] # The optimization is performed in tanh space to keep the adversarial images bounded in correct range x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother) # Initialize binary search: c_current = self.initial_const * np.ones(x_batch.shape[0]) c_lower_bound = np.zeros(x_batch.shape[0]) c_double = np.ones(x_batch.shape[0]) > 0 # Initialize placeholders for best l2 distance and attack found so far best_l2dist = np.inf * np.ones(x_batch.shape[0]) best_x_adv_batch = x_batch.copy() for bss in range(self.binary_search_steps): logger.debug( "Binary search step %i out of %i (c_mean==%f)", bss, self.binary_search_steps, np.mean(c_current), ) nb_active = int(np.sum(c_current < self._c_upper_bound)) logger.debug( "Number of samples with c_current < _c_upper_bound: %i out of %i", nb_active, x_batch.shape[0], ) if nb_active == 0: break learning_rate = self.learning_rate * np.ones(x_batch.shape[0]) # Initialize perturbation in tanh space: x_adv_batch = x_batch.copy() x_adv_batch_tanh = x_batch_tanh.copy() z_logits, l2dist, loss = self._loss(x_batch, x_adv_batch, y_batch, c_current) attack_success = loss - l2dist <= 0 overall_attack_success = attack_success for i_iter in range(self.max_iter): logger.debug("Iteration step %i out of %i", i_iter, self.max_iter) logger.debug("Average Loss: %f", np.mean(loss)) logger.debug("Average L2Dist: %f", np.mean(l2dist)) logger.debug("Average Margin Loss: %f", np.mean(loss - l2dist)) logger.debug( "Current number of succeeded attacks: %i out of %i", int(np.sum(attack_success)), len(attack_success), ) improved_adv = attack_success & (l2dist < best_l2dist) logger.debug("Number of improved L2 distances: %i", int(np.sum(improved_adv))) if np.sum(improved_adv) > 0: best_l2dist[improved_adv] = l2dist[improved_adv] best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv] active = (c_current < self._c_upper_bound) & (learning_rate > 0) nb_active = int(np.sum(active)) logger.debug( "Number of samples with c_current < _c_upper_bound and learning_rate > 0: %i 
out of %i", nb_active, x_batch.shape[0], ) if nb_active == 0: break # compute gradient: logger.debug("Compute loss gradient") perturbation_tanh = -self._loss_gradient( z_logits[active], y_batch[active], x_batch[active], x_adv_batch[active], x_adv_batch_tanh[active], c_current[active], clip_min, clip_max, ) # perform line search to optimize perturbation # first, halve the learning rate until perturbation actually decreases the loss: prev_loss = loss.copy() best_loss = loss.copy() best_lr = np.zeros(x_batch.shape[0]) halving = np.zeros(x_batch.shape[0]) for i_halve in range(self.max_halving): logger.debug( "Perform halving iteration %i out of %i", i_halve, self.max_halving, ) do_halving = loss[active] >= prev_loss[active] logger.debug( "Halving to be performed on %i samples", int(np.sum(do_halving)), ) if np.sum(do_halving) == 0: break active_and_do_halving = active.copy() active_and_do_halving[active] = do_halving lr_mult = learning_rate[active_and_do_halving] for _ in range(len(x.shape) - 1): lr_mult = lr_mult[:, np.newaxis] x_adv1 = x_adv_batch_tanh[active_and_do_halving] new_x_adv_batch_tanh = x_adv1 + lr_mult * perturbation_tanh[do_halving] new_x_adv_batch = tanh_to_original(new_x_adv_batch_tanh, clip_min, clip_max) _, l2dist[active_and_do_halving], loss[active_and_do_halving] = self._loss( x_batch[active_and_do_halving], new_x_adv_batch, y_batch[active_and_do_halving], c_current[active_and_do_halving], ) logger.debug("New Average Loss: %f", np.mean(loss)) logger.debug("New Average L2Dist: %f", np.mean(l2dist)) logger.debug("New Average Margin Loss: %f", np.mean(loss - l2dist)) best_lr[loss < best_loss] = learning_rate[loss < best_loss] best_loss[loss < best_loss] = loss[loss < best_loss] learning_rate[active_and_do_halving] /= 2 halving[active_and_do_halving] += 1 learning_rate[active] *= 2 # if no halving was actually required, double the learning rate as long as this # decreases the loss: for i_double in range(self.max_doubling): logger.debug( "Perform doubling iteration %i out of %i", i_double, self.max_doubling, ) do_doubling = (halving[active] == 1) & (loss[active] <= best_loss[active]) logger.debug( "Doubling to be performed on %i samples", int(np.sum(do_doubling)), ) if np.sum(do_doubling) == 0: break active_and_do_doubling = active.copy() active_and_do_doubling[active] = do_doubling learning_rate[active_and_do_doubling] *= 2 lr_mult = learning_rate[active_and_do_doubling] for _ in range(len(x.shape) - 1): lr_mult = lr_mult[:, np.newaxis] x_adv2 = x_adv_batch_tanh[active_and_do_doubling] new_x_adv_batch_tanh = x_adv2 + lr_mult * perturbation_tanh[do_doubling] new_x_adv_batch = tanh_to_original(new_x_adv_batch_tanh, clip_min, clip_max) _, l2dist[active_and_do_doubling], loss[active_and_do_doubling] = self._loss( x_batch[active_and_do_doubling], new_x_adv_batch, y_batch[active_and_do_doubling], c_current[active_and_do_doubling], ) logger.debug("New Average Loss: %f", np.mean(loss)) logger.debug("New Average L2Dist: %f", np.mean(l2dist)) logger.debug("New Average Margin Loss: %f", np.mean(loss - l2dist)) best_lr[loss < best_loss] = learning_rate[loss < best_loss] best_loss[loss < best_loss] = loss[loss < best_loss] learning_rate[halving == 1] /= 2 update_adv = best_lr[active] > 0 logger.debug( "Number of adversarial samples to be finally updated: %i", int(np.sum(update_adv)), ) if np.sum(update_adv) > 0: active_and_update_adv = active.copy() active_and_update_adv[active] = update_adv best_lr_mult = best_lr[active_and_update_adv] for _ in range(len(x.shape) - 1): best_lr_mult = 
best_lr_mult[:, np.newaxis] x_adv4 = x_adv_batch_tanh[active_and_update_adv] best_lr1 = best_lr_mult * perturbation_tanh[update_adv] x_adv_batch_tanh[active_and_update_adv] = x_adv4 + best_lr1 x_adv6 = x_adv_batch_tanh[active_and_update_adv] x_adv_batch[active_and_update_adv] = tanh_to_original(x_adv6, clip_min, clip_max) ( z_logits[active_and_update_adv], l2dist[active_and_update_adv], loss[active_and_update_adv], ) = self._loss( x_batch[active_and_update_adv], x_adv_batch[active_and_update_adv], y_batch[active_and_update_adv], c_current[active_and_update_adv], ) attack_success = loss - l2dist <= 0 overall_attack_success = overall_attack_success | attack_success # Update depending on attack success: improved_adv = attack_success & (l2dist < best_l2dist) logger.debug("Number of improved L2 distances: %i", int(np.sum(improved_adv))) if np.sum(improved_adv) > 0: best_l2dist[improved_adv] = l2dist[improved_adv] best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv] c_double[overall_attack_success] = False c_current[overall_attack_success] = (c_lower_bound + c_current)[overall_attack_success] / 2 c_old = c_current c_current[~overall_attack_success & c_double] *= 2 c_current1 = (c_current - c_lower_bound)[~overall_attack_success & ~c_double] c_current[~overall_attack_success & ~c_double] += c_current1 / 2 c_lower_bound[~overall_attack_success] = c_old[~overall_attack_success] x_adv[batch_index_1:batch_index_2] = best_x_adv_batch logger.info( "Success rate of C&W L_2 attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
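# Hedged sketch of the tanh-space change of variables that the C&W loop above depends on
# (original_to_tanh / tanh_to_original are imported helpers, not shown here). The smoothing
# constant is an assumption; the mapping keeps candidates inside [clip_min, clip_max].
import numpy as np

def original_to_tanh_sketch(x, clip_min, clip_max, tanh_smoother=0.999999):
    x_01 = (np.clip(x, clip_min, clip_max) - clip_min) / (clip_max - clip_min)
    return np.arctanh((x_01 * 2.0 - 1.0) * tanh_smoother)

def tanh_to_original_sketch(x_tanh, clip_min, clip_max):
    return (np.tanh(x_tanh) + 1.0) / 2.0 * (clip_max - clip_min) + clip_min

x_demo = np.random.uniform(0.0, 1.0, size=(2, 3))
round_trip = tanh_to_original_sketch(original_to_tanh_sketch(x_demo, 0.0, 1.0), 0.0, 1.0)
print(np.max(np.abs(round_trip - x_demo)))  # tiny reconstruction error from the smoother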
def train(self, model_path: Optional[str] = None): """ Main training entry point. Args: model_path: (Optional) Local path to model if model to train has been instantiated from a local path If present, we will try reloading the optimizer/scheduler states from there. """ train_dataloader = self.get_train_dataloader() if self.args.max_steps > 0: t_total = self.args.max_steps num_train_epochs = ( self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 ) else: t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) num_train_epochs = self.args.num_train_epochs optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if ( model_path is not None and os.path.isfile(os.path.join(model_path, "optimizer.pt")) and os.path.isfile(os.path.join(model_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device) ) scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt"))) model = self.model if self.args.fp16: if not is_apex_available(): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=True, ) if self.tb_writer is not None: self.tb_writer.add_text("args", self.args.to_json_string()) self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) # Train! if is_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_examples(train_dataloader)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per device = %d", self.args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", total_train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) self.global_step = 0 self.epoch = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path is not None: # set global_step to global_step of last saved checkpoint from model path try: self.global_step = int(model_path.split("-")[-1].split("/")[0]) epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = self.global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps ) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 logger.info(" Starting fine-tuning.") tr_loss = 0.0 logging_loss = 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master() ) self.eval_history = [] for epoch in train_iterator: if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) epoch_iterator = tqdm(train_dataloader, desc=f"Epoch-{epoch}", disable=not self.is_local_master()) for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue if self.args.do_aug: if self.args.aug_type == 'span_cutoff': step_loss = self._training_step_with_span_cutoff(model, inputs, optimizer) elif self.args.aug_type == 'token_cutoff': step_loss = self._training_step_with_token_cutoff(model, inputs, optimizer) elif self.args.aug_type == 'dim_cutoff': step_loss = self._training_step_with_dim_cutoff(model, inputs, optimizer) else: raise NotImplementedError else: step_loss = self._training_step(model, inputs, optimizer) tr_loss += step_loss if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= self.args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator) ): if self.args.max_grad_norm > 0: if self.args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) if is_tpu_available(): xm.optimizer_step(optimizer) else: optimizer.step() scheduler.step() model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step ): logs: Dict[str, float] = {} logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps # backward compatibility for pytorch schedulers logs["learning_rate"] = ( scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else scheduler.get_lr()[0] ) logging_loss = tr_loss print() self._log(logs) # if self.args.evaluate_during_training and self.args.save_steps % self.args.logging_steps == 0: # self.evaluate() if self.is_world_master() 
and self.args.evaluate_during_training and \ self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0: self.evaluate_and_save_model(model, optimizer, scheduler) if self.args.max_steps > 0 and self.global_step > self.args.max_steps: epoch_iterator.close() break if self.args.max_steps > 0 and self.global_step > self.args.max_steps: train_iterator.close() break if self.is_world_master() and self.args.evaluate_during_training: self.evaluate_and_save_model(model, optimizer, scheduler) if self.args.tpu_metrics_debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) if self.tb_writer: self.tb_writer.close() logger.info("\n\nTraining completed.\n\n") self.eval_history = sorted(self.eval_history, key=lambda x: x[0]) for x in self.eval_history: del x[-1] report_results(self.eval_header, self.eval_history, axis=self.eval_key_axis) return TrainOutput(self.global_step, tr_loss / self.global_step)
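# Hedged illustration only: the cutoff helpers called above (_training_step_with_span_cutoff
# and friends) are not shown, so this sketches one common reading of "span cutoff" --
# zeroing a random contiguous span of token positions in the attention mask before the
# forward pass. Shapes, the ratio, and the function name are assumptions.
import torch

def span_cutoff_attention_mask(attention_mask: torch.Tensor, cutoff_ratio: float = 0.1) -> torch.Tensor:
    masked = attention_mask.clone()
    batch_size, seq_len = masked.shape
    span = max(1, int(seq_len * cutoff_ratio))
    for row in range(batch_size):
        start = int(torch.randint(0, seq_len - span + 1, (1,)))
        masked[row, start:start + span] = 0
    return masked

print(span_cutoff_attention_mask(torch.ones(2, 16, dtype=torch.long)))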
def train(self, model_path: Optional[str] = None): """ Main training entry point. Args: model_path (:obj:`str`, `optional`): Local path to the model if the model to train has been instantiated from a local path. If present, training will resume from the optimizer/scheduler states loaded here. """ train_dataloader = self.get_train_dataloader() if self.args.max_steps > 0: t_total = self.args.max_steps num_train_epochs = ( self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 ) else: t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) num_train_epochs = self.args.num_train_epochs optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if ( model_path is not None and os.path.isfile(os.path.join(model_path, "optimizer.pt")) and os.path.isfile(os.path.join(model_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device) ) scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt"))) model = self.model if self.args.fp16: if not is_apex_available(): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=True, ) if self.tb_writer is not None: self.tb_writer.add_text("args", self.args.to_json_string()) self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) # Train! if is_torch_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_examples(train_dataloader)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", total_train_batch_size) logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) self.global_step = 0 self.epoch = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if model_path is not None: # set global_step to global_step of last saved checkpoint from model path try: self.global_step = int(model_path.split("-")[-1].split("/")[0]) epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps) steps_trained_in_current_epoch = self.global_step % ( len(train_dataloader) // self.args.gradient_accumulation_steps ) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", self.global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: self.global_step = 0 logger.info(" Starting fine-tuning.") tr_loss = 0.0 logging_loss = 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master() ) for epoch in train_iterator: if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) if is_torch_tpu_available(): parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader( self.args.device ) epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master()) else: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master()) # Reset the past mems state at the beginning of each epoch if necessary. 
if self.args.past_index >= 0: self._past = None for step, inputs in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue tr_loss += self.training_step(model, inputs, optimizer) if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps len(epoch_iterator) <= self.args.gradient_accumulation_steps and (step + 1) == len(epoch_iterator) ): if self.args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm) if is_torch_tpu_available(): xm.optimizer_step(optimizer) else: optimizer.step() scheduler.step() model.zero_grad() self.global_step += 1 self.epoch = epoch + (step + 1) / len(epoch_iterator) if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or ( self.global_step == 1 and self.args.logging_first_step ): logs: Dict[str, float] = {} logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps # backward compatibility for pytorch schedulers logs["learning_rate"] = ( scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else scheduler.get_lr()[0] ) logging_loss = tr_loss self.log(logs) if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0: self.evaluate() if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0: # In all cases (even distributed/parallel), self.model is always a reference # to the model we want to save. if hasattr(model, "module"): assert model.module is self.model else: assert model is self.model # Save model checkpoint output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}") self.save_model(output_dir) if self.is_world_master(): self._rotate_checkpoints() if is_torch_tpu_available(): xm.rendezvous("saving_optimizer_states") xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) elif self.is_world_master(): torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) if self.args.max_steps > 0 and self.global_step > self.args.max_steps: epoch_iterator.close() break if self.args.max_steps > 0 and self.global_step > self.args.max_steps: train_iterator.close() break if self.args.tpu_metrics_debug or self.args.debug: # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) if self.tb_writer: self.tb_writer.close() if self.args.past_index and hasattr(self, "_past"): # Clean the state at the end of training delattr(self, "_past") logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") return TrainOutput(self.global_step, tr_loss / self.global_step)
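# Small worked example of the checkpoint-resume arithmetic used above: the global step is
# parsed from a directory name such as ".../checkpoint-500" and split into full epochs
# already trained plus steps to skip in the current epoch. The dataloader length and
# accumulation steps are assumed values.
model_path = "outputs/checkpoint-500"
global_step = int(model_path.split("-")[-1].split("/")[0])            # 500
len_dataloader, gradient_accumulation_steps = 120, 2
updates_per_epoch = len_dataloader // gradient_accumulation_steps     # 60
epochs_trained = global_step // updates_per_epoch                     # 8
steps_trained_in_current_epoch = global_step % updates_per_epoch      # 20
print(epochs_trained, steps_trained_in_current_epoch)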
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations. Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any features for which the mask is zero will not be adversarially perturbed. :type mask: `np.ndarray` :return: An array holding the adversarial examples. """ mask = kwargs.get("mask") y = check_and_transform_label_format(y, self.estimator.nb_classes) if y is None: if self.targeted: raise ValueError( "Target labels `y` need to be provided for a targeted attack." ) y = get_labels_np_array( self.estimator.predict(x, batch_size=self.batch_size)).astype( np.int32) x_adv = x.astype(ART_NUMPY_DTYPE) for _ in trange(max(1, self.nb_random_init), desc="AutoPGD - restart", disable=not self.verbose): # Determine correctly predicted samples y_pred = self.estimator.predict(x_adv) if self.targeted: sample_is_robust = np.argmax(y_pred, axis=1) != np.argmax( y, axis=1) elif not self.targeted: sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y, axis=1) if np.sum(sample_is_robust) == 0: break x_robust = x_adv[sample_is_robust] y_robust = y[sample_is_robust] x_init = x[sample_is_robust] n = x_robust.shape[0] m = np.prod(x_robust.shape[1:]).item() random_perturbation = (random_sphere( n, m, self.eps, self.norm).reshape(x_robust.shape).astype(ART_NUMPY_DTYPE)) x_robust = x_robust + random_perturbation if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values x_robust = np.clip(x_robust, clip_min, clip_max) perturbation = projection(x_robust - x_init, self.eps, self.norm) x_robust = x_init + perturbation # Compute perturbation with implicit batching for batch_id in trange( int(np.ceil(x_robust.shape[0] / float(self.batch_size))), desc="AutoPGD - batch", leave=False, disable=not self.verbose, ): self.eta = 2 * self.eps_step batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_k = x_robust[batch_index_1:batch_index_2].astype( ART_NUMPY_DTYPE) x_init_batch = x_init[batch_index_1:batch_index_2].astype( ART_NUMPY_DTYPE) y_batch = y_robust[batch_index_1:batch_index_2] p_0 = 0 p_1 = 0.22 W = [p_0, p_1] while True: p_j_p_1 = W[-1] + max(W[-1] - W[-2] - 0.03, 0.06) if p_j_p_1 > 1: break W.append(p_j_p_1) W = [math.ceil(p * self.max_iter) for p in W] eta = self.eps_step self.count_condition_1 = 0 for k_iter in trange(self.max_iter, desc="AutoPGD - iteration", leave=False, disable=not self.verbose): # Get perturbation, use small scalar to avoid division by 0 tol = 10e-8 # Get gradient wrt loss; invert it if attack is targeted grad = self.estimator.loss_gradient( x_k, y_batch) * (1 - 2 * int(self.targeted)) # Apply norm bound if self.norm in [np.inf, "inf"]: grad = np.sign(grad) elif self.norm == 1: ind = tuple(range(1, len(x_k.shape))) grad = grad / (np.sum( np.abs(grad), axis=ind, keepdims=True) + tol) elif self.norm == 2: ind = tuple(range(1, len(x_k.shape))) grad = 
grad / (np.sqrt( np.sum(np.square(grad), axis=ind, keepdims=True)) + tol) assert x_k.shape == grad.shape perturbation = grad if mask is not None: perturbation = perturbation * ( mask.astype(ART_NUMPY_DTYPE)) # Apply perturbation and clip z_k_p_1 = x_k + eta * perturbation if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values z_k_p_1 = np.clip(z_k_p_1, clip_min, clip_max) if k_iter == 0: x_1 = z_k_p_1 perturbation = projection(x_1 - x_init_batch, self.eps, self.norm) x_1 = x_init_batch + perturbation f_0 = self.estimator.loss(x=x_k, y=y_batch, reduction="mean") f_1 = self.estimator.loss(x=x_1, y=y_batch, reduction="mean") self.eta_w_j_m_1 = eta self.f_max_w_j_m_1 = f_0 if f_1 >= f_0: self.f_max = f_1 self.x_max = x_1 self.x_max_m_1 = x_init_batch self.count_condition_1 += 1 else: self.f_max = f_0 self.x_max = x_k.copy() self.x_max_m_1 = x_init_batch # Settings for next iteration k x_k_m_1 = x_k.copy() x_k = x_1 else: perturbation = projection(z_k_p_1 - x_init_batch, self.eps, self.norm) z_k_p_1 = x_init_batch + perturbation alpha = 0.75 x_k_p_1 = x_k + alpha * (z_k_p_1 - x_k) + ( 1 - alpha) * (x_k - x_k_m_1) if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values x_k_p_1 = np.clip(x_k_p_1, clip_min, clip_max) perturbation = projection(x_k_p_1 - x_init_batch, self.eps, self.norm) x_k_p_1 = x_init_batch + perturbation f_k_p_1 = self.estimator.loss(x=x_k_p_1, y=y_batch, reduction="mean") if f_k_p_1 == 0.0: x_k = x_k_p_1.copy() break if (not self.targeted and f_k_p_1 > self.f_max) or ( self.targeted and f_k_p_1 < self.f_max): self.count_condition_1 += 1 self.x_max = x_k_p_1 self.x_max_m_1 = x_k self.f_max = f_k_p_1 if k_iter in W: rho = 0.75 condition_1 = self.count_condition_1 < rho * ( k_iter - W[W.index(k_iter) - 1]) condition_2 = self.eta_w_j_m_1 == eta and self.f_max_w_j_m_1 == self.f_max if condition_1 or condition_2: eta = eta / 2 x_k_m_1 = self.x_max_m_1 x_k = self.x_max else: x_k_m_1 = x_k x_k = x_k_p_1.copy() self.count_condition_1 = 0 self.eta_w_j_m_1 = eta self.f_max_w_j_m_1 = self.f_max else: x_k_m_1 = x_k x_k = x_k_p_1.copy() y_pred_adv_k = self.estimator.predict(x_k) if self.targeted: sample_is_not_robust_k = np.invert( np.argmax(y_pred_adv_k, axis=1) != np.argmax(y_batch, axis=1)) elif not self.targeted: sample_is_not_robust_k = np.invert( np.argmax(y_pred_adv_k, axis=1) == np.argmax(y_batch, axis=1)) x_robust[batch_index_1:batch_index_2][ sample_is_not_robust_k] = x_k[sample_is_not_robust_k] x_adv[sample_is_robust] = x_robust return x_adv
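# Standalone restatement of the norm-dependent gradient scaling inside the AutoPGD loop
# above: sign for L-infinity, per-sample L1/L2 normalisation otherwise. Pure numpy; the
# tolerance matches the `tol` used in the loop.
import numpy as np

def scale_gradient(grad: np.ndarray, norm, tol: float = 10e-8) -> np.ndarray:
    if norm in [np.inf, "inf"]:
        return np.sign(grad)
    ind = tuple(range(1, grad.ndim))
    if norm == 1:
        return grad / (np.sum(np.abs(grad), axis=ind, keepdims=True) + tol)
    if norm == 2:
        return grad / (np.sqrt(np.sum(np.square(grad), axis=ind, keepdims=True)) + tol)
    raise ValueError("norm must be 1, 2, or np.inf")

print(scale_gradient(np.random.randn(4, 3, 8, 8), norm=2).shape)  # (4, 3, 8, 8)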
import pathlib
import sys

import numpy as np
from sten import Sten
from tqdm.auto import tqdm, trange
from multiprocessing.pool import ThreadPool as Pool


def create(x):
    # Encode and decode every (set1, set2) image pair at bit-plane `x`.
    st = Sten(x)
    pathlib.Path("./encodedArray/bit_{0}".format(x)).mkdir(parents=True, exist_ok=True)
    pathlib.Path("./decodedArray/bit_{0}".format(x)).mkdir(parents=True, exist_ok=True)
    for set1File in trange(1, int(sys.argv[1]) + 1):
        for set2File in trange(1, int(sys.argv[2]) + 1):
            name = str(set1File) + '_' + str(set2File)
            encImg = st.encode("./data/set1/{}.jpg".format(set1File),
                               "./data/set2/{}.jpg".format(set2File),
                               "./encodedArray/bit_{0}/{1}.npy".format(x, name))
            decImg = st.decode("./encodedArray/bit_{0}/{1}.npy".format(x, name),
                               "./decodedArray/bit_{0}/{1}.npy".format(x, name))


pool_size = 9
pool = Pool(pool_size)
for x in trange(0, 9):
    pool.apply_async(create, (x,))
pool.close()
pool.join()
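# Hedged alternative, not part of the original script: create() is CPU-bound, so the
# ThreadPool above mostly serialises on the GIL. A process pool keeps the same call
# pattern while using separate interpreters; the pool size below is an assumption.
from multiprocessing import Pool as ProcessPool

if __name__ == "__main__":
    with ProcessPool(processes=9) as process_pool:
        process_pool.map(create, range(9))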
def main(config): pprint(config) batch_size = config['batch_size'] epochs = config['epochs'] hidden_dim = config['hidden_dim'] embedding_dim = config['embed_dim'] num_layers = config['num_layers'] dropout = config['dropout'] learning_rate = config['learning_rate'] scale = config['scale'] number_of_runs = config['num_runs'] metrics_dict = {} data_dir = config['data_dir'] epsilon = config['epsilon'] for i in trange(number_of_runs): data_name = os.path.join(data_dir, f'reddit-bert.pkl') with open(data_name, 'rb') as f: df = pickle.load(f) df_train, df_test, _, __ = train_test_split( df, df['label'].tolist(), test_size=0.2, stratify=df['label'].tolist()) train_dataset = RedditDataset(df_train.label.values, df_train.enc.values) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=pad_collate_reddit, shuffle=True) test_dataset = RedditDataset(df_test.label.values, df_test.enc.values) test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=pad_collate_reddit) model = AdvRedditModel(embedding_dim, hidden_dim, num_layers, dropout, epsilon) device = 'cuda' if torch.cuda.is_available() else 'cpu' model.to(device) print(device) optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.03) scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=epochs) early_stop_counter = 0 early_stop_limit = config['early_stop'] best_model_wts = copy.deepcopy(model.state_dict()) best_loss = np.inf for _ in trange(epochs, leave=False): loss, accuracy = train_loop(model, train_dataloader, optimizer, device, len(train_dataset), scale) if scheduler is not None: scheduler.step() if loss >= best_loss: early_stop_counter += 1 else: best_model_wts = copy.deepcopy(model.state_dict()) early_stop_counter = 0 best_loss = loss if early_stop_counter == early_stop_limit: break model.load_state_dict(best_model_wts) _, _, y_pred, y_true, conf = eval_loop(model, test_dataloader, device, len(test_dataset), scale) m = gr_metrics(y_pred, y_true) if 'Precision' in metrics_dict: metrics_dict['Precision'].append(m[0]) metrics_dict['Recall'].append(m[1]) metrics_dict['FScore'].append(m[2]) metrics_dict['OE'].append(m[3]) metrics_dict['all'].append([y_pred, y_true]) else: metrics_dict['Precision'] = [m[0]] metrics_dict['Recall'] = [m[1]] metrics_dict['FScore'] = [m[2]] metrics_dict['OE'] = [m[3]] metrics_dict['all'] = [[y_pred, y_true]] df = pd.DataFrame(metrics_dict) df.to_csv(f'{datetime.now().__format__("%d%m%y_%H%M%S")}_df.csv') return df['FScore'].median()
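# Hypothetical config for main() above. The keys are exactly the ones the function reads
# (it expects data_dir to contain reddit-bert.pkl); the values are illustrative assumptions.
example_config = {
    'batch_size': 32,
    'epochs': 50,
    'hidden_dim': 128,
    'embed_dim': 768,
    'num_layers': 2,
    'dropout': 0.3,
    'learning_rate': 1e-4,
    'scale': 1.0,
    'num_runs': 5,
    'data_dir': 'data/',
    'epsilon': 0.1,
    'early_stop': 7,
}
# median_fscore = main(example_config)  # requires data_dir/reddit-bert.pkl to exist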
def generate( # pylint: disable=W0221 self, x: np.ndarray, y: Optional[np.ndarray] = None, target_label: Optional[Union[int, List[int], np.ndarray]] = None, **kwargs) -> np.ndarray: """ Generate DPatch. :param x: Sample images. :param y: Target labels for object detector. :param target_label: The target label of the DPatch attack. :param mask: An boolean array of shape equal to the shape of a single samples (1, H, W) or the shape of `x` (N, H, W) without their channel dimensions. Any features for which the mask is True can be the center location of the patch during sampling. :type mask: `np.ndarray` :return: Adversarial patch. """ mask = kwargs.get("mask") if mask is not None: mask = mask.copy() if mask is not None and (mask.dtype != np.bool or not ( mask.shape[0] == 1 or mask.shape[0] == x.shape[0] ) or not ( (mask.shape[1] == x.shape[1] and mask.shape[2] == x.shape[2]) or (mask.shape[1] == x.shape[2] and mask.shape[2] == x.shape[3]))): raise ValueError( "The shape of `mask` has to be equal to the shape of a single samples (1, H, W) or the" "shape of `x` (N, H, W) without their channel dimensions.") channel_index = 1 if self.estimator.channels_first else x.ndim - 1 if x.shape[channel_index] != self.patch_shape[channel_index - 1]: raise ValueError( "The color channel index of the images and the patch have to be identical." ) if y is not None: raise ValueError("The DPatch attack does not use target labels.") if x.ndim != 4: raise ValueError( "The adversarial patch can only be applied to images.") if target_label is not None: if isinstance(target_label, int): self.target_label = [target_label] * x.shape[0] elif isinstance(target_label, np.ndarray): if not (target_label.shape == (x.shape[0], 1) or target_label.shape == (x.shape[0], )): raise ValueError( "The target_label has to be a 1-dimensional array.") self.target_label = target_label.tolist() else: if not len(target_label) == x.shape[0] or not isinstance( target_label, list): raise ValueError( "The target_label as list of integers needs to of length number of images in `x`." 
) self.target_label = target_label patched_images, transforms = self._augment_images_with_patch( x, self._patch, random_location=True, channels_first=self.estimator.channels_first, mask=mask, transforms=None, ) patch_target: List[Dict[str, np.ndarray]] = list() if self.target_label: for i_image in range(patched_images.shape[0]): if isinstance(self.target_label, int): t_l = self.target_label else: t_l = self.target_label[i_image] i_x_1 = transforms[i_image]["i_x_1"] i_x_2 = transforms[i_image]["i_x_2"] i_y_1 = transforms[i_image]["i_y_1"] i_y_2 = transforms[i_image]["i_y_2"] target_dict = dict() target_dict["boxes"] = np.asarray( [[i_x_1, i_y_1, i_x_2, i_y_2]]) target_dict["labels"] = np.asarray([ t_l, ]) target_dict["scores"] = np.asarray([ 1.0, ]) patch_target.append(target_dict) else: predictions = self.estimator.predict(x=patched_images) for i_image in range(patched_images.shape[0]): target_dict = dict() target_dict["boxes"] = predictions[i_image]["boxes"] target_dict["labels"] = predictions[i_image]["labels"] target_dict["scores"] = predictions[i_image]["scores"] patch_target.append(target_dict) for i_step in trange(self.max_iter, desc="DPatch iteration", disable=not self.verbose): if i_step == 0 or (i_step + 1) % 100 == 0: logger.info("Training Step: %i", i_step + 1) num_batches = math.ceil(x.shape[0] / self.batch_size) patch_gradients = np.zeros_like(self._patch) for i_batch in range(num_batches): i_batch_start = i_batch * self.batch_size i_batch_end = min((i_batch + 1) * self.batch_size, patched_images.shape[0]) gradients = self.estimator.loss_gradient( x=patched_images[i_batch_start:i_batch_end], y=patch_target[i_batch_start:i_batch_end], ) for i_image in range(gradients.shape[0]): i_x_1 = transforms[i_batch_start + i_image]["i_x_1"] i_x_2 = transforms[i_batch_start + i_image]["i_x_2"] i_y_1 = transforms[i_batch_start + i_image]["i_y_1"] i_y_2 = transforms[i_batch_start + i_image]["i_y_2"] if self.estimator.channels_first: patch_gradients_i = gradients[i_image, :, i_x_1:i_x_2, i_y_1:i_y_2] else: patch_gradients_i = gradients[i_image, i_x_1:i_x_2, i_y_1:i_y_2, :] patch_gradients = patch_gradients + patch_gradients_i if self.target_label: self._patch = self._patch - np.sign( patch_gradients) * self.learning_rate else: self._patch = self._patch + np.sign( patch_gradients) * self.learning_rate if self.estimator.clip_values is not None: self._patch = np.clip( self._patch, a_min=self.estimator.clip_values[0], a_max=self.estimator.clip_values[1], ) patched_images, _ = self._augment_images_with_patch( x, self._patch, random_location=False, channels_first=self.estimator.channels_first, mask=None, transforms=transforms, ) return self._patch
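# Hedged sketch of what patch application looks like (the _augment_images_with_patch helper
# used above is not shown): paste the patch into each image at a given top-left corner.
# A channels-last layout and the coordinates below are assumptions.
import numpy as np

def apply_patch(images: np.ndarray, patch: np.ndarray, i_x_1: int, i_y_1: int) -> np.ndarray:
    patched = images.copy()
    height, width = patch.shape[0], patch.shape[1]
    patched[:, i_x_1:i_x_1 + height, i_y_1:i_y_1 + width, :] = patch
    return patched

demo = apply_patch(np.zeros((2, 64, 64, 3)), np.ones((8, 8, 3)), i_x_1=10, i_y_1=20)
print(int(demo.sum()))  # 2 * 8 * 8 * 3 = 384 patched values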
# Create worker processes
print(" - Creating worker processes")
ps = [Process(target=worker, args=(inQueue, outQueue)) for _ in range(njobs)]

# Start worker processes
print(" - Starting worker processes")
for p in ps:
    p.start()

# Fill the queue
print(" - Filling up the queue")
for i in trange(ncases):
    inQueue.put(i)

# Now running the processes
print(" - Running the processes")
output = [outQueue.get() for _ in trange(ncases)]

# Send stop signal to stop iteration
for _ in range(njobs):
    inQueue.put('STOP')

# Stop processes
print(" - Stopping processes")
for p in ps:
    p.join()
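# Hedged sketch of the worker() assumed by the snippet above: pull case indices from
# inQueue until the 'STOP' sentinel arrives and push one result per case to outQueue.
def worker(inQueue, outQueue):
    for case in iter(inQueue.get, 'STOP'):
        result = case  # placeholder for the real per-case computation (assumption)
        outQueue.put(result)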
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: An array with the original labels to be predicted. :return: An array holding the adversarial examples. """ x_adv = x.astype(ART_NUMPY_DTYPE) preds = self.estimator.predict(x_adv, batch_size=self.batch_size) if (preds < 0.0).any() or (preds > 1.0).any(): raise TypeError( "This attack requires a classifier predicting probabilities in the range [0, 1] as output." "Values smaller than 0.0 or larger than 1.0 have been detected." ) # preds_rescaled = self._rescale(preds) # Rescaling needs more testing preds_rescaled = preds # Compute perturbation with implicit batching for batch_id in trange(int( np.ceil(x_adv.shape[0] / float(self.batch_size))), desc="VAT", disable=not self.verbose): batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size batch = x_adv[batch_index_1:batch_index_2] batch = batch.reshape((batch.shape[0], -1)) # Main algorithm for each batch var_d = np.random.randn(*batch.shape).astype(ART_NUMPY_DTYPE) # Main loop of the algorithm for _ in range(self.max_iter): var_d = self._normalize(var_d) preds_new = self.estimator.predict( (batch + var_d).reshape((-1, ) + self.estimator.input_shape)) if (preds_new < 0.0).any() or (preds_new > 1.0).any(): raise TypeError( "This attack requires a classifier predicting probabilities in the range [0, 1] as " "output. Values smaller than 0.0 or larger than 1.0 have been detected." ) # preds_new_rescaled = self._rescale(preds_new) # Rescaling needs more testing preds_new_rescaled = preds_new from scipy.stats import entropy kl_div1 = entropy( np.transpose(preds_rescaled[batch_index_1:batch_index_2]), np.transpose(preds_new_rescaled), ) var_d_new = np.zeros(var_d.shape).astype(ART_NUMPY_DTYPE) for current_index in range(var_d.shape[1]): var_d[:, current_index] += self.finite_diff preds_new = self.estimator.predict( (batch + var_d).reshape((-1, ) + self.estimator.input_shape)) if (preds_new < 0.0).any() or (preds_new > 1.0).any(): raise TypeError( "This attack requires a classifier predicting probabilities in the range [0, 1]" "as output. Values smaller than 0.0 or larger than 1.0 have been detected." ) # preds_new_rescaled = self._rescale(preds_new) # Rescaling needs more testing preds_new_rescaled = preds_new kl_div2 = entropy( np.transpose( preds_rescaled[batch_index_1:batch_index_2]), np.transpose(preds_new_rescaled), ) var_d_new[:, current_index] = (kl_div2 - kl_div1) / self.finite_diff var_d[:, current_index] -= self.finite_diff var_d = var_d_new # Apply perturbation and clip if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values x_adv[batch_index_1:batch_index_2] = np.clip( batch + self.eps * self._normalize(var_d), clip_min, clip_max).reshape((-1, ) + self.estimator.input_shape) else: x_adv[batch_index_1:batch_index_2] = ( batch + self.eps * self._normalize(var_d) ).reshape((-1, ) + self.estimator.input_shape) logger.info( "Success rate of virtual adversarial attack: %.2f%%", 100 * compute_success( self.estimator, x, y, x_adv, batch_size=self.batch_size), ) return x_adv
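# Hedged sketch of the _normalize helper used above (not shown here): rescale each
# flattened perturbation row to unit L2 norm, guarding against division by zero.
import numpy as np

def normalize_rows(d: np.ndarray, tol: float = 1e-12) -> np.ndarray:
    norms = np.sqrt(np.sum(np.square(d), axis=1, keepdims=True))
    return d / np.maximum(norms, tol)

print(np.linalg.norm(normalize_rows(np.random.randn(4, 10)), axis=1))  # ~[1. 1. 1. 1.]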
        int(steps_per_epoch * args.num_epochs * 5 / 6)
    ],
    values=[
        args.initial_lr, args.initial_lr * 0.1, args.initial_lr * 0.01
    ])
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_fn, momentum=args.momentum)

train_log_dir = 'logs/train'
val_log_dir = 'logs/val'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
val_summary_writer = tf.summary.create_file_writer(val_log_dir)

for epoch in trange(args.num_epochs, desc='Epoch'):
    avg_loss = 0.0
    avg_conf_loss = 0.0
    avg_loc_loss = 0.0
    start = time.time()
    for i, (_, imgs, gt_confs, gt_locs) in tqdm(enumerate(batch_generator), desc='Steps', total=steps_per_epoch):
        loss, conf_loss, loc_loss, l2_loss = train_step(
            imgs, gt_confs, gt_locs, ssd, criterion, optimizer)
        avg_loss = (avg_loss * i + loss.numpy()) / (i + 1)
        avg_conf_loss = (avg_conf_loss * i + conf_loss.numpy()) / (i + 1)
        avg_loc_loss = (avg_loc_loss * i + loc_loss.numpy()) / (i + 1)
        if (i + 1) % 10 == 0:
            tqdm.write(
        )
        anchors_total = sum(
            not (len(a["wikidata_ids"]) == 0 and a["wikidata_src"] == "simple")
            for page in wiki.values()
            for a in page["anchors"]
        )
        logging.info(
            "LANG: {} -- Solved {:.2%} of anchors".format(
                lang, anchors_solved / anchors_total
            )
        )
elif args.step == "prepare":
    for lang in args.langs.split("|"):
        results = {}
        for rank in trange(32):
            filename = os.path.join(
                args.base_wikipedia,
                "{}".format(lang),
                "{}wiki{}.pkl".format(lang, rank),
            )
            if os.path.exists(filename):
                logging.info("Loading {}".format(filename))
                with open(filename, "rb") as f:
                    for k, v in pickle.load(f).items():
                        results[k] = v
        filename = os.path.join(
            args.base_wikipedia,
            "{}".format(lang),
            "{}wiki.pkl".format(lang),
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :param y: An array with the original labels to be predicted. :return: An array holding the adversarial examples. """ if len(x.shape) < 3: # pragma: no cover raise ValueError( "Frame saliency attack works only on inputs of dimension greater than 2." ) if self.frame_index >= len(x.shape): # pragma: no cover raise ValueError( "Frame index is out of bounds for the given input shape.") if y is not None: y = check_and_transform_label_format( y, nb_classes=self.estimator.nb_classes) if self.method == "one_shot": if y is None: return self.attacker.generate(x) return self.attacker.generate(x, y) if y is None: # Throw error if attack is targeted, but no targets are provided if hasattr( self.attacker, "targeted" ) and self.attacker.targeted: # type: ignore # pragma: no cover raise ValueError( "Target labels `y` need to be provided for a targeted attack." ) # Use model predictions as correct outputs targets = get_labels_np_array( self.estimator.predict(x, batch_size=self.batch_size)) else: targets = y if self.estimator.nb_classes == 2 and targets.shape[ 1] == 1: # pragma: no cover raise ValueError( "This attack has not yet been tested for binary classification with a single output classifier." ) nb_samples = x.shape[0] nb_frames = x.shape[self.frame_index] x_adv = x.astype(ART_NUMPY_DTYPE) # Determine for which adversarial examples the attack fails: attack_failure = self._compute_attack_failure_array(x, targets, x_adv) # Determine the order in which to perturb frames, based on saliency scores: frames_to_perturb = self._compute_frames_to_perturb(x_adv, targets) # Generate adversarial perturbations. If the method is "iterative_saliency_refresh", we will use a mask so that # only the next frame to be perturbed is considered in the attack; moreover we keep track of the next frames to # be perturbed so they will not be perturbed again later on. 
mask = np.ones(x.shape) if self.method == "iterative_saliency_refresh": mask = np.zeros(x.shape) mask = np.swapaxes(mask, 1, self.frame_index) mask[:, frames_to_perturb[:, 0], ::] = 1 mask = np.swapaxes(mask, 1, self.frame_index) disregard = np.zeros((nb_samples, nb_frames)) disregard[:, frames_to_perturb[:, 0]] = np.inf x_adv_new = self.attacker.generate(x, targets, mask=mask) # Here starts the main iteration: for i in trange(nb_frames, desc="Frame saliency", disable=not self.verbose): # Check if attack has already succeeded for all inputs: if sum(attack_failure) == 0: break # Update designated frames with adversarial perturbations: x_adv = np.swapaxes(x_adv, 1, self.frame_index) x_adv_new = np.swapaxes(x_adv_new, 1, self.frame_index) x_adv[attack_failure, frames_to_perturb[:, i][attack_failure], ::] = x_adv_new[ attack_failure, frames_to_perturb[:, i][attack_failure], ::] x_adv = np.swapaxes(x_adv, 1, self.frame_index) x_adv_new = np.swapaxes(x_adv_new, 1, self.frame_index) # Update for which adversarial examples the attack still fails: attack_failure = self._compute_attack_failure_array( x, targets, x_adv) # For the "refresh" method, update the next frames to be perturbed (disregarding the frames that were # perturbed already) and also refresh the adversarial perturbations: if self.method == "iterative_saliency_refresh" and i < nb_frames - 1: frames_to_perturb = self._compute_frames_to_perturb( x_adv, targets, disregard) mask = np.zeros(x.shape) mask = np.swapaxes(mask, 1, self.frame_index) mask[:, frames_to_perturb[:, i + 1], ::] = 1 mask = np.swapaxes(mask, 1, self.frame_index) disregard[:, frames_to_perturb[:, i + 1]] = np.inf x_adv_new = self.attacker.generate(x_adv, targets, mask=mask) return x_adv
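# A minimal NumPy sketch, separate from the attack above, of the swapaxes
# pattern it uses to overwrite only selected frames along an arbitrary frame
# axis. All names below are illustrative, not part of the original code.
import numpy as np

def replace_frames(x, x_new, frame_ids, frame_axis=1):
    """Copy frame frame_ids[i] of sample i from x_new into x."""
    x = np.swapaxes(x.copy(), 1, frame_axis)   # bring the frame axis to position 1
    x_new = np.swapaxes(x_new, 1, frame_axis)
    rows = np.arange(len(x))
    x[rows, frame_ids] = x_new[rows, frame_ids]
    return np.swapaxes(x, 1, frame_axis)       # restore the original layout

x = np.zeros((2, 4, 3))                        # (batch, frames, features)
out = replace_frames(x, np.ones_like(x), frame_ids=np.array([1, 2]))
assert out[0, 1].sum() == 3 and out[0, 0].sum() == 0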
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial examples and return them as an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). :return: An array holding the adversarial examples. """ if y is not None: y = check_and_transform_label_format(y, self.estimator.nb_classes, return_one_hot=False) x_adv = x.copy() for index in trange(x_adv.shape[0], desc="Decision tree attack", disable=not self.verbose): path = self.estimator.get_decision_path(x_adv[index]) legitimate_class = int( np.argmax(self.estimator.predict(x_adv[index].reshape(1, -1)))) position = -2 adv_path = [-1] ancestor = path[position] while np.abs(position) < (len(path) - 1) or adv_path[0] == -1: ancestor = path[position] current_child = path[position + 1] # search in right subtree if current_child == self.estimator.get_left_child(ancestor): if y is None: adv_path = self._df_subtree( self.estimator.get_right_child(ancestor), legitimate_class) else: adv_path = self._df_subtree( self.estimator.get_right_child(ancestor), legitimate_class, y[index], ) else: # search in left subtree if y is None: adv_path = self._df_subtree( self.estimator.get_left_child(ancestor), legitimate_class) else: adv_path = self._df_subtree( self.estimator.get_left_child(ancestor), legitimate_class, y[index], ) position = position - 1 # we are going the decision path upwards adv_path.append(ancestor) # we figured out which is the way to the target, now perturb # first one is leaf-> no threshold, cannot be perturbed for i in range(1, 1 + len(adv_path[1:])): go_for = adv_path[i - 1] threshold = self.estimator.get_threshold_at_node(adv_path[i]) feature = self.estimator.get_feature_at_node(adv_path[i]) # only perturb if the feature is actually wrong if x_adv[index][ feature] > threshold and go_for == self.estimator.get_left_child( adv_path[i]): x_adv[index][feature] = threshold - self.offset elif x_adv[index][ feature] <= threshold and go_for == self.estimator.get_right_child( adv_path[i]): x_adv[index][feature] = threshold + self.offset return x_adv
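# Illustrative-only sketch of the final perturbation step above: nudge a
# feature value just past a node threshold so the sample follows the other
# branch. `offset` mirrors self.offset; everything else here is made up.
def perturb_past_threshold(x, feature, threshold, go_left, offset=0.001):
    x = list(x)
    if go_left and x[feature] > threshold:          # left child requires value <= threshold
        x[feature] = threshold - offset
    elif not go_left and x[feature] <= threshold:   # right child requires value > threshold
        x[feature] = threshold + offset
    return x

print(perturb_past_threshold([0.7, 0.2], feature=0, threshold=0.5, go_left=True))
# expected: [0.499, 0.2]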
def _fit_cdr(self): import tensorflow as tf from .model import Model n_users = self.train_set.num_users n_items = self.train_set.num_items text_feature = self.train_set.item_text.batch_bow( np.arange(n_items)) # bag of word feature text_feature = (text_feature - text_feature.min()) / ( text_feature.max() - text_feature.min()) # normalization # Build model layer_sizes = ([self.vocab_size] + self.autoencoder_structure + [self.k] + self.autoencoder_structure + [self.vocab_size]) tf.set_random_seed(self.seed) model = Model( n_users=n_users, n_items=n_items, n_vocab=self.vocab_size, k=self.k, layers=layer_sizes, lambda_u=self.lambda_u, lambda_v=self.lambda_v, lambda_w=self.lambda_w, lambda_n=self.lambda_n, lr=self.learning_rate, dropout_rate=self.dropout_rate, U=self.U, V=self.V, act_fn=self.act_fn, seed=self.seed, ) # Training model config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: sess.run(tf.global_variables_initializer()) loop = trange(self.max_iter, disable=not self.verbose) for _ in loop: corruption_mask = self.rng.binomial(1, 1 - self.corruption_rate, (n_items, self.vocab_size)) sum_loss = 0 count = 0 batch_count = 0 for batch_u, batch_i, batch_j in self.train_set.uij_iter( batch_size=self.batch_size, shuffle=True): feed_dict = { model.mask_input: corruption_mask[batch_i, :], model.text_input: text_feature[batch_i, :], model.batch_u: batch_u, model.batch_i: batch_i, model.batch_j: batch_j, } sess.run(model.opt1, feed_dict) # train U, V _, _loss = sess.run([model.opt2, model.loss], feed_dict) # train SDAE sum_loss += _loss count += len(batch_u) batch_count += 1 if batch_count % 10 == 0: loop.set_postfix(loss=(sum_loss / count)) self.U, self.V = sess.run([model.U, model.V]) tf.reset_default_graph() if self.verbose: print("\nLearning completed")
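# Small NumPy sketch of two preprocessing steps used in _fit_cdr above:
# global min-max normalisation of the bag-of-words matrix and the Bernoulli
# corruption mask fed to the denoising autoencoder. Shapes and the corruption
# rate are illustrative.
import numpy as np

rng = np.random.RandomState(0)
text_feature = rng.poisson(1.0, size=(5, 8)).astype(float)    # (n_items, vocab_size)
text_feature = (text_feature - text_feature.min()) / (text_feature.max() - text_feature.min())

corruption_rate = 0.3
corruption_mask = rng.binomial(1, 1 - corruption_rate, text_feature.shape)
corrupted_input = text_feature * corruption_mask              # roughly 30% of entries zeroed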
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes) # Check that `y` is provided for targeted attacks if self.targeted and y is None: # pragma: no cover raise ValueError( "Target labels `y` need to be provided for a targeted attack.") # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array( self.estimator.predict(x, batch_size=self.batch_size)) if self.estimator.nb_classes == 2 and y.shape[ 1] == 1: # pragma: no cover raise ValueError( "This attack has not yet been tested for binary classification with a single output classifier." ) # Compute adversarial examples with implicit batching nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size))) x_adv = [] for batch_id in trange(nb_batches, desc="ZOO", disable=not self.verbose): batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] res = self._generate_batch(x_batch, y_batch) x_adv.append(res) x_adv = np.vstack(x_adv) # Apply clip if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values np.clip(x_adv, clip_min, clip_max, out=x_adv) # Log success rate of the ZOO attack logger.info( "Success rate of ZOO attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
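# Standalone sketch of the implicit batching used above: slice the inputs into
# fixed-size batches, run the per-batch routine, and stack the results.
# `fn` stands in for the _generate_batch call; it is not part of the attack.
import numpy as np

def batched_apply(x, fn, batch_size=4):
    nb_batches = int(np.ceil(x.shape[0] / float(batch_size)))
    out = [fn(x[b * batch_size:(b + 1) * batch_size]) for b in range(nb_batches)]
    return np.vstack(out)

x = np.arange(10, dtype=float).reshape(10, 1)
assert np.allclose(batched_apply(x, lambda b: b * 2.0), x * 2.0)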
def predict_gender(audios, intervals, complex): step_seconds = 0.04 model_path = 'model/weights/max_pooling__n_layers=7__n_filters=64__downsampling=1__n_seconds=3.torch' model_type = model_path.split('/')[-1].split('__')[0] model_name = model_path.split('/')[-1].split('.')[0] model_params = { i.split('=')[0]: float(i.split('=')[1]) for i in model_name.split('__')[1:] } # Here we assume that the model was trained on the LibriSpeech dataset model_sampling_rate = LIBRISPEECH_SAMPLING_RATE / model_params[ 'downsampling'] model_num_samples = int(model_params['n_seconds'] * model_sampling_rate) if model_type == 'max_pooling': model = ConvNet(int(model_params['n_filters']), int(model_params['n_layers'])) elif model_type == 'dilated': model = DilatedNet(int(model_params['n_filters']), int(model_params['n_depth']), int(model_params['n_stacks'])) else: raise (ValueError, 'Model type not recognised.') model.load_state_dict(torch.load(model_path)) model.double() model.cuda() model.eval() for i in trange(len(audios), desc="speakers"): speaker = audios[i].replace('.wav', '') ############## # Load audio # ############## audio_path = PATH + '/raw/voc/simple_audio/' + audios[i] audio, audio_sampling_rate = sf.read(audio_path) audio_duration_seconds = audio.shape[0] * 1. / audio_sampling_rate audio_duration_minutes = audio_duration_seconds / 60. step_samples = int(step_seconds * model_sampling_rate) step_samples_at_audio_rate = int(step_seconds * audio_sampling_rate) default_shape = None batch = [] start_min = [] pred = [] mean_pitch = [] max_pitch = [] min_pitch = [] num_zeros = [] std_pitch = [] pitch_measurements = [] for j in trange(len(intervals[speaker]), desc="intervals", leave=False): start = float(intervals[speaker][j][0]) end = float(intervals[speaker][j][1]) start_samples = int(audio_sampling_rate * start) end_samples = int(audio_sampling_rate * end) step_samples = int(step_seconds * model_sampling_rate) step_samples_at_audio_rate = int(step_seconds * audio_sampling_rate) default_shape = None for lower in tqdm(range(start_samples, end_samples, step_samples_at_audio_rate), desc="predictions", leave=False): x = audio[lower:lower + (3 * audio_sampling_rate)] if x.shape[0] != 3 * audio_sampling_rate: break sf.write(PATH + '/raw/clips/{}.wav'.format(speaker), x, audio_sampling_rate) sound = parselmouth.Sound(PATH + '/raw/clips/{}.wav'.format(speaker)) pitch = sound.to_pitch() pitch_values = pitch.selected_array['frequency'] if pitch_values[pitch_values != 0].size != 0: mean_pitch.append(np.mean(pitch_values[pitch_values != 0])) std_pitch.append(np.std(pitch_values[pitch_values != 0])) min_pitch.append(np.amin(pitch_values[pitch_values != 0])) max_pitch.append(np.amax(pitch_values[pitch_values != 0])) num_zeros.append(pitch_values[pitch_values == 0].size) pitch_measurements.append( pitch_values[pitch_values != 0].size) start_min.append(lower / 44100.) else: mean_pitch.append(0) std_pitch.append(0) min_pitch.append(0) max_pitch.append(0) num_zeros.append(pitch_values[pitch_values == 0].size) pitch_measurements.append(0) start_min.append(lower / 44100.) os.remove(PATH + '/raw/clips/{}.wav'.format(speaker)) x = torch.from_numpy(x).reshape(1, -1) x = whiten(x) # For me the bottleneck is this scipy resample call, increasing batch size doesn't make it any faster x = torch.from_numpy(resample(x, model_num_samples, axis=1)).reshape( (1, 1, model_num_samples)) y_hat = model(x).item() pred.append(y_hat) start_min.append(lower / 44100.) 
df = pd.DataFrame( data={ 'speaker': speaker, 'start_second': start_min, 'p': pred, 'mean_pitch': mean_pitch, 'max_pitch': max_pitch, 'min_pitch': min_pitch, 'num_zeros': num_zeros, 'std_pitch': std_pitch, 'pitch_measurements': pitch_measurements }) df = df.assign( # Time in seconds of the end of the prediction fragment t_end=df['start_second'] + model_params['n_seconds'] / 60, # Time in seconds of the center of the prediction fragment t_center=df['start_second'] * 60 + model_params['n_seconds'] / 2.) df.to_csv(PATH + 'analyses/results/results_for_' + speaker + '.csv', index=False)
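# Hedged sketch of the per-speaker results table written above. Column names
# follow the DataFrame in the code; the values are dummies, and t_center here
# simply assumes start_second and the 3-second window are both in seconds.
import pandas as pd

df = pd.DataFrame({
    "speaker": "spk_001",
    "start_second": [0.00, 0.04, 0.08],
    "p": [0.91, 0.88, 0.93],
    "mean_pitch": [182.0, 190.5, 178.2],
})
df = df.assign(t_center=df["start_second"] + 3.0 / 2)
df.to_csv("results_for_spk_001.csv", index=False)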
def main(): init_output_dir(output_dir) # prepare dataset task = get_task(task_name, dataset_path) label_list = task.get_labels() label_map = {v: i for i, v in enumerate(label_list)} print("loading raw data ... ") train_examples = task.get_train_examples() val_examples = task.get_dev_examples() test_examples = task.get_test_examples() print("converting to data loader ... ") train_loader = get_dataloader(train_examples, label_map) val_loader = get_dataloader(val_examples, label_map) test_loader = get_dataloader(test_examples, label_map) # load model print("loading model ... ") model = InferSent(config) model.load_state_dict(torch.load(model_path)) model = model.cuda() if config['use_cuda'] else model model.set_w2v_path(word_emb_path) print("building model vocabs ... ") model.build_vocab_k_words(K=100000, verbose=True) # run embedding for train set print("Run embedding for train set") for _ in trange(1, desc="Epoch"): run_encoding(loader=train_loader, model=model, mode='train') print("Run embedding for dev set") for _ in trange(1, desc="Epoch"): run_encoding(loader=val_loader, model=model, mode='dev') print("Run embedding for test set") for _ in trange(1, desc="Epoch"): run_encoding(loader=test_loader, model=model, mode='test') # HACK FOR MNLI mis-matched if task_name == 'mnli': print("Run Embedding for MNLI Mis-Matched Datasets") print("loading raw data ... ") mm_val_example = MnliMismatchedProcessor().get_dev_examples(dataset_path) mm_test_examples = MnliMismatchedProcessor().get_test_examples(dataset_path) print("converting to data loader ... ") mm_val_loader = get_dataloader(mm_val_example, label_map) mm_test_loader = get_dataloader(mm_test_examples, label_map) print("Run embedding for mm_dev set") for _ in trange(1, desc="Epoch"): run_encoding(loader=mm_val_loader, model=model, mode='mm_dev') print("Run embedding for test set") for _ in trange(1, desc="Epoch"): run_encoding(loader=mm_test_loader, model=model, mode='mm_test')
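# Generic sketch (not the run_encoding helper used above) of encoding batches
# with a frozen model under torch.no_grad(); the linear "encoder" and the list
# of tensors only stand in for the real model and data loaders.
import torch
from torch import nn

def encode_loader(model, loader):
    model.eval()
    feats = []
    with torch.no_grad():
        for batch in loader:
            feats.append(model(batch))
    return torch.cat(feats, dim=0)

encoder = nn.Linear(4, 2)
batches = [torch.randn(3, 4), torch.randn(3, 4)]
print(encode_loader(encoder, batches).shape)   # torch.Size([6, 2])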
def train( self, train_dataset, output_dir, multi_label=False, show_running_loss=True, eval_df=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args["weight_decay"], }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total ) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"]) epoch_number = 0 best_eval_loss = None early_stopping_counter = 0 if args["evaluate_during_training"]: training_progress_scores = self._create_training_progress_scores(multi_label, **kwargs) if args["wandb_project"]: wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"]) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])): batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args["n_gpu"] > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # torch.nn.utils.clip_grad_norm_( # amp.master_params(optimizer), args["max_grad_norm"] # ) else: loss.backward() # torch.nn.utils.clip_grad_norm_( # model.parameters(), args["max_grad_norm"] # ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: if args["fp16"]: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"]) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", 
scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step) logging_loss = tr_loss if args["wandb_project"]: wandb.log( { "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, } ) if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) self._save_model(output_dir_current, model=model) if args["evaluate_during_training"] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0 ): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model( eval_df, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if args["save_eval_checkpoints"]: self._save_model(output_dir_current, model=model, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( args["output_dir"] + "training_progress_scores.csv", index=False, ) if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_loss: best_eval_loss = results["eval_loss"] self._save_model(args["best_model_dir"], model=model, results=results) elif results["eval_loss"] - best_eval_loss < args["early_stopping_delta"]: best_eval_loss = results["eval_loss"] self._save_model(args["best_model_dir"], model=model, results=results) early_stopping_counter = 0 else: if early_stopping_counter < args["early_stopping_patience"]: early_stopping_counter += 1 if verbose: print() print(f"No improvement in eval_loss for {early_stopping_counter} steps.") print(f"Training will stop at {args['early_stopping_patience']} steps.") print() else: if verbose: print() print(f"Patience of {args['early_stopping_patience']} steps reached.") print("Training terminated.") print() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if (args["save_model_every_epoch"] or args["evaluate_during_training"]) and not os.path.exists( output_dir_current ): os.makedirs(output_dir_current) if args["save_model_every_epoch"]: self._save_model(output_dir_current, model=model) if args["evaluate_during_training"]: results, _, _ = self.eval_model( eval_df, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs ) self._save_model(output_dir_current, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(args["output_dir"] + "training_progress_scores.csv", index=False) return global_step, tr_loss / global_step
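# Self-contained sketch of the eval-loss early-stopping bookkeeping above
# (improvement threshold plus patience counter); the loss values are arbitrary.
def early_stopping_step(eval_loss, best_loss, counter, delta=0.0, patience=3):
    """Return (best_loss, counter, should_stop)."""
    if best_loss is None or eval_loss - best_loss < delta:
        return eval_loss, 0, False           # improvement: reset the counter
    counter += 1
    return best_loss, counter, counter >= patience

best, counter = None, 0
for loss in [0.9, 0.8, 0.81, 0.82, 0.83]:
    best, counter, stop = early_stopping_step(loss, best, counter)
    if stop:
        break
print(best, counter, stop)   # 0.8 3 True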
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples: List[torch.Tensor]): if tokenizer._pad_token is None: return pad_sequence(examples, batch_first=True) return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and 
global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
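# A small sketch of the checkpoint-resume arithmetic used above: recover the
# global step from a "checkpoint-<step>" directory name and split it into whole
# epochs plus optimizer steps already taken in the current epoch. The path and
# steps_per_epoch value are illustrative.
def resume_position(checkpoint_dir, steps_per_epoch):
    global_step = int(checkpoint_dir.split("-")[-1].split("/")[0])
    epochs_trained = global_step // steps_per_epoch
    steps_trained_in_current_epoch = global_step % steps_per_epoch
    return global_step, epochs_trained, steps_trained_in_current_epoch

print(resume_position("output/checkpoint-1250", steps_per_epoch=500))
# expected: (1250, 2, 250)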
writer=writer, **(sampler_kwargs or {})) if equilibrate: log.info('Equilibrating...') with tqdm(count(), desc='equilibrating', disable=None) as steps: next( sample_wf(wf, sampler.iter_with_info(), steps, equilibrate=equilibrate)) log.info('Equilibrated') log.info('Initializing training') steps = trange( init_step, n_steps, initial=init_step, total=n_steps, desc='training', disable=None, ) chkpts = chkpts if chkpts is not None else [] last_log = 0 try: for step, _ in fit_wf( wf, LossEnergy(), opt, sampler.iter_batches( batch_size=batch_size, epoch_size=epoch_size, range=partial(trange, desc='sampling',
def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args tb_writer = SummaryWriter(logdir=args.tensorboard_dir) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=self.args.dataloader_num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [p for n, p in model.named_parameters() if n in params] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend( [ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] ) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps # TODO: Use custom optimizer like with BertSum? 
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) if ( args.model_name and os.path.isfile(os.path.join(args.model_name, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name, "scheduler.pt"))) if args.n_gpu > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args.model_name and os.path.exists(args.model_name): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name.split("/")[-1].split("-") if len(checkpoint_suffix) > 2: checkpoint_suffix = checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps ) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores(**kwargs) if args.wandb_project: wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs) wandb.watch(self.model) if args.fp16: scaler = amp.GradScaler() model.train() for current_epoch in train_iterator: if epochs_trained > 0: epochs_trained -= 1 continue train_iterator.set_description(f"Epoch {epoch_number + 1} of {args.num_train_epochs}") batch_iterator = tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue # batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) with amp.autocast() if args.fp16 else nullcontext(): outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: batch_iterator.set_description( f"Epochs {epoch_number}/{args.num_train_epochs}. 
Running Loss: {current_loss:9.4f}" ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if args.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.wandb_project: wandb.log( { "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, } ) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) self._save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0 ): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if args.save_eval_checkpoints: self._save_model(output_dir_current, optimizer, scheduler, model=model, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] if args.save_best_model: self._save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results ) if best_eval_metric and args.early_stopping_metric_minimize: if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] if args.save_best_model: self._save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results ) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] if args.save_best_model: self._save_model( args.best_model_dir, optimizer, scheduler, 
model=model, results=results ) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self._save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training: results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) if args.save_eval_checkpoints: self._save_model(output_dir_current, optimizer, scheduler, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False) if args.wandb_project: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] if args.save_best_model: self._save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] if args.save_best_model: self._save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping and args.early_stopping_consider_epochs: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] if args.save_best_model: self._save_model(args.best_model_dir, optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping and args.early_stopping_consider_epochs: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return global_step, 
tr_loss / global_step return global_step, tr_loss / global_step
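# Minimal native-AMP training step mirroring the fp16 path above
# (GradScaler + autocast + unscale-then-clip). The toy linear model and data
# are stand-ins, and AMP is simply disabled when no GPU is available.
import torch
from torch import nn
from torch.cuda import amp

device = "cuda" if torch.cuda.is_available() else "cpu"
use_amp = device == "cuda"
model = nn.Linear(8, 1).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scaler = amp.GradScaler(enabled=use_amp)

x, y = torch.randn(16, 8, device=device), torch.randn(16, 1, device=device)
with amp.autocast(enabled=use_amp):
    loss = nn.functional.mse_loss(model(x), y)
scaler.scale(loss).backward()
scaler.unscale_(optimizer)                          # so clipping sees true gradient norms
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()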
def simulate_recon( measured_sino, ctim, scanner_params, simulate_3d=False, nitr=60, fwhm_rm=0., slice_idx=-1, randoms=None, scatter=None, mu_input=False, msk_radius=29., psf=None, ): ''' Reconstruct PET image from simulated input data using the EM-ML (2D) or OSEM (3D) algorithm. measured_sino : simulated emission data with photon attenuation ctim : either a 2D CT image or a 3D CT image from which a 2D slice is chosen (slice_idx) for estimation of the attenuation factors slice_idx : index to extract one 2D slice for this simulation if input image is 3D nitr : number of iterations used for the EM-ML reconstruction algorithm scanner_params : scanner parameters containing scanner constants and axial and transaxial look up tables (LUTs) randoms : randoms and scatter events (optional) ''' # > decompose the scanner constants and LUTs for easier access Cnt = scanner_params['Cnt'] txLUT = scanner_params['txLUT'] axLUT = scanner_params['axLUT'] psfkernel = mmrrec.psf_config(psf, Cnt) if simulate_3d: if ctim.ndim!=3 \ or ctim.shape!=(Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']): raise ValueError( 'The CT/mu-map image does not match the scanner image shape.') else: # > 2D case with reduced rings if len(ctim.shape) == 3: # make sure that the shape of the input image matches the image size of the scanner if ctim.shape[1:] != (Cnt['SO_IMY'], Cnt['SO_IMX']): raise ValueError( 'The input image shape for x and y does not match the scanner image size.' ) # pick the right slice index (slice_idx) if not given or mistaken if slice_idx < 0: log.warning( 'the axial index <slice_idx> is chosen to be in the middle of axial FOV.' ) slice_idx = ctim.shape[0] // 2 if slice_idx >= ctim.shape[0]: raise ValueError( 'The axial index for 2D slice selection is outside the image.' ) elif len(ctim.shape) == 2: # make sure that the shape of the input image matches the image size of the scanner if ctim.shape != (Cnt['SO_IMY'], Cnt['SO_IMX']): raise ValueError( 'The input image shape for x and y does not match the scanner image size.'
) ctim.shape = (1, ) + ctim.shape slice_idx = 0 if 'rSZ_IMZ' not in Cnt: raise ValueError('Missing reduced axial FOV parameters.') # -------------------- if mu_input: mui = ctim else: # > get the mu-map [1/cm] from CT [HU] mui = nimpa.ct2mu(ctim) # > get rid of negative values mui[mui < 0] = 0 # -------------------- if simulate_3d: rmu = mui # > number of axial sinograms nsinos = Cnt['NSN11'] else: # -------------------- # > create a number of slides of the same chosen image slice # for reduced (fast) 3D simulation rmu = mui[slice_idx, :, :] rmu.shape = (1, ) + rmu.shape rmu = np.repeat(rmu, Cnt['rSZ_IMZ'], axis=0) # -------------------- # > number of axial sinograms nsinos = Cnt['rNSN1'] # import pdb; pdb.set_trace() # > attenuation factor sinogram attsino = mmrprj.frwd_prj(rmu, scanner_params, attenuation=True, dev_out=True) nrmsino = np.ones(attsino.shape, dtype=np.float32) # > randoms and scatter put together if isinstance(randoms, np.ndarray) and measured_sino.shape == randoms.shape: rsng = mmraux.remgaps(randoms, txLUT, Cnt) else: rsng = 1e-5 * np.ones((Cnt['Naw'], nsinos), dtype=np.float32) if isinstance(scatter, np.ndarray) and measured_sino.shape == scatter.shape: ssng = mmraux.remgaps(scatter, txLUT, Cnt) else: ssng = 1e-5 * np.ones((Cnt['Naw'], nsinos), dtype=np.float32) # resolution modelling Cnt['SIGMA_RM'] = mmrrec.fwhm2sig(fwhm_rm, voxsize=Cnt['SZ_VOXZ'] * 10) if fwhm_rm else 0 if simulate_3d: log.debug('------ OSEM (%d) -------', nitr) # measured sinogram in GPU-enabled shape psng = mmraux.remgaps(measured_sino.astype(np.uint16), txLUT, Cnt) # > mask for reconstructed image. anything outside it is set to zero msk = mmrimg.get_cylinder( Cnt, rad=msk_radius, xo=0, yo=0, unival=1, gpu_dim=True) > 0.9 # > init image eimg = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32) # ------------------------------------ Sn = 14 # number of subsets # -get one subset to get number of projection bins in a subset Sprj, s = mmrrec.get_subsets14(0, scanner_params) Nprj = len(Sprj) # > init subset array and sensitivity image for a given subset sinoTIdx = np.zeros((Sn, Nprj + 1), dtype=np.int32) # > init sensitivity images for each subset sim = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32) tmpsim = cu.zeros((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32) for n in trange(Sn, desc="sensitivity", leave=log.getEffectiveLevel() < logging.INFO): # first number of projection for the given subset sinoTIdx[n, 0] = Nprj sinoTIdx[n, 1:], s = mmrrec.get_subsets14(n, scanner_params) # > sensitivity image petprj.bprj(tmpsim.cuvec, cu.asarray(attsino[sinoTIdx[n, 1:], :]).cuvec, txLUT, axLUT, sinoTIdx[n, 1:], Cnt) sim[n] = tmpsim del tmpsim # ------------------------------------- for _ in trange(nitr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO, leave=log.getEffectiveLevel() < logging.INFO): petprj.osem(eimg, psng, rsng, ssng, nrmsino, attsino, sinoTIdx, sim, msk, psfkernel, txLUT, axLUT, Cnt) eim = mmrimg.convert2e7(eimg, Cnt) else: def psf(x, output=None): if Cnt['SIGMA_RM']: x = ndi.gaussian_filter(x, sigma=Cnt['SIGMA_RM'], mode='constant', output=None) return x # > estimated image, initialised to ones eim = np.ones(rmu.shape, dtype=np.float32) msk = mmrimg.get_cylinder( Cnt, rad=msk_radius, xo=0, yo=0, unival=1, gpu_dim=False) > 0.9 # > sensitivity image for the EM-ML reconstruction sim = mmrprj.back_prj(attsino, scanner_params) sim_inv = 1 / psf(sim) sim_inv[~msk] = 0 rndsct = rsng + ssng for _ in trange(nitr, 
desc="MLEM", disable=log.getEffectiveLevel() > logging.INFO, leave=log.getEffectiveLevel() < logging.INFO): # > remove gaps from the measured sinogram # > then forward project the estimated image # > after which divide the measured sinogram # by the estimated sinogram (forward projected) crrsino = ( mmraux.remgaps(measured_sino, txLUT, Cnt) / (mmrprj.frwd_prj(psf(eim), scanner_params, dev_out=True) + rndsct)) # > back project the correction factors sinogram bim = mmrprj.back_prj(crrsino, scanner_params) bim = psf(bim, output=bim) # > divide the back-projected image by the sensitivity image # > update the estimated image and remove NaNs eim *= bim * sim_inv eim[np.isnan(eim)] = 0 return eim
def generate( # pylint: disable=W0221 self, x: np.ndarray, y: Optional[np.ndarray] = None, sample_sizes: Optional[np.ndarray] = None, automatically_append: bool = True, verify_input_data: bool = True, perturb_sizes: Optional[List[List[int]]] = None, perturb_starts: Optional[List[List[int]]] = None, **kwargs, ) -> np.ndarray: """ Generates the adversarial examples. By default, x needs to be composed of valid files that can support the adversarial perturbation, i.e. files which are malicious and can accommodate the assigned L0 budget. They can be obtained by using `pull_out_valid_samples` on the data. This check on the input data can be overridden by toggling the flag verify_input_data, in which case only the data which can be made adversarial is perturbed and the resulting batch is a mixture of adversarial and unperturbed data. To assign the L0 budget we go through each list in perturb_sizes and perturb_starts in order, and assign the budget based on the sizes given until the L0 budget is exhausted. After all the regions marked in perturb_sizes and perturb_starts have been assigned, if automatically_append is set to true and L0 budget remains, the extra perturbation is added at the end of the file in an append-style attack. :param x: An array with the input data. :param y: (N, 1) binary labels to make sure the benign files are zero masked. :param sample_sizes: The size of the original file, before it was padded to the input size required by MalConv. :param automatically_append: Whether to automatically append spare perturbation at the end of the file. :param verify_input_data: Whether to check that all the data supplied is valid for adversarial perturbation. :param perturb_sizes: A list of length batch size; each element is itself a list containing the sizes of the allowable perturbation regions. :param perturb_starts: A list of length batch size; each element is itself a list containing the starts of the perturbation regions. :return x: An array holding the adversarial examples. """ import tensorflow as tf # lgtm [py/repeated-import] # make copy so original data is not modified. adv_x = x.copy() if sample_sizes is None: # pragma: no cover raise ValueError( "The size of the original files needs to be supplied") if y is None: # pragma: no cover raise ValueError( "Labels need to be provided so we only modify the malware") # check that the dimensions all match assert len(adv_x) == len(y) assert len(y) == len(sample_sizes) if perturb_sizes is not None: assert len(y) == len(perturb_sizes) if perturb_starts is not None: assert len(y) == len(perturb_starts) # check that if perturb_starts is provided perturb_sizes is also provided and vice versa if perturb_starts is not None: assert perturb_sizes is not None if perturb_sizes is not None: assert perturb_starts is not None # if we do not automatically append then make sure that we have supplied # start and end positions for the perturbation.
if not automatically_append: assert perturb_sizes is not None assert perturb_starts is not None perturbation_size = np.zeros(len(sample_sizes), dtype=int) for i, sample_size in enumerate(sample_sizes): if self.l_0 < 1: # l0 is a fraction of the filesize perturbation_size[i] = int(sample_size * self.l_0) else: # or l0 is interpreted as total perturbation size perturbation_size[i] = int(self.l_0) self.total_perturbation = np.copy(perturbation_size) if perturb_sizes is not None and perturb_starts is not None: perturbation_size, perturb_sizes = self.compute_perturbation_regions( perturbation_size, perturb_sizes, automatically_append) y = self.check_valid_size(y, sample_sizes, perturbation_size) if verify_input_data: if np.sum(y) != len(y): raise ValueError( # pragma: no cover f"{len(y) - np.sum(y)} invalid samples found in batch which cannot support the assigned " f"perturbation or are benign To filter for samples that can be processed use " f"pull_out_valid_samples on the samples. Checking can be disabled by using verify_input_data" ) adv_x = self.initialise_sample(adv_x, y, sample_sizes, perturbation_size, perturb_sizes=perturb_sizes, perturb_starts=perturb_starts) mask = self.generate_mask(adv_x, y, sample_sizes, perturbation_size, perturb_sizes=perturb_sizes, perturb_starts=perturb_starts) embeddings = tf.nn.embedding_lookup(params=self.embedding_weights, ids=adv_x.astype("int32")) for _ in trange(self.num_of_iterations, desc="PE Adv. Malware", disable=not self.verbose): gradients = self.estimator.class_gradient(embeddings, label=0) # go from (bsize x 1 x features x embedding size) -> (bsize x features x embedding size) in a # framework agnostic manner. gradients = gradients[:, 0, :, :] gradients = -1 * gradients embeddings = self.update_embeddings(embeddings, gradients, mask) adv_x = self.get_adv_malware( embeddings=embeddings, data=adv_x, labels=y, fsize=sample_sizes, perturbation_size=perturbation_size, perturb_sizes=perturb_sizes, perturb_starts=perturb_starts, ) return adv_x
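# Sketch of the L0-budget resolution performed above: a budget below 1 is read
# as a fraction of the file size, otherwise as an absolute number of bytes.
# Function and variable names are illustrative, not from the original class.
import numpy as np

def resolve_l0_budget(sample_sizes, l_0):
    sizes = np.zeros(len(sample_sizes), dtype=int)
    for i, sample_size in enumerate(sample_sizes):
        sizes[i] = int(sample_size * l_0) if l_0 < 1 else int(l_0)
    return sizes

print(resolve_l0_budget([1000, 2048], l_0=0.1))   # [100 204]
print(resolve_l0_budget([1000, 2048], l_0=500))   # [500 500]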
def train(self, train_dataset, output_dir, multi_label=False, show_running_loss=True, eval_df=None, **kwargs): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ tokenizer = self.tokenizer device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) t_total = len(train_dataloader) // args[ "gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [{ "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args["weight_decay"] }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args[ "warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args['silent']) epoch_number = 0 if args['evaluate_during_training']: extra_metrics = {key: [] for key in kwargs} if multi_label: training_progress_scores = { 'global_step': [], 'LRAP': [], 'train_loss': [], 'eval_loss': [], **extra_metrics } else: if self.model.num_labels == 2: training_progress_scores = { 'global_step': [], 'tp': [], 'tn': [], 'fp': [], 'fn': [], 'mcc': [], 'train_loss': [], 'eval_loss': [], **extra_metrics } else: training_progress_scores = { 'global_step': [], 'mcc': [], 'train_loss': [], 'eval_loss': [], **extra_metrics } if args['wandb_project']: wandb.init(project=args['wandb_project'], config={**args}) wandb.watch(self.model) model.train() for _ in train_iterator: # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate( tqdm(train_dataloader, desc="Current iteration", disable=args['silent'])): batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args['n_gpu'] > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args["max_grad_norm"]) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: optimizer.step() scheduler.step() # Update learning rate schedule 
model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args[ "logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step) logging_loss = tr_loss if args['wandb_project']: wandb.log({ 'Training loss': current_loss, 'lr': scheduler.get_lr()[0], 'global_step': global_step }) if args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) # Take care of distributed/parallel training model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) if args['evaluate_during_training'] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model(eval_df, verbose=True, **kwargs) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) if args['save_eval_checkpoints']: model_to_save = model.module if hasattr( model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) output_eval_file = os.path.join( output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format( key, str(results[key]))) training_progress_scores['global_step'].append( global_step) training_progress_scores['train_loss'].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(args['output_dir'] + 'training_progress_scores.csv', index=False) if args['wandb_project']: wandb.log( self._get_last_metrics( training_progress_scores)) epoch_number += 1 output_dir_current = os.path.join(output_dir, "epoch-{}".format(epoch_number)) if not os.path.exists(output_dir_current): os.makedirs(output_dir_current) model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir_current) self.tokenizer.save_pretrained(output_dir_current) if args['evaluate_during_training']: results, _, _ = self.eval_model(eval_df, verbose=True, **kwargs) output_eval_file = os.path.join(output_dir_current, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format( key, str(results[key]))) return global_step, tr_loss / global_step
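# Compact sketch of the training-progress bookkeeping above: append scalar
# metrics per evaluation step and dump the table as a CSV report. The metric
# values are dummies.
import pandas as pd

training_progress_scores = {"global_step": [], "train_loss": [], "mcc": [], "eval_loss": []}
for global_step, results in [(100, {"mcc": 0.41, "eval_loss": 0.52}),
                             (200, {"mcc": 0.47, "eval_loss": 0.48})]:
    training_progress_scores["global_step"].append(global_step)
    training_progress_scores["train_loss"].append(0.50)   # running loss placeholder
    for key in results:
        training_progress_scores[key].append(results[key])
pd.DataFrame(training_progress_scores).to_csv("training_progress_scores.csv", index=False)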
def train(self, train_dataset, output_dir, show_running_loss=True, eval_data=None):
    """
    Trains the model on train_dataset.

    Utility function to be used by the train_model() method. Not intended to be used directly.
    """

    tokenizer = self.tokenizer
    device = self.device
    model = self.model
    args = self.args

    tb_writer = SummaryWriter()
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"])

    t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args["weight_decay"],
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    warmup_steps = math.ceil(t_total * args["warmup_ratio"])
    args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"])
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total
    )

    if args["fp16"]:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

    if args["n_gpu"] > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"])

    model.train()
    for _ in train_iterator:
        # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])):
            batch = tuple(t.to(device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args["model_type"] != "distilbert":
                inputs["token_type_ids"] = None if args["model_type"] == "xlm" else batch[2]
            if args["model_type"] in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})

            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]

            if show_running_loss:
                print("\rRunning loss: %f" % loss, end="")

            if args["n_gpu"] > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training

            if args["gradient_accumulation_steps"] > 1:
                loss = loss / args["gradient_accumulation_steps"]

            if args["fp16"]:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

            tr_loss += loss.item()
            if (step + 1) % args["gradient_accumulation_steps"] == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                    # Log metrics
                    if args["evaluate_during_training"]:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(eval_data, verbose=True)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step)
                    logging_loss = tr_loss

                if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                    # Save model checkpoint
                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                    if not os.path.exists(output_dir_current):
                        os.makedirs(output_dir_current)

                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir_current)
                    self.tokenizer.save_pretrained(output_dir_current)

    return global_step, tr_loss / global_step
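
# Both trainers above derive warmup_steps from warmup_ratio and hand it to a linear warmup/decay
# schedule. A minimal sketch of that wiring in isolation; the dummy parameter, function name and
# default values are assumptions used only for illustration.
import math
import torch
from transformers import get_linear_schedule_with_warmup

def warmup_sketch(t_total=1000, warmup_ratio=0.06, learning_rate=4e-5):
    params = [torch.nn.Parameter(torch.zeros(1))]
    optimizer = torch.optim.AdamW(params, lr=learning_rate)
    warmup_steps = math.ceil(t_total * warmup_ratio)  # e.g. 60 warmup steps for 1000 total steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )
    for _ in range(t_total):
        optimizer.step()
        scheduler.step()
    return scheduler.get_last_lr()  # decays back to ~0 by the final step
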
def train(self, model_path: Optional[str] = None):
    """
    Main training entry point.

    Args:
        model_path:
            (Optional) Local path to model if model to train has been instantiated from a local path
            If present, we will try reloading the optimizer/scheduler states from there.
    """
    train_dataloader = self.get_train_dataloader()

    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = (
            self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        )
    else:
        t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs

    optimizer, scheduler = self.get_optimizers(num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (
        model_path is not None
        and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(model_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(model_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    model = self.model
    model.to(self.args.device)

    if self.args.fp16:
        if not is_apex_available():
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if self.args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True,
        )

    if self.tb_writer is not None:
        self.tb_writer.add_text("args", self.args.to_json_string())
        self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={})

    if is_wandb_available():
        self._setup_wandb()

    # Train!
    if is_tpu_available():
        total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
    else:
        total_train_batch_size = (
            self.args.train_batch_size
            * self.args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
        )
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", self.num_examples(train_dataloader))
    logger.info(" Num Epochs = %d", num_train_epochs)
    logger.info(" Instantaneous batch size per device = %d", self.args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)
    logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if model_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(model_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // self.args.gradient_accumulation_steps
            )

            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            global_step = 0
            logger.info(" Starting fine-tuning.")

    tr_loss = 0.0
    logging_loss = 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master())
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master())
        for step, inputs in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            tr_loss += self._training_step(model, inputs, optimizer)

            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                len(epoch_iterator) <= self.args.gradient_accumulation_steps
                and (step + 1) == len(epoch_iterator)
            ):
                if self.args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                if is_tpu_available():
                    xm.optimizer_step(optimizer)
                else:
                    optimizer.step()

                scheduler.step()
                model.zero_grad()
                global_step += 1

                if self.is_local_master():
                    if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (
                        global_step == 1 and self.args.logging_first_step
                    ):
                        logs = {}
                        if self.args.evaluate_during_training:
                            results = self.evaluate()
                            for key, value in results.items():
                                eval_key = "eval_{}".format(key)
                                logs[eval_key] = value

                        loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps
                        learning_rate_scalar = scheduler.get_last_lr()[0]
                        logs["learning_rate"] = learning_rate_scalar
                        logs["loss"] = loss_scalar
                        logging_loss = tr_loss

                        if self.tb_writer:
                            for k, v in logs.items():
                                self.tb_writer.add_scalar(k, v, global_step)
                        if is_wandb_available():
                            wandb.log(logs, step=global_step)

                        epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))

                    if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                        # In all cases (even distributed/parallel), self.model is always a reference
                        # to the model we want to save.
                        if hasattr(model, "module"):
                            assert model.module is self.model
                        else:
                            assert model is self.model
                        # Save model checkpoint
                        output_dir = os.path.join(self.args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{global_step}")

                        self.save_model(output_dir)
                        self._rotate_checkpoints()

                        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if self.args.max_steps > 0 and global_step > self.args.max_steps:
                epoch_iterator.close()
                break
        if self.args.max_steps > 0 and global_step > self.args.max_steps:
            train_iterator.close()
            break
        if self.args.tpu_metrics_debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

    if self.tb_writer:
        self.tb_writer.close()

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    return TrainOutput(global_step, tr_loss / global_step)
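
# The Trainer above recovers global_step from a "checkpoint-<step>" path and converts it into
# epochs_trained / steps_trained_in_current_epoch. A standalone sketch of that arithmetic as a
# pure function; the function name and example values are assumptions for illustration only.
def resume_state_from_checkpoint(model_path, batches_per_epoch, gradient_accumulation_steps):
    try:
        global_step = int(model_path.split("-")[-1].split("/")[0])
    except ValueError:
        return 0, 0, 0  # unparsable path: start fine-tuning from scratch
    updates_per_epoch = batches_per_epoch // gradient_accumulation_steps
    epochs_trained = global_step // updates_per_epoch
    steps_in_current_epoch = global_step % updates_per_epoch
    return global_step, epochs_trained, steps_in_current_epoch

# Example: a checkpoint saved at update 1500 with 400 batches/epoch and accumulation of 2
# resumes at epoch 7, skipping the first 100 optimizer updates of that epoch.
assert resume_state_from_checkpoint("out/checkpoint-1500", 400, 2) == (1500, 7, 100)
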
def _attack(
    self,
    initial_sample: np.ndarray,
    original_sample: np.ndarray,
    y_p: int,
    target: int,
    initial_delta: float,
    initial_epsilon: float,
    clip_min: float,
    clip_max: float,
) -> np.ndarray:
    """
    Main function for the boundary attack.

    :param initial_sample: An initial adversarial example.
    :param original_sample: The original input.
    :param y_p: The predicted label of the original input.
    :param target: The target label.
    :param initial_delta: Initial step size for the orthogonal step.
    :param initial_epsilon: Initial step size for the step towards the target.
    :param clip_min: Minimum value of an example.
    :param clip_max: Maximum value of an example.
    :return: an adversarial example.
    """
    # Get initialization for some variables
    x_adv = initial_sample
    self.curr_delta = initial_delta
    self.curr_epsilon = initial_epsilon
    self.curr_adv = x_adv

    # Main loop to wander around the boundary
    for _ in trange(self.max_iter, desc="Boundary attack - iterations", disable=not self.verbose):
        # Trust region method to adjust delta
        for _ in range(self.num_trial):
            potential_advs = []
            for _ in range(self.sample_size):
                potential_adv = x_adv + self._orthogonal_perturb(self.curr_delta, x_adv, original_sample)
                potential_adv = np.clip(potential_adv, clip_min, clip_max)
                potential_advs.append(potential_adv)

            preds = np.argmax(
                self.estimator.predict(np.array(potential_advs), batch_size=self.batch_size),
                axis=1,
            )

            if self.targeted:
                satisfied = preds == target
            else:
                satisfied = preds != y_p

            delta_ratio = np.mean(satisfied)

            if delta_ratio < 0.2:
                self.curr_delta *= self.step_adapt
            elif delta_ratio > 0.5:
                self.curr_delta /= self.step_adapt

            if delta_ratio > 0:
                x_advs = np.array(potential_advs)[np.where(satisfied)[0]]
                break
        else:
            logger.warning("Adversarial example found but not optimal.")
            return x_adv

        # Trust region method to adjust epsilon
        for _ in range(self.num_trial):
            perturb = np.repeat(np.array([original_sample]), len(x_advs), axis=0) - x_advs
            perturb *= self.curr_epsilon
            potential_advs = x_advs + perturb
            potential_advs = np.clip(potential_advs, clip_min, clip_max)
            preds = np.argmax(
                self.estimator.predict(potential_advs, batch_size=self.batch_size),
                axis=1,
            )

            if self.targeted:
                satisfied = preds == target
            else:
                satisfied = preds != y_p

            epsilon_ratio = np.mean(satisfied)

            if epsilon_ratio < 0.2:
                self.curr_epsilon *= self.step_adapt
            elif epsilon_ratio > 0.5:
                self.curr_epsilon /= self.step_adapt

            if epsilon_ratio > 0:
                x_adv = self._best_adv(original_sample, potential_advs[np.where(satisfied)[0]])
                self.curr_adv = x_adv
                break
        else:
            logger.warning("Adversarial example found but not optimal.")
            return self._best_adv(original_sample, x_advs)

        if self.min_epsilon is not None and self.curr_epsilon < self.min_epsilon:
            return x_adv

    return x_adv
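
# The boundary attack above shrinks or grows its step sizes depending on how many trial
# perturbations stay adversarial. A minimal sketch of that adaptation rule in isolation; the
# thresholds 0.2 / 0.5 mirror the loop above, while the function name is an assumption.
def adapt_step_size(step_size, success_ratio, step_adapt=0.9):
    if success_ratio < 0.2:  # too aggressive: most trials fell back onto the original class
        return step_size * step_adapt
    if success_ratio > 0.5:  # too timid: nearly every trial stayed adversarial, so push harder
        return step_size / step_adapt
    return step_size  # inside the 0.2-0.5 band the step size is left unchanged

# e.g. a 10% success rate shrinks the step, an 80% success rate grows it:
assert adapt_step_size(0.1, 0.1) < 0.1 < adapt_step_size(0.1, 0.8)
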
def train(
    self,
    train_dataset,
    output_dir,
    files_list=None,
    image_path=None,
    text_label=None,
    labels_label=None,
    images_label=None,
    image_type_extension=None,
    data_type_extension=None,
    show_running_loss=True,
    eval_data=None,
    verbose=True,
    **kwargs,
):
    """
    Trains the model on train_dataset.

    Utility function to be used by the train_model() method. Not intended to be used directly.
    """

    device = self.device
    model = self.model
    args = self.args
    multi_label = self.multi_label

    tb_writer = SummaryWriter(logdir=args.tensorboard_dir)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=collate_fn,
        num_workers=args.process_count,
    )

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    warmup_steps = math.ceil(t_total * args.warmup_ratio)
    args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent)
    epoch_number = 0
    best_eval_metric = None
    early_stopping_counter = 0

    if args.evaluate_during_training:
        training_progress_scores = self._create_training_progress_scores(multi_label, **kwargs)
    if args.wandb_project:
        wandb.init(project=args.wandb_project, config={**args}, **args.wandb_kwargs)
        wandb.watch(self.model)

    model.train()
    for _ in train_iterator:
        train_iterator.set_description(f"Epoch {epoch_number} of {args.num_train_epochs}")
        for step, batch in enumerate(
            tqdm(train_dataloader, desc=f"Running Epoch {epoch_number}", disable=args.silent)
        ):
            batch = tuple(t.to(device) for t in batch)
            labels = batch[5]
            inputs = self._get_inputs_dict(batch)

            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            logits = outputs[0]  # Different from default behaviour
            loss = self.criterion(logits, labels)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training

            current_loss = loss.item()

            if show_running_loss:
                print("\rRunning loss: %f" % loss, end="")

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # torch.nn.utils.clip_grad_norm_(
                #     amp.master_params(optimizer), args.max_grad_norm
                # )
            else:
                loss.backward()
                # torch.nn.utils.clip_grad_norm_(
                #     model.parameters(), args.max_grad_norm
                # )

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss
                    if args.wandb_project:
                        wandb.log(
                            {"Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step}
                        )

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                    self._save_model(output_dir_current, model=model)

                if args.evaluate_during_training and (
                    args.evaluate_during_training_steps > 0
                    and global_step % args.evaluate_during_training_steps == 0
                ):
                    # Only evaluate when single GPU otherwise metrics may not average well
                    results, _ = self.eval_model(
                        eval_data,
                        files_list=files_list,
                        image_path=image_path,
                        text_label=text_label,
                        labels_label=labels_label,
                        images_label=images_label,
                        image_type_extension=image_type_extension,
                        data_type_extension=data_type_extension,
                        verbose=verbose and args.evaluate_during_training_verbose,
                        silent=args.evaluate_during_training_silent,
                        **kwargs,
                    )
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value, global_step)

                    output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                    if args.save_eval_checkpoints:
                        self._save_model(output_dir_current, model=model, results=results)

                    training_progress_scores["global_step"].append(global_step)
                    training_progress_scores["train_loss"].append(current_loss)
                    for key in results:
                        training_progress_scores[key].append(results[key])
                    report = pd.DataFrame(training_progress_scores)
                    report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False)

                    if args.wandb_project:
                        wandb.log(self._get_last_metrics(training_progress_scores))

                    if not best_eval_metric:
                        best_eval_metric = results[args.early_stopping_metric]
                        self._save_model(args.best_model_dir, model=model, results=results)
                    if best_eval_metric and args.early_stopping_metric_minimize:
                        if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                            best_eval_metric = results[args.early_stopping_metric]
                            self._save_model(args.best_model_dir, model=model, results=results)
                            early_stopping_counter = 0
                        else:
                            if args.use_early_stopping:
                                if early_stopping_counter < args.early_stopping_patience:
                                    early_stopping_counter += 1
                                    if verbose:
                                        logger.info(f" No improvement in {args.early_stopping_metric}")
                                        logger.info(f" Current step: {early_stopping_counter}")
                                        logger.info(f" Early stopping patience: {args.early_stopping_patience}")
                                else:
                                    if verbose:
                                        logger.info(f" Patience of {args.early_stopping_patience} steps reached")
                                        logger.info(" Training terminated.")
                                        train_iterator.close()
                                    return global_step, tr_loss / global_step
                    else:
                        if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                            best_eval_metric = results[args.early_stopping_metric]
                            self._save_model(args.best_model_dir, model=model, results=results)
                            early_stopping_counter = 0
                        else:
                            if args.use_early_stopping:
                                if early_stopping_counter < args.early_stopping_patience:
                                    early_stopping_counter += 1
                                    if verbose:
                                        logger.info(f" No improvement in {args.early_stopping_metric}")
                                        logger.info(f" Current step: {early_stopping_counter}")
                                        logger.info(f" Early stopping patience: {args.early_stopping_patience}")
                                else:
                                    if verbose:
                                        logger.info(f" Patience of {args.early_stopping_patience} steps reached")
                                        logger.info(" Training terminated.")
                                        train_iterator.close()
                                    return global_step, tr_loss / global_step

        epoch_number += 1
        output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

        if args.save_model_every_epoch or args.evaluate_during_training:
            os.makedirs(output_dir_current, exist_ok=True)

        if args.save_model_every_epoch:
            self._save_model(output_dir_current, model=model)

        if args.evaluate_during_training:
            results, _ = self.eval_model(
                eval_data,
                files_list=files_list,
                image_path=image_path,
                text_label=text_label,
                labels_label=labels_label,
                images_label=images_label,
                image_type_extension=image_type_extension,
                data_type_extension=data_type_extension,
                verbose=verbose and args.evaluate_during_training_verbose,
                silent=args.evaluate_during_training_silent,
                **kwargs,
            )

            self._save_model(output_dir_current, results=results)

            training_progress_scores["global_step"].append(global_step)
            training_progress_scores["train_loss"].append(current_loss)
            for key in results:
                training_progress_scores[key].append(results[key])
            report = pd.DataFrame(training_progress_scores)
            report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False)

            if not best_eval_metric:
                best_eval_metric = results[args.early_stopping_metric]
                self._save_model(args.best_model_dir, model=model, results=results)
            if best_eval_metric and args.early_stopping_metric_minimize:
                if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta:
                    best_eval_metric = results[args.early_stopping_metric]
                    self._save_model(args.best_model_dir, model=model, results=results)
                    early_stopping_counter = 0
                else:
                    if args.use_early_stopping and args.early_stopping_consider_epochs:
                        if early_stopping_counter < args.early_stopping_patience:
                            early_stopping_counter += 1
                            if verbose:
                                logger.info(f" No improvement in {args.early_stopping_metric}")
                                logger.info(f" Current step: {early_stopping_counter}")
                                logger.info(f" Early stopping patience: {args.early_stopping_patience}")
                        else:
                            if verbose:
                                logger.info(f" Patience of {args.early_stopping_patience} steps reached")
                                logger.info(" Training terminated.")
                                train_iterator.close()
                            return global_step, tr_loss / global_step
            else:
                if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta:
                    best_eval_metric = results[args.early_stopping_metric]
                    self._save_model(args.best_model_dir, model=model, results=results)
                    early_stopping_counter = 0
                else:
                    if args.use_early_stopping and args.early_stopping_consider_epochs:
                        if early_stopping_counter < args.early_stopping_patience:
                            early_stopping_counter += 1
                            if verbose:
                                logger.info(f" No improvement in {args.early_stopping_metric}")
                                logger.info(f" Current step: {early_stopping_counter}")
                                logger.info(f" Early stopping patience: {args.early_stopping_patience}")
                        else:
                            if verbose:
                                logger.info(f" Patience of {args.early_stopping_patience} steps reached")
                                logger.info(" Training terminated.")
                                train_iterator.close()
                            return global_step, tr_loss / global_step

    return global_step, tr_loss / global_step
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). If `self.targeted` is true, then `y_val` represents the target labels. Otherwise, the
              targets are the original class labels.
    :return: An array holding the adversarial examples.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes)
    x_adv = x.astype(ART_NUMPY_DTYPE)

    if self.estimator.clip_values is not None:
        clip_min_per_pixel, clip_max_per_pixel = self.estimator.clip_values
    else:
        clip_min_per_pixel, clip_max_per_pixel = np.amin(x), np.amax(x)

    # Assert that, if attack is targeted, y_val is provided:
    if self.targeted and y is None:
        raise ValueError("Target labels `y` need to be provided for a targeted attack.")

    # No labels provided, use model prediction as correct class
    if y is None:
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    # Compute perturbation with implicit batching
    nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
    for batch_id in trange(nb_batches, desc="C&W L_inf", disable=not self.verbose):
        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        x_batch = x_adv[batch_index_1:batch_index_2]
        y_batch = y[batch_index_1:batch_index_2]

        # Determine values for later clipping
        clip_min = np.clip(x_batch - self.eps, clip_min_per_pixel, clip_max_per_pixel)
        clip_max = np.clip(x_batch + self.eps, clip_min_per_pixel, clip_max_per_pixel)

        # The optimization is performed in tanh space to keep the adversarial images bounded
        # from clip_min and clip_max.
        x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother)

        # Initialize perturbation in tanh space:
        x_adv_batch = x_batch.copy()
        x_adv_batch_tanh = x_batch_tanh.copy()

        # Initialize optimization:
        z_logits, loss = self._loss(x_adv_batch, y_batch)
        attack_success = loss <= 0
        learning_rate = self.learning_rate * np.ones(x_batch.shape[0])

        for i_iter in range(self.max_iter):
            logger.debug("Iteration step %i out of %i", i_iter, self.max_iter)
            logger.debug("Average Loss: %f", np.mean(loss))
            logger.debug(
                "Successful attack samples: %i out of %i",
                int(np.sum(attack_success)),
                x_batch.shape[0],
            )

            # only continue optimization for those samples where attack hasn't succeeded yet:
            active = ~attack_success
            if np.sum(active) == 0:
                break

            # compute gradient:
            logger.debug("Compute loss gradient")
            perturbation_tanh = -self._loss_gradient(
                z_logits[active],
                y_batch[active],
                x_adv_batch[active],
                x_adv_batch_tanh[active],
                clip_min[active],
                clip_max[active],
            )

            # perform line search to optimize perturbation
            # first, halve the learning rate until perturbation actually decreases the loss:
            prev_loss = loss.copy()
            best_loss = loss.copy()
            best_lr = np.zeros(x_batch.shape[0])
            halving = np.zeros(x_batch.shape[0])

            for i_halve in range(self.max_halving):
                logger.debug("Perform halving iteration %i out of %i", i_halve, self.max_halving)
                do_halving = loss[active] >= prev_loss[active]
                logger.debug("Halving to be performed on %i samples", int(np.sum(do_halving)))
                if np.sum(do_halving) == 0:
                    break
                active_and_do_halving = active.copy()
                active_and_do_halving[active] = do_halving

                lr_mult = learning_rate[active_and_do_halving]
                for _ in range(len(x.shape) - 1):
                    lr_mult = lr_mult[:, np.newaxis]

                adv_10 = x_adv_batch_tanh[active_and_do_halving]
                new_x_adv_batch_tanh = adv_10 + lr_mult * perturbation_tanh[do_halving]

                new_x_adv_batch = tanh_to_original(
                    new_x_adv_batch_tanh,
                    clip_min[active_and_do_halving],
                    clip_max[active_and_do_halving],
                )
                _, loss[active_and_do_halving] = self._loss(new_x_adv_batch, y_batch[active_and_do_halving])
                logger.debug("New Average Loss: %f", np.mean(loss))
                logger.debug("Loss: %s", str(loss))
                logger.debug("Prev_loss: %s", str(prev_loss))
                logger.debug("Best_loss: %s", str(best_loss))

                best_lr[loss < best_loss] = learning_rate[loss < best_loss]
                best_loss[loss < best_loss] = loss[loss < best_loss]

                learning_rate[active_and_do_halving] /= 2
                halving[active_and_do_halving] += 1
            learning_rate[active] *= 2

            # if no halving was actually required, double the learning rate as long as this
            # decreases the loss:
            for i_double in range(self.max_doubling):
                logger.debug("Perform doubling iteration %i out of %i", i_double, self.max_doubling)
                do_doubling = (halving[active] == 1) & (loss[active] <= best_loss[active])
                logger.debug("Doubling to be performed on %i samples", int(np.sum(do_doubling)))
                if np.sum(do_doubling) == 0:
                    break
                active_and_do_doubling = active.copy()
                active_and_do_doubling[active] = do_doubling
                learning_rate[active_and_do_doubling] *= 2

                lr_mult = learning_rate[active_and_do_doubling]
                for _ in range(len(x.shape) - 1):
                    lr_mult = lr_mult[:, np.newaxis]

                x_adv15 = x_adv_batch_tanh[active_and_do_doubling]
                new_x_adv_batch_tanh = x_adv15 + lr_mult * perturbation_tanh[do_doubling]
                new_x_adv_batch = tanh_to_original(
                    new_x_adv_batch_tanh,
                    clip_min[active_and_do_doubling],
                    clip_max[active_and_do_doubling],
                )
                _, loss[active_and_do_doubling] = self._loss(new_x_adv_batch, y_batch[active_and_do_doubling])
                logger.debug("New Average Loss: %f", np.mean(loss))
                best_lr[loss < best_loss] = learning_rate[loss < best_loss]
                best_loss[loss < best_loss] = loss[loss < best_loss]

            learning_rate[halving == 1] /= 2

            update_adv = best_lr[active] > 0
            logger.debug("Number of adversarial samples to be finally updated: %i", int(np.sum(update_adv)))

            if np.sum(update_adv) > 0:
                active_and_update_adv = active.copy()
                active_and_update_adv[active] = update_adv
                best_lr_mult = best_lr[active_and_update_adv]
                for _ in range(len(x.shape) - 1):
                    best_lr_mult = best_lr_mult[:, np.newaxis]

                best_13 = best_lr_mult * perturbation_tanh[update_adv]
                x_adv_batch_tanh[active_and_update_adv] = x_adv_batch_tanh[active_and_update_adv] + best_13
                x_adv_batch[active_and_update_adv] = tanh_to_original(
                    x_adv_batch_tanh[active_and_update_adv],
                    clip_min[active_and_update_adv],
                    clip_max[active_and_update_adv],
                )
                (z_logits[active_and_update_adv], loss[active_and_update_adv]) = self._loss(
                    x_adv_batch[active_and_update_adv],
                    y_batch[active_and_update_adv],
                )
                attack_success = loss <= 0

        # Update depending on attack success:
        x_adv_batch[~attack_success] = x_batch[~attack_success]
        x_adv[batch_index_1:batch_index_2] = x_adv_batch

    logger.info(
        "Success rate of C&W L_inf attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size),
    )
    return x_adv
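
# The C&W loop above optimises in tanh space so that mapping back through tanh keeps every
# value inside [clip_min, clip_max] by construction. A self-contained numpy sketch of that
# change of variables; the function names are assumptions, and the smoothing constant mirrors
# the _tanh_smoother attribute used above.
import numpy as np

def to_tanh_space(x, clip_min, clip_max, smoother=0.999999):
    x_norm = (x - clip_min) / (clip_max - clip_min)  # rescale to (0, 1)
    return np.arctanh((x_norm * 2 - 1) * smoother)   # then map to an unconstrained space

def from_tanh_space(x_tanh, clip_min, clip_max, smoother=0.999999):
    x_norm = (np.tanh(x_tanh) / smoother + 1) / 2    # back to (0, 1)
    return x_norm * (clip_max - clip_min) + clip_min  # and into [clip_min, clip_max]

x_demo = np.array([0.0, 0.5, 1.0])
np.testing.assert_allclose(from_tanh_space(to_tanh_space(x_demo, 0.0, 1.0), 0.0, 1.0), x_demo, atol=1e-6)
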
def _sample_chain(self, rng, n_sample, init_state, chain_var_funcs, chain_index, parallel_chains,
                  memmap_enabled, memmap_path):
    if not isinstance(init_state, ChainState):
        state = ChainState(init_state)
    else:
        state = init_state
    chain_stats = self._init_chain_stats(n_sample, memmap_enabled, memmap_path, chain_index)
    # Initialise chain variable trace arrays
    chains = {}
    for key, chain_func in chain_var_funcs.items():
        var = chain_func(state)
        if memmap_enabled:
            filename = self._generate_memmap_filename(memmap_path, 'trace', key, chain_index)
            chains[key] = self._open_new_memmap(filename, (n_sample,) + var.shape, np.float64, np.nan)
        else:
            chains[key] = np.full((n_sample,) + var.shape, np.nan)
    total_return_nbytes = get_size(chain_stats) + get_size(chains)
    # Check if running in parallel and if total number of bytes to be
    # returned exceeds pickle limit
    if parallel_chains and total_return_nbytes > 2**31 - 1:
        raise RuntimeError(
            f'Total number of bytes allocated for arrays to be returned '
            f'({total_return_nbytes / 2**30:.2f} GiB) exceeds size limit '
            f'for returning results of a process (2 GiB). Try rerunning '
            f'with chain memory-mapping enabled (`memmap_enabled=True`).')
    if TQDM_AVAILABLE:
        desc = 'Sampling' if chain_index is None else f'Chain {chain_index}'
        position = chain_index if parallel_chains else None
        sample_range = tqdm.trange(
            n_sample, desc=desc, unit='it', dynamic_ncols=True, position=position)
    else:
        sample_range = range(n_sample)
    try:
        for sample_index in sample_range:
            for trans_key, transition in self.transitions.items():
                state, trans_stats = transition.sample(state, rng)
                if trans_stats is not None:
                    if trans_key not in chain_stats:
                        logger.warning(
                            f'Transition {trans_key} returned statistics '
                            f'but has no `statistic_types` attribute.')
                    for key, val in trans_stats.items():
                        if key in chain_stats[trans_key]:
                            chain_stats[trans_key][key][sample_index] = val
            for key, chain_func in chain_var_funcs.items():
                var = chain_func(state)
                chains[key][sample_index] = var
    except KeyboardInterrupt:
        if memmap_enabled:
            # chains.values() must be called to iterate over the memmaps
            for chain in chains.values():
                chain.flush()
            for trans_stats in chain_stats.values():
                for stat in trans_stats.values():
                    stat.flush()
    else:
        # If not interrupted increment sample_index so that it equals
        # n_sample to flag chain completed sampling
        sample_index += 1
    if parallel_chains and memmap_enabled:
        trace_filenames = self._memmaps_to_filenames(chains)
        stats_filenames = self._memmaps_to_filenames(chain_stats)
        return trace_filenames, stats_filenames, sample_index
    return chains, chain_stats, sample_index
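
# _sample_chain above optionally backs each trace array with a memory-mapped file so that long
# parallel chains do not have to be pickled back to the parent process. A minimal sketch of
# allocating and filling such a trace with numpy alone; the directory, filename and function
# name are assumptions for illustration.
import os
import tempfile
import numpy as np

def open_trace_memmap(n_sample, var_shape, directory=None):
    directory = directory or tempfile.mkdtemp()
    filename = os.path.join(directory, 'trace_pos.npy')
    trace = np.lib.format.open_memmap(filename, mode='w+', dtype=np.float64, shape=(n_sample,) + var_shape)
    trace[:] = np.nan  # pre-fill so an interrupted chain is distinguishable from sampled values
    return trace, filename

trace, path = open_trace_memmap(n_sample=100, var_shape=(3,))
trace[0] = np.zeros(3)  # writes go straight to the file-backed buffer
trace.flush()           # as in the KeyboardInterrupt handler above
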