def __init__(
    self,
    model: str,
    custom_model: PreTrainedModel = None,
    custom_tokenizer: PreTrainedTokenizer = None,
):
    """
    :param model: String path (or known keyword) for the BERT weights. If a keyword is given, the corresponding S3 path is used.
    :param custom_model: Optional custom BERT model to use instead of the keyword lookup.
    :param custom_tokenizer: Optional custom tokenizer to pair with the custom model.
    """
    base_model, base_tokenizer = self.MODELS.get(model, (None, None))

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if custom_model:
        self.model = custom_model.to(self.device)
    else:
        self.model = base_model.from_pretrained(
            model, output_hidden_states=True).to(self.device)

    if custom_tokenizer:
        self.tokenizer = custom_tokenizer
    else:
        self.tokenizer = base_tokenizer.from_pretrained(model)

    self.model.eval()
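# Usage sketch (hedged, not from the original source): this __init__ belongs to a class whose
# MODELS attribute maps keyword strings to (model class, tokenizer class) pairs. The class name
# `BertEmbedder` below is a placeholder, and "bert-base-uncased" is only an example checkpoint.
from transformers import AutoModel, AutoTokenizer

custom_model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
custom_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# embedder = BertEmbedder("bert-base-uncased",
#                         custom_model=custom_model,
#                         custom_tokenizer=custom_tokenizer)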
def __init__(self,
             model: transformers.PreTrainedModel,
             tokenizer: transformers.PreTrainedTokenizerFast,
             model_name: str,
             config: Dict[str, Any],
             collect_activations_flag: Optional[bool] = False,
             collect_activations_layer_nums: Optional[List[int]] = None,  # None --> collect for all layers
             verbose: Optional[bool] = True,
             gpu: Optional[bool] = True
             ):
    """
    Creates an LM object given a model and tokenizer.

    Args:
        model: HuggingFace Transformers PyTorch language model.
        tokenizer: The tokenizer associated with the model.
        model_name: The name of the model. Used to retrieve required settings (like what the embedding layer is called).
        config: Configuration holding the information about the layers whose activations we will collect.
        collect_activations_flag: True if we want to collect activations.
        collect_activations_layer_nums: If collecting activations, we can use this parameter to indicate
            which layers to track. By default this is None and we collect activations for all layers.
        verbose: If True, model.generate() displays output tokens in HTML as they're generated.
        gpu: Set to False to force using the CPU even if a GPU exists.
    """
    self.model_name = model_name
    self.model = model
    if torch.cuda.is_available() and gpu:
        self.model = model.to('cuda')

    self.device = 'cuda' if torch.cuda.is_available() \
        and self.model.device.type == 'cuda' \
        else 'cpu'

    self.tokenizer = tokenizer
    self.verbose = verbose
    self._path = os.path.dirname(ecco.__file__)

    # Neuron Activation
    self.collect_activations_flag = collect_activations_flag
    self.collect_activations_layer_nums = collect_activations_layer_nums

    # For each model, this indicates the layer whose activations
    # we will collect
    self.model_config = config
    try:
        self.model_type = self.model_config['type']
        embeddings_layer_name = self.model_config['embedding']
        embed_retriever = attrgetter(embeddings_layer_name)
        self.model_embeddings = embed_retriever(self.model)
        self.collect_activations_layer_name_sig = self.model_config['activations'][0]
    except KeyError as e:
        raise ValueError(
            f"The model '{self.model_name}' is not correctly configured in Ecco's 'model-config.yaml' file"
        ) from e

    assert self.model_type in ['causal', 'mlm', 'enc-dec'], \
        f"model type {self.model_type} not found"

    self._reset()
def get_number_perfect_predictions(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, eval_data_file):
    labels_file = str(eval_data_file).replace('masked_code_', 'mask_')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Inputs
    with open(eval_data_file) as f:
        inputs = f.readlines()
    inputs = [x.strip() for x in inputs]

    # Targets
    with open(labels_file) as f:
        targets = f.readlines()
    targets = [x.strip() for x in targets]

    n_perfect_predictions = 0

    for i in range(len(inputs)):
        input_line = inputs[i]
        # The target is the masked span with whitespace removed and the '<z>' sentinel stripped
        target = "".join(targets[i].split()).replace('<z>', '')

        indexed_tokens = tokenizer.encode(input_line)
        tokens_tensor = torch.tensor([indexed_tokens]).to(device)

        with torch.no_grad():
            outputs = model(tokens_tensor)
            predictions = outputs[0]

        # Greedily take the argmax token at each position until the '<z>' sentinel is predicted
        predicted_sentence = []
        for token in torch.argmax(predictions[0], 1).cpu().numpy():
            if token != tokenizer.convert_tokens_to_ids('<z>'):
                predicted_sentence.append(token)
            else:
                break

        prediction = tokenizer.decode(predicted_sentence)
        prediction = "".join(prediction.split())

        if target == prediction:
            n_perfect_predictions += 1

    return n_perfect_predictions, len(inputs)
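# Usage sketch (hedged): the checkpoint path and eval file below are placeholders, and the
# tokenizer is assumed to contain the '<z>' sentinel token used by the function above.
from transformers import AutoModelForMaskedLM, AutoTokenizer

eval_model = AutoModelForMaskedLM.from_pretrained("path/to/finetuned-checkpoint")
eval_tokenizer = AutoTokenizer.from_pretrained("path/to/finetuned-checkpoint")
perfect, total = get_number_perfect_predictions(eval_model, eval_tokenizer,
                                                "data/masked_code_test.txt")
print(f"{perfect}/{total} perfect predictions ({perfect / total:.2%})")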
def __init__(self,
             model: PreTrainedModel,
             args: TrainingArguments,
             data_collator: Optional[DataCollator] = None,
             train_dataset: Optional[Dataset] = None,
             eval_dataset: Optional[Dataset] = None,
             compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
             prediction_loss_only=False,
             tb_writer: Optional["SummaryWriter"] = None,
             optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
             mi_estimator: Optional[CLUB] = None):
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch,
    optimized for Transformers.

    Args:
        prediction_loss_only:
            (Optional) In evaluation and prediction, only return the loss.
    """
    self.model = model.to(args.device)
    self.args = args
    if data_collator is not None:
        self.data_collator = data_collator
    else:
        self.data_collator = DefaultDataCollator()
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only
    self.mi_estimator = mi_estimator
    self.optimizers = optimizers
    if tb_writer is not None:
        self.tb_writer = tb_writer
    elif is_tensorboard_available() and self.is_world_master():
        self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
    if not is_tensorboard_available():
        logger.warning(
            "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
        )
    if is_wandb_available():
        self._setup_wandb()
    else:
        logger.info(
            "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
            "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface."
        )
    set_seed(self.args.seed)
    # Create output directory if needed
    if self.is_world_master():
        os.makedirs(self.args.output_dir, exist_ok=True)
    if is_tpu_available():
        # Set an xla_device flag on the model's config.
        # We'll find a more elegant way that does not need this in the future.
        self.model.config.xla_device = True
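# Usage sketch (hedged): only the constructor signature comes from the snippet above; the
# checkpoint name, the torch Dataset `train_dataset`, and the assumption that this Trainer
# exposes a Hugging Face-style .train() method are all illustrative.
from transformers import AutoModelForSequenceClassification, TrainingArguments

training_args = TrainingArguments(output_dir="./outputs")
cls_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# trainer = Trainer(model=cls_model, args=training_args,
#                   train_dataset=train_dataset,   # a torch.utils.data.Dataset (placeholder)
#                   mi_estimator=None)             # the CLUB estimator is optional
# trainer.train()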
def predict_task_split(self, model: transformers.PreTrainedModel, inputs: tf.data.Dataset, task: Task,
                       max_length: int = 140, min_length: int = 55) -> typing.Sequence[typing.Sequence[int]]:
    try:
        outputs = []
        model.to(self.device)
        for batch_inputs in tqdm.tqdm(inputs.as_numpy_iterator(),
                                      desc="Predicting %s" % task,
                                      unit="batch", leave=False):
            with torch.no_grad():
                model.eval()
                forward_params = self.prepare_forward_inputs(model, batch_inputs)
                batch_outputs = model.generate(forward_params['input_ids'],
                                               attention_mask=forward_params['attention_mask'],
                                               do_sample=False,
                                               max_length=GENERATION_MAX_LENGTHS.get(task.dataset, max_length) + 2,
                                               min_length=GENERATION_MIN_LENGTHS.get(task.dataset, min_length) + 1,
                                               num_beams=4,
                                               length_penalty=2.,
                                               no_repeat_ngram_size=3,
                                               early_stopping=True)
                batch_outputs = batch_outputs.detach().cpu().numpy()
                outputs.extend(batch_outputs)
        return outputs
    # We can't just except tf.errors.UnknownError, because it is thrown as some sort of weird proxy
    # instance of a tf.errors.UnknownError and python's pattern matching can't handle the scandal
    except Exception as e:
        if isinstance(e, tf.errors.UnknownError):
            logging.warning('Encountered error: %s on %s: %s', type(e), task, e)
            # Unfortunately, we don't get a more helpful error type, but this usually means
            # that the dataset has no labels for a given split (e.g., test evaluation occurs on a server)
            return []
        else:
            # We got a different exception type so let python freak out accordingly
            logging.error('Encountered error: %s on %s: %s', type(e), task, e)
            raise e
def train_model(df: pd.DataFrame, tokenizer: transformers.PreTrainedTokenizer,
                model: transformers.PreTrainedModel, steps: int, batch_size: int, save_path: str) -> None:
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()
    optim = AdamW(model.parameters(), lr=5e-5)

    losses = []
    for step in trange(steps):
        optim.zero_grad()

        # Sample a random mini-batch from the DataFrame
        sample = df.sample(batch_size)
        X = sample['articles'].tolist()
        y = sample['labels'].tolist()

        inputs = tokenizer(X, return_tensors='pt', padding=True, truncation=True)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = torch.tensor(y).unsqueeze(1).to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Store only the scalar value so the computation graph can be freed
        losses.append(loss.item())
        if (step + 1) % 100 == 0:
            avg_loss = sum(losses) / len(losses)
            print(f'Step: {step + 1} Loss: {avg_loss}')
            send_message(f'Step: {step + 1} Loss: {avg_loss}')
            losses = []

        loss.backward()
        optim.step()

    model.save_pretrained(save_path)
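# Usage sketch (hedged): the DataFrame columns 'articles' and 'labels' match what train_model()
# samples above; the checkpoint, hyperparameters, and the toy DataFrame are placeholders
# (a real DataFrame needs at least `batch_size` rows for df.sample()).
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer

df = pd.DataFrame({"articles": ["an example news article"] * 32,
                   "labels": [0.5] * 32})
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
reg_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                               num_labels=1)
train_model(df, tok, reg_model, steps=500, batch_size=8, save_path="./model_out")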
def __init__(
    self,
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    device: str = "cpu",
    batch_size: int = 1,
    normalize: bool = False,
):
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = tokenizer
    self.device = torch.device(device)
    self.model = model.to(self.device).eval()
    self.batch_size = batch_size
    self.normalize = normalize
    self._loss_fn = CrossEntropyLoss(ignore_index=-1)
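# Usage sketch (hedged): this __init__ appears to belong to a language-model scoring class;
# the class name `LMScorer` is a placeholder, and "gpt2" is only an example checkpoint.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
# scorer = LMScorer(gpt2_tokenizer, gpt2_model, device="cpu", batch_size=4, normalize=True)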
def __init__(
    self,
    model: PreTrainedModel,
    args: TrainingArguments,
    device,
    data_collator: Optional[DataCollator] = None,
    train_dataset: Optional[Dataset] = None,
    eval_dataset: Optional[Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    prediction_loss_only=False,
    tb_writer: Optional["SummaryWriter"] = None,
    optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
):
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch,
    optimized for Transformers.

    Args:
        prediction_loss_only:
            (Optional) in evaluation and prediction, only return the loss
    """
    self.model = model.to(device)
    self.args = args
    self.device = device
    if data_collator is not None:
        self.data_collator = data_collator
    else:
        self.data_collator = DefaultDataCollator()
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only
    self.optimizers = optimizers
    if tb_writer is not None:
        self.tb_writer = tb_writer
    elif is_tensorboard_available() and self.is_world_master():
        self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
    if not is_tensorboard_available():
        logger.warning(
            "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
        )
    self._setup_wandb()
    set_seed(self.args.seed)
    # Create output directory if needed
    if self.is_world_master():
        os.makedirs(self.args.output_dir, exist_ok=True)
def __init__(
    self,
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    device: str = "cpu",
    batch_size: int = 32,
    add_special_tokens: bool = False,
    normalize: bool = False,
    max_token_limit: int = 128,
):
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = tokenizer
    self.device = torch.device(device)
    self.model = model.to(self.device).eval()
    self.batch_size = batch_size
    self.normalize = normalize
    self._add_special_tokens = add_special_tokens
    self.max_token_limit = max_token_limit
def __init__(
    self,
    transformer: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    do_cls_tokens: bool = False,
    do_max_tokens: bool = False,
    do_sqrt_tokens: bool = False,
    do_mean_tokens: bool = True,
    device: str = 'cpu',
):
    super(SentenceEncoder, self).__init__()
    self.device = torch.device(device)
    self.transformer = transformer.to(self.device)
    self.pooling = SentenceEncoderPooling(
        do_cls_tokens=do_cls_tokens,
        do_max_tokens=do_max_tokens,
        do_sqrt_tokens=do_sqrt_tokens,
        do_mean_tokens=do_mean_tokens,
        hidden_size=transformer.config.hidden_size,
    )
    self.tokenizer = tokenizer
def __init__(self,
             model: transformers.PreTrainedModel,
             tokenizer: transformers.PreTrainedTokenizerFast,
             model_name: str,
             collect_activations_flag: Optional[bool] = False,
             collect_activations_layer_nums: Optional[List[int]] = None,  # None --> collect for all layers
             verbose: Optional[bool] = True,
             gpu: Optional[bool] = True):
    """
    Creates an LM object given a model and tokenizer.

    Args:
        model: HuggingFace Transformers PyTorch language model.
        tokenizer: The tokenizer associated with the model.
        model_name: The name of the model. Used to retrieve required settings (like what the embedding layer is called).
        collect_activations_flag: True if we want to collect activations.
        collect_activations_layer_nums: If collecting activations, we can use this parameter to indicate
            which layers to track. By default this is None and we collect activations for all layers.
        verbose: If True, model.generate() displays output tokens in HTML as they're generated.
        gpu: Set to False to force using the CPU even if a GPU exists.
    """
    self.model_name = model_name
    self.model = model
    if torch.cuda.is_available() and gpu:
        self.model = model.to('cuda')

    self.device = 'cuda' if torch.cuda.is_available() \
        and self.model.device.type == 'cuda' \
        else 'cpu'

    self.tokenizer = tokenizer
    self.verbose = verbose
    self._path = os.path.dirname(ecco.__file__)

    # Neuron Activation
    self.collect_activations_flag = collect_activations_flag
    self.collect_activations_layer_nums = collect_activations_layer_nums

    # For each model, this indicates the layer whose activations
    # we will collect
    configs = yaml.safe_load(
        open(os.path.join(self._path, "model-config.yaml")))
    try:
        self.model_config = configs[self.model_name]
        embeddings_layer_name = self.model_config['embedding']
        embed_retriever = attrgetter(embeddings_layer_name)
        self.model_embeddings = embed_retriever(self.model)
        self.collect_activations_layer_name_sig = self.model_config['activations'][0]
    except KeyError as e:
        raise ValueError(
            f"The model '{self.model_name}' is not defined in Ecco's 'model-config.yaml' file and"
            f" so is not explicitly supported yet. Supported models are:",
            list(configs.keys())) from e

    self._hooks = {}
    self._reset()
    self._attach_hooks(self.model)
def train(self, model: transformers.PreTrainedModel,
          training_tasks: typing.List[Task],
          validation_tasks: typing.List[Task],
          num_epochs: int,
          batch_size: int,
          steps_per_epoch: int,
          prefetch_size: int,
          eval_batch_size: typing.Optional[int] = None,
          eval_batches: typing.Optional[int] = None,
          checkpoint_file: typing.Optional[str] = None) -> None:
    logging.info('Preparing kitchen sink with %d training tasks: %s', len(training_tasks), training_tasks)

    # Train the model & return its training history
    logging.info('Beginning training...')
    training_data, data_sizes = self.load_train_data(training_tasks,
                                                     batch_size=batch_size,
                                                     prefetch_size=prefetch_size)

    if validation_tasks:
        logging.info('Preparing kitchen sink with %d validation tasks: %s', len(validation_tasks), validation_tasks)
        validation_data = self.load_valid_data(validation_tasks,
                                               batch_size=eval_batch_size or batch_size,
                                               prefetch_size=prefetch_size,
                                               num_batches=eval_batches)
    else:
        validation_data = None
        logging.info('Preparing kitchen sink without validation')

    num_epochs += self.warmup_epochs
    optimizer, scheduler = get_optimizer(model,
                                         num_warmup_steps=self.warmup_epochs * steps_per_epoch,
                                         num_training_steps=num_epochs * steps_per_epoch)
    model.to(self.device)

    if self.use_amp:
        if not is_apex_available():
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    global_step = 0
    tr_loss = 0.0
    logging_loss = 0.0
    model.zero_grad()

    train_itr = tqdm.trange(0, num_epochs * steps_per_epoch, desc="Training", unit="batch")
    tasks = [task.dataset for task in training_tasks]
    mixing_rates = self.get_mixing_rate(tasks, data_sizes)
    total_task_steps = Counter({task: np.float32(0.) for task in tasks})

    for epoch in range(1, num_epochs + 1):
        epoch_itr = tqdm.trange(0, steps_per_epoch, desc="Epoch %d" % epoch, leave=False, unit="batch")
        epoch_task_steps = Counter({task: np.float32(0.) for task in tasks})
        running_task_losses = {task: np.float32(0.) for task in tasks}

        for step in epoch_itr:
            # Sample the next batch from one of the task datasets according to the mixing rates
            inputs, labels, _ = next(np.random.choice(training_data, p=mixing_rates))
            step_loss = self._train_step(model, inputs, labels, optimizer)
            tr_loss += step_loss
            train_itr.update()

            task = inputs['task'][0].decode('UTF-8')
            epoch_task_steps[task] += 1
            running_task_losses[task] += step_loss

            if (step + 1) % self.gradient_accumulation_steps == 0 or (
                    # last step in epoch but step is always smaller than gradient_accumulation_steps
                    self.gradient_accumulation_steps >= steps_per_epoch == (step + 1)):
                if self.use_amp:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

        total_tasks = sum(epoch_task_steps.values())
        print('Epoch %d: Empirical Mixing Rates: %s' % (
            epoch, '; '.join('{:s}: {:0>5.2f}%'.format(task, rate * 100. / total_tasks)
                             for task, rate in epoch_task_steps.items())
        ))
        print('Epoch %d: Expected Mixing Rates: %s' % (
            epoch, '; '.join('{:s}: {:0>5.2f}%'.format(task, rate * 100.)
                             for task, rate in zip(tasks, mixing_rates))
        ))

        mixing_losses = [loss / epoch_task_steps[task] for task, loss in running_task_losses.items()]
        print('Epoch %d: Training Losses: %s' % (
            epoch, '; '.join('{:s}: {:g}'.format(task, loss) for task, loss in zip(tasks, mixing_losses))
        ))

        if epoch > self.warmup_epochs:
            total_task_steps += epoch_task_steps
            exploration_ratios = np.array([total_task_steps.get(task, np.float32(0)) / size
                                           for task, size in zip(tasks, data_sizes)])
            print('Epoch %d: Exploration Ratios: %s' % (
                epoch, '; '.join('{:s}: {:0>5.2f}%'.format(task, ratio * 100.)
                                 for task, ratio in zip(tasks, exploration_ratios))
            ))
            if not self.mix_from_validation:
                # Smooth per-task losses towards the mean for tasks that have barely been explored
                avg_loss = np.nanmean(mixing_losses)
                mixing_losses = [er * loss + (1. - er) * avg_loss
                                 for er, loss in zip(exploration_ratios, np.nan_to_num(mixing_losses))]

        valid_steps = 0
        running_valid_loss = 0.
        if validation_data:
            epoch_task_steps = {task: np.float32(0.) for task in tasks}
            running_task_losses = {task: np.float32(0.) for task in tasks}
            with torch.no_grad():
                for step, (inputs, labels, _) in enumerate(validation_data.as_numpy_iterator(), 1):
                    model.eval()
                    # Run the forward pass
                    valid_step_loss = model(**self.prepare_forward_inputs(model, inputs, labels))[0].item()
                    running_valid_loss += valid_step_loss
                    valid_task = inputs['task'][0].decode('UTF-8')
                    if valid_task in tasks:
                        epoch_task_steps[valid_task] += 1
                        running_task_losses[valid_task] += valid_step_loss
                    valid_steps += 1
            avg_val_loss = running_valid_loss / valid_steps

            # Save checkpoint if validation loss decreases and checkpoint file has been provided
            if checkpoint_file:
                if epoch == 1:
                    best_val_loss = avg_val_loss
                    logging.info("Saving best model with initial validation loss {0}".format(best_val_loss))
                    self.save_model(model, "{0}_best".format(checkpoint_file))
                else:
                    if avg_val_loss < best_val_loss:
                        best_val_loss = avg_val_loss
                        logging.info(
                            "Saving new best model with validation loss {0} (epoch {1})".format(best_val_loss, epoch))
                        self.save_model(model, "{0}_best".format(checkpoint_file))

            print('Epoch {:d}: Validation Losses: {:s}'.format(
                epoch, '; '.join('{:s}: {:g}'.format(task, loss / epoch_task_steps[task])
                                 for task, loss in running_task_losses.items())
            ))
            if self.mix_from_validation:
                mixing_losses = [loss / epoch_task_steps[task] for task, loss in running_task_losses.items()]

        if epoch > self.warmup_epochs and self.dynamic_mixing:
            new_mixing_rates = self.get_mixing_rate(
                tasks=tasks,
                rates=mixing_losses,
                normalize=False,
                temperature=(1. / self.temperature)
            )
            print('Epoch {:d}: Updating Mixing Rate: {:s}'.format(
                epoch, '; '.join(
                    '{:s}: {:0>5.2f}%->{:0>5.2f}% (Δ={:0>5.2f})'.format(
                        task, old_rate * 100., smooth_rate * 100., (smooth_rate - old_rate) * 100.)
                    for task, old_rate, smooth_rate in zip(tasks, mixing_rates, new_mixing_rates))
            ))
            mixing_rates = new_mixing_rates
            logging.debug('Mixing rates (shape=%s; |tasks|=%d): %s', mixing_rates.shape, len(tasks), mixing_rates)

        lr = scheduler.get_last_lr()[0]
        loss_scalar = (tr_loss - logging_loss) / steps_per_epoch
        logging_loss = tr_loss
        train_itr.write('Global step: %d, lr: %g, loss: %g, val_loss: %g' % (
            global_step, lr, loss_scalar,
            running_valid_loss / valid_steps if valid_steps > 0 else np.NaN))

        if not np.isfinite(loss_scalar):
            logging.info('Loss was NaN, ending training after %d epochs.', epoch)
            train_itr.close()
            return

    train_itr.close()