Example 1
    def __init__(
        self,
        model: str,
        custom_model: PreTrainedModel = None,
        custom_tokenizer: PreTrainedTokenizer = None,
    ):
        """
        :param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used.
        :param custom_model: This is optional if a custom bert model is used.
        :param custom_tokenizer: Place to use custom tokenizer.
        """
        base_model, base_tokenizer = self.MODELS.get(model, (None, None))

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        if custom_model:
            self.model = custom_model.to(self.device)
        else:
            self.model = base_model.from_pretrained(
                model, output_hidden_states=True).to(self.device)

        if custom_tokenizer:
            self.tokenizer = custom_tokenizer
        else:
            self.tokenizer = base_tokenizer.from_pretrained(model)

        self.model.eval()
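
For reference, the loading pattern used above (keyword lookup, device selection, eval mode) can be reproduced standalone. The following is a minimal sketch with an illustrative one-entry MODELS map, not the class's actual mapping:

import torch
from transformers import BertModel, BertTokenizer

MODELS = {'bert-base-uncased': (BertModel, BertTokenizer)}  # illustrative keyword map

model_name = 'bert-base-uncased'
base_model, base_tokenizer = MODELS[model_name]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = base_model.from_pretrained(model_name, output_hidden_states=True).to(device)
tokenizer = base_tokenizer.from_pretrained(model_name)
model.eval()  # inference only: disables dropout
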
Example 2
    def __init__(self,
                 model: transformers.PreTrainedModel,
                 tokenizer: transformers.PreTrainedTokenizerFast,
                 model_name: str,
                 config: Dict[str, Any],
                 collect_activations_flag: Optional[bool] = False,
                 collect_activations_layer_nums: Optional[List[int]] = None,  # None --> collect for all layers
                 verbose: Optional[bool] = True,
                 gpu: Optional[bool] = True
                 ):
        """
        Creates an LM object given a model and tokenizer.

        Args:
            model: Hugging Face Transformers PyTorch language model.
            tokenizer: The tokenizer associated with the model.
            model_name: The name of the model. Used to retrieve required settings (like what the embedding layer is called)
            config: Configuration that has the information about the layer whose activations we will collect
            collect_activations_flag: True if we want to collect activations
            collect_activations_layer_nums: If collecting activations, we can use this parameter to indicate which layers
                to track. By default this would be None and we'd collect activations for all layers.
            verbose: If True, model.generate() displays output tokens in HTML as they're generated.
            gpu: Set to False to force using the CPU even if a GPU exists.
        """
        self.model_name = model_name
        self.model = model
        if torch.cuda.is_available() and gpu:
            self.model = model.to('cuda')

        self.device = ('cuda' if torch.cuda.is_available()
                       and self.model.device.type == 'cuda' else 'cpu')

        self.tokenizer = tokenizer
        self.verbose = verbose
        self._path = os.path.dirname(ecco.__file__)

        # Neuron Activation
        self.collect_activations_flag = collect_activations_flag
        self.collect_activations_layer_nums = collect_activations_layer_nums

        # For each model, this indicates the layer whose activations
        # we will collect
        self.model_config = config
        try:
            self.model_type = self.model_config['type']
            embeddings_layer_name = self.model_config['embedding']
            embed_retriever = attrgetter(embeddings_layer_name)
            self.model_embeddings = embed_retriever(self.model)
            self.collect_activations_layer_name_sig = self.model_config['activations'][0]
        except KeyError as e:
            raise ValueError(
                f"The model '{self.model_name}' is not correctly configured in Ecco's 'model-config.yaml' file"
            ) from e

        assert self.model_type in ['causal', 'mlm', 'enc-dec'], f"model type {self.model_type} not found"

        self._reset()
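
The constructor above only reads the keys 'type', 'embedding', and 'activations' from `config`. A sketch of the shape such a dict could take for a GPT-2-style model follows; the concrete values are assumptions inferred from how the keys are used, not ecco's shipped model-config.yaml:

# Illustrative config dict; values are assumptions, not ecco's actual configuration.
gpt2_like_config = {
    'type': 'causal',                # must be one of 'causal', 'mlm', 'enc-dec'
    'embedding': 'transformer.wte',  # dotted attribute path to the embedding layer (resolved via attrgetter)
    'activations': ['mlp.c_proj'],   # name signature(s) of the layers whose activations are collected
}
# lm = LM(model, tokenizer, model_name='gpt2', config=gpt2_like_config)  # 'LM' per the docstring above
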
Example 3
def get_number_perfect_predictions(model: PreTrainedModel,
                                   tokenizer: PreTrainedTokenizer,
                                   eval_data_file):
    labels_file = str(eval_data_file).replace('masked_code_', 'mask_')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Inputs
    with open(eval_data_file) as f:
        inputs = f.readlines()
    inputs = [x.strip() for x in inputs]

    # Targets
    with open(labels_file) as f:
        targets = f.readlines()
    targets = [x.strip() for x in targets]

    n_perfect_predictions = 0
    for input_text, raw_target in zip(inputs, targets):
        target = "".join(raw_target.split()).replace('<z>', '')

        indexed_tokens = tokenizer.encode(input_text)
        tokens_tensor = torch.tensor([indexed_tokens]).to(device)
        with torch.no_grad():
            outputs = model(tokens_tensor)
            predictions = outputs[0]

        # Greedily decode token ids, stopping at the first predicted '<z>' token
        predicted_sentence = []
        for token in torch.argmax(predictions[0], 1).cpu().numpy():
            if token != tokenizer.convert_tokens_to_ids('<z>'):
                predicted_sentence.append(token)
            else:
                break

        prediction = "".join(tokenizer.decode(predicted_sentence).split())
        if target == prediction:
            n_perfect_predictions += 1

    return n_perfect_predictions, len(inputs)
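
The exact-match comparison removes all whitespace from both strings and strips the '<z>' mask token from the target before comparing. A tiny standalone illustration of that normalization, with made-up strings:

target = "".join("if ( x == null ) <z>".split()).replace('<z>', '')  # -> 'if(x==null)'
prediction = "".join("if ( x == null )".split())                     # -> 'if(x==null)'
assert target == prediction  # counted as a perfect prediction
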
Example 4
    def __init__(self,
                 model: PreTrainedModel,
                 args: TrainingArguments,
                 data_collator: Optional[DataCollator] = None,
                 train_dataset: Optional[Dataset] = None,
                 eval_dataset: Optional[Dataset] = None,
                 compute_metrics: Optional[Callable[[EvalPrediction],
                                                    Dict]] = None,
                 prediction_loss_only=False,
                 tb_writer: Optional["SummaryWriter"] = None,
                 optimizers: Tuple[torch.optim.Optimizer,
                                   torch.optim.lr_scheduler.LambdaLR] = None,
                 mi_estimator: Optional[CLUB] = None):
        """
        Trainer is a simple but feature-complete training and eval loop for PyTorch,
        optimized for Transformers.

        Args:
            prediction_loss_only:
                (Optional) in evaluation and prediction, only return the loss
        """
        self.model = model.to(args.device)
        self.args = args
        if data_collator is not None:
            self.data_collator = data_collator
        else:
            self.data_collator = DefaultDataCollator()
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.compute_metrics = compute_metrics
        self.prediction_loss_only = prediction_loss_only
        self.mi_estimator = mi_estimator
        self.optimizers = optimizers
        if tb_writer is not None:
            self.tb_writer = tb_writer
        elif is_tensorboard_available() and self.is_world_master():
            self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
        if not is_tensorboard_available():
            logger.warning(
                "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
            )
        if is_wandb_available():
            self._setup_wandb()
        else:
            logger.info(
                "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
                "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface."
            )
        set_seed(self.args.seed)
        # Create output directory if needed
        if self.is_world_master():
            os.makedirs(self.args.output_dir, exist_ok=True)
        if is_tpu_available():
            # Set an xla_device flag on the model's config.
            # We'll find a more elegant way to avoid needing this in the future.
            self.model.config.xla_device = True
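
The `optimizers` argument expects a prebuilt (optimizer, scheduler) tuple instead of letting the trainer create its own. A minimal sketch of building such a pair with standard PyTorch and Transformers utilities; the checkpoint name and hyperparameters are placeholders:

import torch
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup

model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=10_000)
optimizers = (optimizer, scheduler)  # passed through the trainer's `optimizers` parameter
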
Example 5
    def predict_task_split(self,
                           model: transformers.PreTrainedModel,
                           inputs: tf.data.Dataset,
                           task: Task,
                           max_length: int = 140,
                           min_length: int = 55) -> typing.Sequence[typing.Sequence[int]]:

        try:
            outputs = []
            model.to(self.device)
            for batch_inputs in tqdm.tqdm(inputs.as_numpy_iterator(),
                                          desc="Predicting %s" % task,
                                          unit="batch", leave=False):
                with torch.no_grad():
                    model.eval()
                    forward_params = self.prepare_forward_inputs(model, batch_inputs)
                    batch_outputs = model.generate(forward_params['input_ids'],
                                                   attention_mask=forward_params['attention_mask'],
                                                   do_sample=False,
                                                   max_length=GENERATION_MAX_LENGTHS.get(task.dataset, max_length) + 2,
                                                   min_length=GENERATION_MIN_LENGTHS.get(task.dataset, min_length) + 1,
                                                   num_beams=4,
                                                   length_penalty=2.,
                                                   no_repeat_ngram_size=3,
                                                   early_stopping=True)

                    batch_outputs = batch_outputs.detach().cpu().numpy()
                    outputs.extend(batch_outputs)
            return outputs
        # We can't just except tf.errors.UnknownError, because it is thrown as some sort of weird proxy
        # instance of a tf.errors.UnknownError and python's pattern matching can't handle the scandal
        except Exception as e:
            if isinstance(e, tf.errors.UnknownError):
                logging.warning('Encountered error: %s on %s: %s', type(e), task, e)
                # Unfortunately, we don't get a more helpful error type, but this usually means
                # that the dataset has no labels for a given split (e.g., test evaluation occurs on a server)
                return []
            else:
                # We got a different exception type, so log it and re-raise for python to handle normally
                logging.error('Encountered error: %s on %s: %s', type(e), task, e)
                raise
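
`prepare_forward_inputs` is not shown here; presumably it turns the numpy batch produced by `inputs.as_numpy_iterator()` into torch tensors on the model's device. A hypothetical sketch of such a helper, not the original implementation:

import torch

def prepare_forward_inputs_sketch(model, batch_inputs):
    """Hypothetical helper: move a dict of numpy arrays onto the model's device as long tensors."""
    device = next(model.parameters()).device
    return {
        'input_ids': torch.as_tensor(batch_inputs['input_ids'], dtype=torch.long, device=device),
        'attention_mask': torch.as_tensor(batch_inputs['attention_mask'], dtype=torch.long, device=device),
    }
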
Example 6
def train_model(df: pd.DataFrame, tokenizer: transformers.PreTrainedTokenizer,
                model: transformers.PreTrainedModel, steps: int, batch_size: int, save_path: str) -> None:

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    #device = torch.device('cpu')
    model.to(device)
    model.train()

    optim = AdamW(model.parameters(), lr=5e-5)

    losses = []
    for step in trange(steps):
    
        optim.zero_grad()

        sample = df.sample(batch_size)

        X = sample['articles'].tolist()
        y = sample['labels'].tolist()
        
        inputs = tokenizer(X, return_tensors='pt', padding=True, truncation=True)
        
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = torch.tensor(y).unsqueeze(1).to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        losses.append(loss.item())  # store the scalar loss, not the graph-holding tensor
        
        if (step + 1) % 100 == 0:
            print(f'Step: {step + 1} Loss: {sum(losses)/len(losses)}')
            send_message(f'Step: {step + 1} Loss: {sum(losses)/len(losses)}')
            losses = []
            
        loss.backward()
        optim.step()

    model.save_pretrained(save_path)
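
A usage sketch for `train_model`; the DataFrame contents, checkpoint name, and save path are placeholders, and `send_message` must be defined elsewhere for the logging branch at step multiples of 100 to work:

import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer

df = pd.DataFrame({
    'articles': ['first example article', 'second example article'],
    'labels': [0, 1],
})
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

train_model(df, tokenizer, model, steps=10, batch_size=2, save_path='./finetuned-model')
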
Example 7
 def __init__(
     self,
     tokenizer: PreTrainedTokenizer,
     model: PreTrainedModel,
     device: str = "cpu",
     batch_size: int = 1,
     normalize: bool = False,
 ):
     # Load pre-trained model tokenizer (vocabulary)
     self.tokenizer = tokenizer
     self.device = torch.device(device)
     self.model = model.to(self.device).eval()
     self.batch_size = batch_size
     self.normalize = normalize
     self._loss_fn = CrossEntropyLoss(ignore_index=-1)
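
The `ignore_index=-1` loss function means positions that should not contribute to the score can simply carry a target id of -1. A tiny standalone illustration:

import torch
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss(ignore_index=-1)
logits = torch.randn(5, 10)                # 5 token positions, vocabulary of size 10
targets = torch.tensor([3, 7, -1, -1, 2])  # -1 marks positions excluded from the loss
loss = loss_fn(logits, targets)            # averaged over the three non-ignored positions
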
Example 8
 def __init__(
     self,
     model: PreTrainedModel,
     args: TrainingArguments,
     device,
     data_collator: Optional[DataCollator] = None,
     train_dataset: Optional[Dataset] = None,
     eval_dataset: Optional[Dataset] = None,
     compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
     prediction_loss_only=False,
     tb_writer: Optional["SummaryWriter"] = None,
     optimizers: Tuple[torch.optim.Optimizer,
                       torch.optim.lr_scheduler.LambdaLR] = None,
 ):
     """
     Trainer is a simple but feature-complete training and eval loop for PyTorch,
     optimized for Transformers.
     Args:
         prediction_loss_only:
             (Optional) in evaluation and prediction, only return the loss
     """
     self.model = model.to(device)
     self.args = args
     self.device = device
     if data_collator is not None:
         self.data_collator = data_collator
     else:
         self.data_collator = DefaultDataCollator()
     self.train_dataset = train_dataset
     self.eval_dataset = eval_dataset
     self.compute_metrics = compute_metrics
     self.prediction_loss_only = prediction_loss_only
     self.optimizers = optimizers
     if tb_writer is not None:
         self.tb_writer = tb_writer
     elif is_tensorboard_available() and self.is_world_master():
         self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
     if not is_tensorboard_available():
         logger.warning(
             "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
         )
     self._setup_wandb()
     set_seed(self.args.seed)
     # Create output directory if needed
     if self.is_world_master():
         os.makedirs(self.args.output_dir, exist_ok=True)
Example 9
 def __init__(
     self,
     tokenizer: PreTrainedTokenizer,
     model: PreTrainedModel,
     device: str = "cpu",
     batch_size: int = 32,
     add_special_tokens: bool = False,
     normalize: bool = False,
     max_token_limit: int = 128,
 ):
     # Load pre-trained model tokenizer (vocabulary)
     self.tokenizer = tokenizer
     self.device = torch.device(device)
     self.model = model.to(self.device).eval()
     self.batch_size = batch_size
     self.normalize = normalize
     self._add_special_tokens = add_special_tokens
     self.max_token_limit = max_token_limit
Example 10
 def __init__(
     self,
     transformer: PreTrainedModel,
     tokenizer: PreTrainedTokenizer,
     do_cls_tokens: bool = False,
     do_max_tokens: bool = False,
     do_sqrt_tokens: bool = False,
     do_mean_tokens: bool = True,
     device: str = 'cpu',
 ):
     super(SentenceEncoder, self).__init__()
     self.device = torch.device(device)
     self.transformer = transformer.to(self.device)
     self.pooling = SentenceEncoderPooling(
         do_cls_tokens=do_cls_tokens,
         do_max_tokens=do_max_tokens,
         do_sqrt_tokens=do_sqrt_tokens,
         do_mean_tokens=do_mean_tokens,
         hidden_size=transformer.config.hidden_size,
     )
     self.tokenizer = tokenizer
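
`SentenceEncoderPooling` is not shown; the flags suggest the usual CLS/max/mean pooling strategies over the transformer's token embeddings. A generic, mask-aware mean-pooling sketch, not the actual SentenceEncoderPooling implementation:

import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over the sequence, ignoring padding positions."""
    mask = attention_mask.unsqueeze(-1).float()    # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)  # (batch, hidden_size)
    counts = mask.sum(dim=1).clamp(min=1e-9)       # avoid division by zero for empty sequences
    return summed / counts
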
Example 11
    def __init__(
            self,
            model: transformers.PreTrainedModel,
            tokenizer: transformers.PreTrainedTokenizerFast,
            model_name: str,
            collect_activations_flag: Optional[bool] = False,
            collect_activations_layer_nums: Optional[
                List[int]] = None,  # None --> collect for all layers
            verbose: Optional[bool] = True,
            gpu: Optional[bool] = True):
        """
        Creates an LM object given a model and tokenizer.

        Args:
            model: Hugging Face Transformers PyTorch language model.
            tokenizer: The tokenizer associated with the model.
            model_name: The name of the model. Used to retrieve required settings (like what the embedding layer is called)
            collect_activations_flag: True if we want to collect activations
            collect_activations_layer_nums: If collecting activations, we can use this parameter to indicate which layers
                to track. By default this would be None and we'd collect activations for all layers.
            verbose: If True, model.generate() displays output tokens in HTML as they're generated.
            gpu: Set to False to force using the CPU even if a GPU exists.
        """
        self.model_name = model_name
        self.model = model
        if torch.cuda.is_available() and gpu:
            self.model = model.to('cuda')

        self.device = ('cuda' if torch.cuda.is_available()
                       and self.model.device.type == 'cuda' else 'cpu')

        self.tokenizer = tokenizer
        self.verbose = verbose
        self._path = os.path.dirname(ecco.__file__)

        # Neuron Activation
        self.collect_activations_flag = collect_activations_flag
        self.collect_activations_layer_nums = collect_activations_layer_nums

        # For each model, this indicates the layer whose activations
        # we will collect
        with open(os.path.join(self._path, "model-config.yaml")) as f:
            configs = yaml.safe_load(f)

        try:
            self.model_config = configs[self.model_name]
            embeddings_layer_name = self.model_config['embedding']
            embed_retriever = attrgetter(embeddings_layer_name)
            self.model_embeddings = embed_retriever(self.model)
            self.collect_activations_layer_name_sig = self.model_config[
                'activations'][0]
        except KeyError as e:
            raise ValueError(
                f"The model '{self.model_name}' is not defined in Ecco's 'model-config.yaml' file and"
                f" so is not explicitly supported yet. Supported models are: {list(configs.keys())}"
            ) from e

        self._hooks = {}
        self._reset()
        self._attach_hooks(self.model)
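
`attrgetter` lets the yaml entry refer to the embedding layer by a dotted attribute path. A standalone illustration with GPT-2, where 'transformer.wte' is the token embedding of GPT2LMHeadModel (ecco's own yaml entries may differ):

from operator import attrgetter
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained('gpt2')
embeddings = attrgetter('transformer.wte')(model)  # equivalent to model.transformer.wte
print(embeddings)                                  # Embedding(50257, 768)
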
Example 12
    def train(self,
              model: transformers.PreTrainedModel,
              training_tasks: typing.List[Task],
              validation_tasks: typing.List[Task],
              num_epochs: int,
              batch_size: int,
              steps_per_epoch: int,
              prefetch_size: int,
              eval_batch_size: typing.Optional[int] = None,
              eval_batches: typing.Optional[int] = None,
              checkpoint_file: typing.Optional[str] = None) -> None:
        logging.info('Preparing kitchen sink with %d training tasks: %s', len(training_tasks), training_tasks)

        # Train the model & return its training history
        logging.info('Beginning training...')
        training_data, data_sizes = self.load_train_data(training_tasks,
                                                         batch_size=batch_size,
                                                         prefetch_size=prefetch_size)

        if validation_tasks:
            logging.info('Preparing kitchen sink with %d validation tasks: %s', len(validation_tasks), validation_tasks)
            validation_data = self.load_valid_data(validation_tasks,
                                                   batch_size=eval_batch_size or batch_size,
                                                   prefetch_size=prefetch_size,
                                                   num_batches=eval_batches)
        else:
            validation_data = None
            logging.info('Preparing kitchen sink without validation')

        num_epochs += self.warmup_epochs
        optimizer, scheduler = get_optimizer(model,
                                             num_warmup_steps=self.warmup_epochs * steps_per_epoch,
                                             num_training_steps=num_epochs * steps_per_epoch)

        model.to(self.device)
        if self.use_amp:
            if not is_apex_available():
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

        global_step = 0
        tr_loss = 0.0
        logging_loss = 0.0
        model.zero_grad()
        train_itr = tqdm.trange(0, num_epochs * steps_per_epoch, desc="Training", unit="batch")
        tasks = [task.dataset for task in training_tasks]
        mixing_rates = self.get_mixing_rate(tasks, data_sizes)
        total_task_steps = Counter({task: np.float32(0.) for task in tasks})
        for epoch in range(1, num_epochs + 1):
            epoch_itr = tqdm.trange(0, steps_per_epoch, desc="Epoch %d" % epoch, leave=False, unit="batch")
            epoch_task_steps = Counter({task: np.float32(0.) for task in tasks})
            running_task_losses = {task: np.float32(0.) for task in tasks}
            for step in epoch_itr:
                inputs, labels, _ = next(np.random.choice(training_data, p=mixing_rates))
                step_loss = self._train_step(model, inputs, labels, optimizer)
                tr_loss += step_loss
                train_itr.update()
                task = inputs['task'][0].decode('UTF-8')
                epoch_task_steps[task] += 1
                running_task_losses[task] += step_loss

                if (step + 1) % self.gradient_accumulation_steps == 0 or (
                        # last step in epoch but step is always smaller than gradient_accumulation_steps
                        self.gradient_accumulation_steps >= steps_per_epoch == (step + 1)):
                    if self.use_amp:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)

                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

            total_tasks = sum(epoch_task_steps.values())

            print('Epoch %d: Empirical Mixing Rates: %s' % (
                epoch,
                '; '.join('{:s}: {:0>5.2f}%'.format(task, rate * 100. / total_tasks)
                          for task, rate in epoch_task_steps.items())
            ))

            print('Epoch %d: Expected Mixing Rates: %s' % (
                epoch,
                '; '.join('{:s}: {:0>5.2f}%'.format(task, rate * 100.)
                          for task, rate in zip(tasks, mixing_rates))
            ))

            mixing_losses = [loss / epoch_task_steps[task] for task, loss in running_task_losses.items()]
            print('Epoch %d: Training Losses: %s' % (
                epoch,
                '; '.join('{:s}: {:g}'.format(task, loss) for task, loss in zip(tasks, mixing_losses))
            ))

            if epoch > self.warmup_epochs:
                total_task_steps += epoch_task_steps
                exploration_ratios = np.array([total_task_steps.get(task, np.float32(0)) / size
                                               for task, size in zip(tasks, data_sizes)])
                print('Epoch %d: Exploration Ratios: %s' % (
                    epoch,
                    '; '.join('{:s}: {:0>5.2f}%'.format(task, ratio * 100.)
                              for task, ratio in zip(tasks, exploration_ratios))
                ))

                if not self.mix_from_validation:
                    avg_loss = np.nanmean(mixing_losses)
                    mixing_losses = [er * loss + (1. - er) * avg_loss
                                     for er, loss in zip(exploration_ratios, np.nan_to_num(mixing_losses))]

            valid_steps = 0
            running_valid_loss = 0.
            if validation_data:
                epoch_task_steps = {task: np.float32(0.) for task in tasks}
                running_task_losses = {task: np.float32(0.) for task in tasks}
                with torch.no_grad():
                    for step, (inputs, labels, _) in enumerate(validation_data.as_numpy_iterator(), 1):
                        model.eval()
                        # Run the forward pass
                        valid_step_loss = model(**self.prepare_forward_inputs(model, inputs, labels))[0].item()
                        running_valid_loss += valid_step_loss
                        valid_task = inputs['task'][0].decode('UTF-8')
                        if valid_task in tasks:
                            epoch_task_steps[valid_task] += 1
                            running_task_losses[valid_task] += valid_step_loss
                        valid_steps += 1

                avg_val_loss = running_valid_loss / valid_steps
                # Save checkpoint if validation loss decreases and checkpoint dir has been provided
                if checkpoint_file:
                    if epoch == 1:
                        best_val_loss = avg_val_loss
                        logging.info("Saving best model with initial validation loss {0})".format(best_val_loss))
                        self.save_model(model, "{0}_best".format(checkpoint_file))
                    else:
                        if avg_val_loss < best_val_loss:
                            best_val_loss = avg_val_loss
                            logging.info(
                                "Saving new best model with validation loss {0} (epoch {1})".format(best_val_loss,
                                                                                                    epoch))
                            self.save_model(model, "{0}_best".format(checkpoint_file))

                print('Epoch {:d}: Validation Losses: {:s}'.format(
                    epoch,
                    '; '.join('{:s}: {:g}'.format(task, loss / epoch_task_steps[task])
                              for task, loss in running_task_losses.items())
                ))

                if self.mix_from_validation:
                    mixing_losses = [loss / epoch_task_steps[task] for task, loss in running_task_losses.items()]

            if epoch > self.warmup_epochs and self.dynamic_mixing:
                new_mixing_rates = self.get_mixing_rate(
                    tasks=tasks,
                    rates=mixing_losses,
                    normalize=False,
                    temperature=(1. / self.temperature)
                )
                print('Epoch {:d}: Updating Mixing Rate: {:s}'.format(
                    epoch,
                    '; '.join(
                        '{:s}: {:0>5.2f}%->{:0>5.2f}% (Δ={:0>5.2f})'.format(
                            task,
                            old_rate * 100.,
                            smooth_rate * 100.,
                            (smooth_rate-old_rate) * 100.)
                        for task, old_rate, smooth_rate in zip(tasks, mixing_rates, new_mixing_rates))
                ))
                mixing_rates = new_mixing_rates
                logging.debug('Mixing rates (shape=%s; |tasks|=%d): %s', mixing_rates.shape, len(tasks), mixing_rates)

            lr = scheduler.get_last_lr()[0]
            loss_scalar = (tr_loss - logging_loss) / steps_per_epoch
            logging_loss = tr_loss
            train_itr.write('Global step: %d, lr: %g, loss: %g, val_loss: %g' % (
                global_step,
                lr,
                loss_scalar,
                running_valid_loss / valid_steps if valid_steps > 0 else np.nan))

            if not np.isfinite(loss_scalar):
                logging.info('Loss was NaN, ending training after %d epochs.', epoch)
                train_itr.close()
                return

        train_itr.close()
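
`get_mixing_rate` is not shown; the calls above suggest it turns per-task sizes (or losses) into a sampling distribution, optionally sharpened or flattened by a temperature. A hypothetical sketch of such a computation, not the original implementation:

import numpy as np

def mixing_rates_sketch(rates, temperature: float = 1.0) -> np.ndarray:
    """Hypothetical proportional mixing: temperature-scale the rates and normalize to a distribution."""
    scaled = np.asarray(rates, dtype=np.float64) ** temperature
    return scaled / scaled.sum()

# e.g. three tasks with 10k, 50k and 100k training examples, flattened with temperature < 1
print(mixing_rates_sketch([10_000, 50_000, 100_000], temperature=0.5))
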