Code Example #1
    def __init__(
        self,
        model: PreTrainedModel,
        args: TrainingArguments,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Dataset] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        prediction_loss_only=False,
        tb_writer: Optional["SummaryWriter"] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
    ):
        """
        Trainer is a simple but feature-complete training and eval loop for PyTorch,
        optimized for Transformers.

        Args:
            prediction_loss_only:
                (Optional) in evaluation and prediction, only return the loss
        """
        self.model = model.to(args.device)
        self.args = args
        self.data_collator = data_collator if data_collator is not None else default_data_collator
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.compute_metrics = compute_metrics
        self.prediction_loss_only = prediction_loss_only
        self.optimizers = optimizers
        if tb_writer is not None:
            self.tb_writer = tb_writer
        elif is_tensorboard_available() and self.is_world_master():
            self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
        else:
            # without this fallback, `self.tb_writer` would be left undefined when
            # TensorBoard is unavailable or on non-master processes
            self.tb_writer = None
        if not is_tensorboard_available():
            logger.warning(
                "You are instantiating a Trainer but TensorBoard is not installed. You should consider installing it."
            )
        if is_wandb_available():
            self._setup_wandb()
        else:
            logger.info(
                "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
                "run `pip install wandb; wandb login` and see https://docs.wandb.com/huggingface."
            )
        set_seed(self.args.seed)
        # Create output directory if needed
        if self.is_world_master():
            os.makedirs(self.args.output_dir, exist_ok=True)
        if is_torch_tpu_available():
            # Set an xla_device flag on the model's config.
            # We'll find a more elegant way to avoid needing to do this in the future.
            self.model.config.xla_device = True
        if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
            self.data_collator = self.data_collator.collate_batch
            warnings.warn(
                (
                    "The `data_collator` should now be a simple callable (function, class with `__call__`), classes "
                    + "with a `collate_batch` are deprecated and won't be supported in a future version."
                ),
                FutureWarning,
            )
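For context, a minimal sketch of how a constructor with this signature might be called. The checkpoint name and toy dataset below are placeholders added for illustration; only keyword arguments that appear in the signature above are used.

import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

class ToyDataset(Dataset):
    """Tiny stand-in dataset of tokenized sentences with binary labels."""
    def __init__(self, tokenizer):
        self.encodings = tokenizer(["great movie", "terrible movie"], truncation=True, padding=True)
        self.labels = [1, 0]
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
args = TrainingArguments(output_dir="./out", num_train_epochs=1)
trainer = Trainer(model=model, args=args, train_dataset=ToyDataset(tokenizer), eval_dataset=ToyDataset(tokenizer))
trainer.train()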
Code Example #2
File: distilbert.py  Project: sfschouten/court-of-xai
    def from_huggingface_model(cls, model: PreTrainedModel,
                               ffn_activation: str, ffn_dropout: float,
                               attention: Attention):
        config = model.config
        encoder = cls(n_layers=config.n_layers,
                      n_heads=config.n_heads,
                      dim=config.dim,
                      hidden_dim=config.hidden_dim,
                      ffn_activation=ffn_activation,
                      ffn_dropout=ffn_dropout,
                      attention=attention)
        # After creating the encoder, we copy weights over from the transformer.  This currently
        # requires that the internal structure of the text side of this encoder *exactly matches*
        # the internal structure of whatever transformer you're using.
        encoder_parameters = dict(encoder.named_parameters())
        for name, parameter in model.named_parameters():
            if name.startswith("transformer."):
                name = name.replace("LayerNorm", "layer_norm")
                if name not in encoder_parameters:
                    raise ValueError(
                        f"Couldn't find a matching parameter for {name}. Is this transformer "
                        "compatible with the joint encoder you're using?")
                encoder_parameters[name].data.copy_(parameter.data)

        return encoder
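The copy step above relies on two modules exposing identical `named_parameters()` keys. A minimal self-contained sketch of the same technique on toy modules (the modules and names here are made up for illustration and are not part of the project):

import torch
import torch.nn as nn

# "source" plays the role of the pretrained transformer, "target" the freshly
# built encoder; both must have exactly the same internal structure.
source = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
target = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))

target_parameters = dict(target.named_parameters())
for name, parameter in source.named_parameters():
    if name not in target_parameters:
        raise ValueError(f"Couldn't find a matching parameter for {name}.")
    # in-place copy, as in the loop above
    target_parameters[name].data.copy_(parameter.data)

assert torch.equal(source[0].weight, target[0].weight)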
Code Example #3
 def __init__(
     self,
     model: PreTrainedModel,
     args: TrainingArguments,
     data_collator: Optional[DataCollator] = None,
     train_dataset: Optional[Dataset] = None,
     eval_dataset: Optional[Dataset] = None,
     compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
     prediction_loss_only=False,
     optimizers: Tuple[torch.optim.Optimizer,
                       torch.optim.lr_scheduler.LambdaLR] = None,
 ):
     self.model = model.to(args.device)
     self.fc = nn.Linear(768, 1839).to(args.device)  # extra linear head mapping 768-dim hidden states to 1839 output classes
     self.args = args
     self.data_collator = data_collator if data_collator is not None else default_data_collator
     self.train_dataset = train_dataset
     self.eval_dataset = eval_dataset
     self.compute_metrics = compute_metrics
     self.prediction_loss_only = prediction_loss_only
     self.optimizers = optimizers
     set_seed(self.args.seed)
Code Example #4
def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
    filename = "imagenet-1k-id2label.json"
    num_labels = 1000

    repo_id = "datasets/huggingface/label-files"
    num_labels = num_labels
    id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
    id2label = {int(k): v for k, v in id2label.items()}

    id2label = id2label
    label2id = {v: k for k, v in id2label.items()}

    ImageNetPreTrainedConfig = partial(RegNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)

    names_to_config = {
        "regnet-y-10b-seer": ImageNetPreTrainedConfig(
            depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010
        ),
        # finetuned on imagenet
        "regnet-y-10b-seer-in1k": ImageNetPreTrainedConfig(
            depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010
        ),
    }

    # add seer weights logic
    def load_using_classy_vision(checkpoint_url: str) -> Tuple[Dict, Dict]:
        files = torch.hub.load_state_dict_from_url(checkpoint_url, model_dir=str(save_directory), map_location="cpu")
        # the checkpoint stores the trunk and the classification heads separately; return both
        model_state_dict = files["classy_state_dict"]["base_model"]["model"]
        return model_state_dict["trunk"], model_state_dict["heads"]

    names_to_from_model = {
        "regnet-y-10b-seer": partial(
            load_using_classy_vision,
            "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet10B/model_iteration124500_conso.torch",
        ),
        "regnet-y-10b-seer-in1k": partial(
            load_using_classy_vision,
            "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_10b_finetuned_in1k_model_phase28_conso.torch",
        ),
    }

    from_to_ours_keys = get_from_to_our_keys(model_name)

    if not (save_directory / f"{model_name}.pth").exists():
        logger.info("Loading original state_dict.")
        from_state_dict_trunk, from_state_dict_head = names_to_from_model[model_name]()
        from_state_dict = from_state_dict_trunk
        if "in1k" in model_name:
            # add the head
            from_state_dict = {**from_state_dict_trunk, **from_state_dict_head}
        logger.info("Done!")

        converted_state_dict = {}

        not_used_keys = list(from_state_dict.keys())
        regex = r"\.block.-part."
        # the original checkpoints have a `block[0,1]-part` fragment in each key name; we strip it here
        for key in from_state_dict.keys():
            # remove the "block[0,1]-part" fragment from the key
            src_key = re.sub(regex, "", key)
            # src_key now matches the key obtained by tracing the original model, so use it to look up the destination key
            dest_key = from_to_ours_keys[src_key]
            # store the parameter with our key
            converted_state_dict[dest_key] = from_state_dict[key]
            not_used_keys.remove(key)
        # check that all keys have been updated
        assert len(not_used_keys) == 0, f"Some keys were not used: {','.join(not_used_keys)}"

        logger.info("All original keys were consumed during the conversion.")

        # save our state dict to disk
        torch.save(converted_state_dict, save_directory / f"{model_name}.pth")

        del converted_state_dict
    else:
        logger.info("The state_dict was already stored on disk.")
    if push_to_hub:
        logger.info(f"Token is {os.environ['HF_TOKEN']}")
        logger.info("Loading our model.")
        # create our model
        our_config = names_to_config[model_name]
        our_model_func = RegNetModel
        if "in1k" in model_name:
            our_model_func = RegNetForImageClassification
        our_model = our_model_func(our_config)
        # move our model to the meta device (so no weights are materialized in memory)
        our_model.to(torch.device("meta"))
        logger.info("Loading state_dict in our model.")
        # load state dict
        state_dict_keys = our_model.state_dict().keys()
        PreTrainedModel._load_pretrained_model_low_mem(
            our_model, state_dict_keys, [save_directory / f"{model_name}.pth"]
        )
        logger.info("Finally, pushing!")
        # push it to hub
        our_model.push_to_hub(
            repo_path_or_name=save_directory / model_name,
            commit_message="Add model",
            output_dir=save_directory / model_name,
        )
        size = 384
        # we can use the convnext one
        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k", size=size)
        feature_extractor.push_to_hub(
            repo_path_or_name=save_directory / model_name,
            commit_message="Add feature extractor",
            output_dir=save_directory / model_name,
        )
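The conversion loop above follows a generic pattern: normalize each source key with `re.sub`, look the result up in a source-to-destination mapping, and build a new state dict. A minimal self-contained sketch of that pattern on made-up keys (the regex and key names are illustrative only, not the real checkpoint layout):

import re
import torch

# Made-up source state dict and key mapping, for illustration only.
from_state_dict = {"stem.block1-part.conv.weight": torch.zeros(1)}
from_to_ours_keys = {"stem.conv.weight": "embedder.conv.weight"}

converted_state_dict = {}
for key, value in from_state_dict.items():
    # strip the "block<N>-part." fragment, then map to our naming scheme
    src_key = re.sub(r"block.-part\.", "", key)
    converted_state_dict[from_to_ours_keys[src_key]] = value

print(list(converted_state_dict))  # ['embedder.conv.weight']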
Code Example #5
def train(model: PreTrainedModel,
          train_dataloader: DataLoader,
          dev_dataloader: DataLoader,
          batch_size: int,
          gradient_accumulation_steps: int,
          device,
          num_train_epochs: int = 20,
          warmup_proportion: float = 0.1,
          learning_rate: float = 1e-5,
          patience: int = 5,
          output_dir: str = "/tmp/",
          model_file_name: str = "model.bin") -> str:
    """
    Trains a BERT model on a set of training data, tuning it on a set of development data.

    Args:
        model: the model that will be trained
        train_dataloader: a DataLoader with training data
        dev_dataloader: a DataLoader with development data (for early stopping)
        batch_size: the batch size for training
        gradient_accumulation_steps: the number of steps that gradients will be accumulated
        device: the device where training will take place ("cpu" or "cuda")
        num_train_epochs: the maximum number of training epochs
        warmup_proportion: the proportion of training steps for which the learning rate will be warmed up
        learning_rate: the initial learning rate
        patience: the number of epochs after which training will stop if no improvement on the dev
                  set is observed
        output_dir: the directory where the model will be saved
        model_file_name: the filename of the model file

    Returns: the path to the trained model

    """
    def warmup_linear(x, warmup=0.002):
        # linear warmup to 1.0 over the first `warmup` fraction of training,
        # then linear decay towards 0.0 over the remaining steps
        if x < warmup:
            return x / warmup
        return 1.0 - x

    output_model_file = os.path.join(output_dir, model_file_name)

    num_train_steps = int(
        len(train_dataloader.dataset) / batch_size /
        gradient_accumulation_steps * num_train_epochs)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=learning_rate,
                      correct_bias=False)

    global_step = 0
    loss_history = []
    best_epoch = 0
    for epoch in trange(int(num_train_epochs), desc="Epoch"):

        model.train()
        tr_loss = 0
        for step, batch in enumerate(
                tqdm(train_dataloader, desc="Training iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            if type(model) == BertForSequenceClassification or type(
                    model) == BertForMultiLabelSequenceClassification:
                outputs = model(input_ids,
                                attention_mask=input_mask,
                                token_type_ids=segment_ids,
                                labels=label_ids)
            elif type(model) == DistilBertForSequenceClassification:
                outputs = model(input_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
            loss = outputs[0]

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                lr_this_step = learning_rate * warmup_linear(
                    global_step / num_train_steps, warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        dev_loss, _, _ = evaluate(model, dev_dataloader, device)

        print("Loss history:", loss_history)
        print("Dev loss:", dev_loss)

        if len(loss_history) == 0 or dev_loss < min(loss_history):
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)
            best_epoch = epoch

        if epoch - best_epoch >= patience:
            print("No improvement on development set. Finish training.")
            break

        loss_history.append(dev_loss)

    return output_model_file
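The manual learning-rate schedule above (the `warmup_linear` helper plus updating `param_group['lr']` by hand) can also be expressed with `get_linear_schedule_with_warmup` from `transformers`. A minimal sketch under that assumption; the toy model and step counts are placeholders:

import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)           # placeholder for the fine-tuned transformer
num_train_steps = 1000
warmup_proportion = 0.1

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# linear warmup over the first 10% of steps, then linear decay to zero
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(warmup_proportion * num_train_steps),
    num_training_steps=num_train_steps,
)

for step in range(num_train_steps):
    # ... forward/backward pass would go here ...
    optimizer.step()
    scheduler.step()        # advance the learning rate after each optimizer step
    optimizer.zero_grad()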
Code Example #6
def evaluate(model: PreTrainedModel, dataloader: DataLoader,
             device: str) -> Tuple[float, np.ndarray, np.ndarray]:
    """
    Evaluates a Bert Model on a labelled data set.

    Args:
        model: the BertModel to be evaluated
        dataloader: the DataLoader with the test data
        device: the device where evaluation will take place ("cpu" or "cuda")

    Returns: a tuple with (the evaluation loss, a list with the correct labels,
            and a list with the predicted labels)

    """

    model.eval()

    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader,
                                      desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            if type(model) == BertForSequenceClassification or type(
                    model) == BertForMultiLabelSequenceClassification:
                tmp_eval_loss, logits = model(input_ids,
                                              attention_mask=input_mask,
                                              token_type_ids=segment_ids,
                                              labels=label_ids)
            elif type(model) == DistilBertForSequenceClassification:
                tmp_eval_loss, logits = model(input_ids,
                                              attention_mask=input_mask,
                                              labels=label_ids)

        if type(model) == BertForSequenceClassification or type(
                model) == DistilBertForSequenceClassification:
            outputs = np.argmax(logits.to('cpu'), axis=1)
            label_ids = label_ids.to('cpu').numpy()
            predicted_labels += list(outputs)

        elif type(model) == BertForMultiLabelSequenceClassification:
            sig = Sigmoid()
            outputs = sig(logits).to('cpu').numpy()
            label_ids = label_ids.to('cpu').numpy()
            predicted_labels += list(outputs >= 0.5)

        correct_labels += list(label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels
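The `(eval_loss, correct_labels, predicted_labels)` tuple returned by `evaluate` can be summarized with scikit-learn. A minimal sketch for the single-label case, using made-up arrays in place of real evaluation output:

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# placeholder arrays standing in for the output of evaluate(...)
correct_labels = np.array([0, 1, 2, 1, 0])
predicted_labels = np.array([0, 1, 1, 1, 0])

accuracy = accuracy_score(correct_labels, predicted_labels)
macro_f1 = f1_score(correct_labels, predicted_labels, average="macro")
print(f"accuracy={accuracy:.3f}  macro-F1={macro_f1:.3f}")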