Example #1
    @classmethod
    def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
        """
        Load a language model either by supplying

        * the name of a remote model on s3 ("albert-base" ...)
        * or a local path of a model trained via transformers ("some_dir/huggingface_model")
        * or a local path of a model trained via FARM ("some_dir/farm_model")

        :param pretrained_model_name_or_path: name or path of a model
        :param language: (Optional) Name of language the model was trained for (e.g. "german").
                         If not supplied, FARM will try to infer it from the model name.
        :return: Language Model

        """
        albert = cls()
        if "farm_lm_name" in kwargs:
            albert.name = kwargs["farm_lm_name"]
        else:
            albert.name = pretrained_model_name_or_path
        # We need to differentiate between loading a model in FARM format and in Hugging Face Transformers format
        farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
        if os.path.exists(farm_lm_config):
            # FARM style
            config = AlbertConfig.from_pretrained(farm_lm_config)
            farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
            albert.model = AlbertModel.from_pretrained(farm_lm_model, config=config, **kwargs)
            albert.language = albert.model.config.language
        else:
            # Hugging Face Transformers style
            albert.model = AlbertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
            albert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path)
        return albert
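A minimal usage sketch for the loader above. The wrapper class name (here Albert) and the example paths are illustrative placeholders, not taken from the original code:

# Hypothetical usage of the classmethod above; "Albert" stands in for the wrapper class.
albert_lm = Albert.load("albert-base-v2")                            # remote Hugging Face checkpoint
albert_lm = Albert.load("some_dir/farm_model", language="english")   # local FARM-style directory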
Example #2
    def __init__(self, config, args):
        super().__init__(config)
        self.args = args

        if args.bert_model == "albert-base-v2":
            bert = AlbertModel.from_pretrained(args.bert_model)
        elif args.bert_model in ("emilyalsentzer/Bio_ClinicalBERT",
                                 "bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12"):
            bert = AutoModel.from_pretrained(args.bert_model)
        elif args.bert_model == "bert-small-scratch":
            # Randomly initialized model with the bert-small architecture (trained from scratch)
            config = BertConfig.from_pretrained(
                "google/bert_uncased_L-4_H-512_A-8")
            bert = BertModel(config)
        elif args.bert_model == "bert-base-scratch":
            # Randomly initialized model with the bert-base architecture (trained from scratch)
            config = BertConfig.from_pretrained("bert-base-uncased")
            bert = BertModel(config)
        else:
            bert = BertModel.from_pretrained(
                args.bert_model)  # bert-base-uncased, small, tiny

        self.txt_embeddings = bert.embeddings
        self.img_embeddings = ImageBertEmbeddings(args, self.txt_embeddings)

        if args.img_encoder == 'ViT':
            img_size = args.img_size
            patch_sz = 32 if img_size == 512 else 16
            self.img_encoder = Img_patch_embedding(image_size=img_size,
                                                   patch_size=patch_sz,
                                                   dim=2048)
        else:
            self.img_encoder = ImageEncoder_cnn(args)
            # Freeze the CNN encoder, then unfreeze its later blocks for fine-tuning
            for p in self.img_encoder.parameters():
                p.requires_grad = False
            for c in list(self.img_encoder.children())[5:]:
                for p in c.parameters():
                    p.requires_grad = True

        self.encoder = bert.encoder
        self.pooler = bert.pooler
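A hedged construction sketch for the module above. MultimodalBertEncoder is an assumed name for the enclosing class (the listing does not show it), and the args fields are illustrative only:

from argparse import Namespace
from transformers import BertConfig

# Illustrative arguments; the real training script defines many more fields on args.
args = Namespace(bert_model="bert-base-uncased", img_encoder="ViT", img_size=512)
config = BertConfig.from_pretrained(args.bert_model)
model = MultimodalBertEncoder(config, args)  # assumed class name, shown for illustration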
Example #3
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model: str = None,
                 requires_grad: bool = True,
                 transformer_weights_model: str = None,
                 num_labels: int = 2,
                 predictions_file=None,
                 layer_freeze_regexes: List[str] = None,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._predictions = []

        self._pretrained_model = pretrained_model

        if 't5' in pretrained_model:
            self._padding_value = 0  # The index of the T5 padding token
            if transformer_weights_model:  # Override weights from a saved archive
                logging.info(f"Loading Transformer weights model from {transformer_weights_model}")
                transformer_model_loaded = load_archive(transformer_weights_model)
                self._transformer_model = transformer_model_loaded.model._transformer_model
            else:
                self._transformer_model = T5Model.from_pretrained(pretrained_model)
            self._dropout = torch.nn.Dropout(self._transformer_model.config.dropout_rate)  # T5 config exposes dropout_rate
        elif 'roberta' in pretrained_model:
            self._padding_value = 1  # The index of the RoBERTa padding token
            if transformer_weights_model:  # Override for RoBERTa only for now
                logging.info(f"Loading Transformer weights model from {transformer_weights_model}")
                transformer_model_loaded = load_archive(transformer_weights_model)
                self._transformer_model = transformer_model_loaded.model._transformer_model
            else:
                self._transformer_model = RobertaModel.from_pretrained(pretrained_model)
            self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
        elif 'xlnet' in pretrained_model:
            self._padding_value = 5  # The index of the XLNet padding token
            self._transformer_model = XLNetModel.from_pretrained(pretrained_model)
            self.sequence_summary = SequenceSummary(self._transformer_model.config)
        elif 'albert' in pretrained_model:
            self._transformer_model = AlbertModel.from_pretrained(pretrained_model)
            self._padding_value = 0  # The index of the ALBERT padding token
            self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
        elif 'bert' in pretrained_model:
            self._transformer_model = BertModel.from_pretrained(pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
            self._dropout = torch.nn.Dropout(self._transformer_model.config.hidden_dropout_prob)
        else:
            raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

        for name, param in self._transformer_model.named_parameters():
            if layer_freeze_regexes and requires_grad:
                grad = not any(re.search(r, name) for r in layer_freeze_regexes)
            else:
                grad = requires_grad
            param.requires_grad = grad

        transformer_config = self._transformer_model.config
        transformer_config.num_labels = num_labels
        self._output_dim = self._transformer_model.config.hidden_size

        # unifying the classification layer across all models
        self._classifier = Linear(self._output_dim, num_labels)
        self._classifier.weight.data.normal_(mean=0.0, std=0.02)
        self._classifier.bias.data.zero_()

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()

        self._debug = -1
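The layer-freezing rule above can be exercised on its own. A self-contained sketch, assuming a plain Hugging Face BertModel and illustrative regex patterns:

import re
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")
layer_freeze_regexes = ["embeddings", r"encoder\.layer\.[0-5]\."]  # illustrative patterns

for name, param in model.named_parameters():
    # A parameter stays trainable only when no freeze pattern matches its name.
    param.requires_grad = not any(re.search(r, name) for r in layer_freeze_regexes)

print(sum(p.numel() for p in model.parameters() if p.requires_grad), "trainable parameters")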
Example #4
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model: str = None,
                 requires_grad: bool = True,
                 probe_type: str = None,
                 layer_freeze_regexes: List[str] = None,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._pretrained_model = pretrained_model
        if 'roberta' in pretrained_model:
            self._padding_value = 1  # The index of the RoBERTa padding token
            self._transformer_model = RobertaModel.from_pretrained(
                pretrained_model)
            self._dropout = torch.nn.Dropout(
                self._transformer_model.config.hidden_dropout_prob)
        elif 'xlnet' in pretrained_model:
            self._padding_value = 5  # The index of the XLNet padding token
            self._transformer_model = XLNetModel.from_pretrained(
                pretrained_model)
            self.sequence_summary = SequenceSummary(
                self._transformer_model.config)
        elif 'albert' in pretrained_model:
            self._transformer_model = AlbertModel.from_pretrained(
                pretrained_model)
            self._padding_value = 0  # The index of the ALBERT padding token
            self._dropout = torch.nn.Dropout(
                self._transformer_model.config.hidden_dropout_prob)
        elif 'bert' in pretrained_model:
            self._transformer_model = BertModel.from_pretrained(
                pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
            self._dropout = torch.nn.Dropout(
                self._transformer_model.config.hidden_dropout_prob)
        else:
            raise ValueError(f"Unsupported pretrained model: {pretrained_model}")

        if probe_type == 'MLP':
            layer_freeze_regexes = ["embeddings", "encoder"]

        for name, param in self._transformer_model.named_parameters():
            if layer_freeze_regexes and requires_grad:
                grad = not any(re.search(r, name) for r in layer_freeze_regexes)
            else:
                grad = requires_grad
            param.requires_grad = grad

        transformer_config = self._transformer_model.config
        transformer_config.num_labels = 1
        self._output_dim = self._transformer_model.config.hidden_size

        # unifying the classification layer across all models
        self._classifier = Linear(self._output_dim, 1)
        self._classifier.weight.data.normal_(mean=0.0, std=0.02)
        self._classifier.bias.data.zero_()

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
        self._debug = 2
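The hard-coded padding indices used in both probes (1 for RoBERTa, 5 for XLNet, 0 for BERT/ALBERT) can be read off the tokenizers instead of being memorised. A small check, assuming the standard checkpoints:

from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer

print(RobertaTokenizer.from_pretrained("roberta-base").pad_token_id)    # 1
print(XLNetTokenizer.from_pretrained("xlnet-base-cased").pad_token_id)  # 5
print(BertTokenizer.from_pretrained("bert-base-uncased").pad_token_id)  # 0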
Example #5
def wsd(
    model_name='bert-base-uncased',  #ensemble-distil-1-albert-1 / albert-xxlarge-v2 / bert-base-uncased
    classifier_input='token-embedding-last-1-layers',  # token-embedding-last-layer / token-embedding-last-n-layers
    classifier_hidden_layers=[],
    reduce_options=True,
    freeze_base_model=True,
    max_len=512,
    batch_size=32,
    test=False,
    lr=5e-5,
    eps=1e-8,
    n_epochs=50,
    cls_token=False,  # If true, the cls token is used instead of the relevant-word token
    cache_embeddings=False,  # If true, the embeddings from the base model are saved to disk so that they only need to be computed once
    save_classifier=True  # If true, the classifier part of the network is saved after each epoch, and the training is automatically resumed from this saved network if it exists
):
    train_path = "wsd_train.txt"
    test_path = "wsd_test_blind.txt"
    n_classes = 222
    device = 'cuda'

    import __main__ as main
    print("Script: " + os.path.basename(main.__file__))

    print("Loading base model %s..." % model_name)
    if model_name.startswith('ensemble-distil-'):
        last_n_distil = int(model_name.replace('ensemble-distil-', "")[0])
        last_n_albert = int(model_name[-1])
        from transformers import AlbertModel, AlbertTokenizer
        base_model = AlbertModel.from_pretrained('albert-xxlarge-v2',
                                                 output_hidden_states=True,
                                                 output_attentions=False)
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        print(
            "Ensemble model with DistilBert last %d layers and Albert last %d layers"
            % (last_n_distil, last_n_albert))
    elif model_name.startswith('distilbert'):
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        base_model = DistilBertModel.from_pretrained(model_name,
                                                     num_labels=n_classes,
                                                     output_hidden_states=True,
                                                     output_attentions=False)
    elif model_name.startswith('bert'):
        from transformers import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained(model_name)
        base_model = BertModel.from_pretrained(model_name,
                                               num_labels=n_classes,
                                               output_hidden_states=True,
                                               output_attentions=False)
    elif model_name.startswith('albert'):
        from transformers import AlbertModel, AlbertTokenizer
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        base_model = AlbertModel.from_pretrained(model_name,
                                                 output_hidden_states=True,
                                                 output_attentions=False)
    else:
        raise ValueError("Unsupported model_name: %s" % model_name)

    use_n_last_layers = 1
    if classifier_input == 'token-embedding-last-layer':
        use_n_last_layers = 1
    elif classifier_input.startswith(
            'token-embedding-last-') and classifier_input.endswith('-layers'):
        use_n_last_layers = int(
            classifier_input.replace('token-embedding-last-',
                                     "").replace('-layers', ""))
    else:
        raise ValueError("Invalid classifier_input argument")
    print("Using the last %d layers" % use_n_last_layers)

    def tokenize(text):
        # Reserve two positions for the special tokens added by the Field (cls/sep)
        return tokenizer.tokenize(text)[:max_len - 2]

    SENSE = LabelField(is_target=True)
    LEMMA = LabelField()
    TOKEN_POS = LabelField(use_vocab=False)
    TEXT = Field(tokenize=tokenize,
                 pad_token=tokenizer.pad_token,
                 init_token=tokenizer.cls_token,
                 eos_token=tokenizer.sep_token)
    EXAMPLE_ID = LabelField(use_vocab=False)
    fields = [('sense', SENSE), ('lemma', LEMMA), ('token_pos', TOKEN_POS),
              ('text', TEXT), ('example_id', EXAMPLE_ID)]

    def read_data(corpus_file, fields, max_len=None):
        train_id_start = 0
        test_id_start = 76049  # let the ids for the test examples start after the training example indices
        if corpus_file == "wsd_test_blind.txt":
            print("Loading test data...")
            id_start = test_id_start
        else:
            print("Loading train/val data...")
            id_start = train_id_start
        with open(corpus_file, encoding='utf-8') as f:
            examples = []
            for i, line in enumerate(f):
                sense, lemma, word_position, text = line.split('\t')
                # We need to convert from the word position to the token position
                words = text.split()
                pre_word = " ".join(words[:int(word_position)])
                pre_word_tokenized = tokenizer.tokenize(pre_word)
                token_position = len(
                    pre_word_tokenized
                ) + 1  # taking into account the later addition of the start token
                example_id = id_start + i
                if max_len is None or token_position < max_len - 1:  # ignore examples where the relevant token is cut off due to max_len
                    if cls_token:
                        token_position = 0
                    examples.append(
                        Example.fromlist(
                            [sense, lemma, token_position, text, example_id],
                            fields))
                else:
                    print(
                        "Example %d is skipped because the relevant token was cut off (token pos = %d)"
                        % (example_id, token_position))
                    print(text)
        return Dataset(examples, fields)

    dataset = read_data(train_path, fields, max_len)
    random.seed(0)
    trn, vld = dataset.split(0.7, stratified=True, strata_field='sense')

    TEXT.build_vocab([])
    if model_name.startswith('albert') or model_name.startswith(
            'ensemble-distil-'):

        class Mapping:
            def __init__(self, fn):
                self.fn = fn

            def __getitem__(self, item):
                return self.fn(item)

        TEXT.vocab.stoi = Mapping(tokenizer.sp_model.PieceToId)
        TEXT.vocab.itos = Mapping(tokenizer.sp_model.IdToPiece)
    else:
        TEXT.vocab.stoi = tokenizer.vocab
        TEXT.vocab.itos = list(tokenizer.vocab)
    SENSE.build_vocab(trn)
    LEMMA.build_vocab(trn)

    trn_iter = BucketIterator(trn,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=True,
                              sort=True)
    vld_iter = BucketIterator(vld,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=False,
                              sort=True)

    if freeze_base_model:
        for mat in base_model.parameters():
            mat.requires_grad = False  # Freeze Bert model so that we only train the classifier on top

    if reduce_options:
        lemma_mask = defaultdict(
            lambda: torch.zeros(len(SENSE.vocab), device=device))
        for example in trn:
            lemma = LEMMA.vocab.stoi[example.lemma]
            sense = SENSE.vocab.stoi[example.sense]
            lemma_mask[lemma][sense] = 1
        lemma_mask = dict(lemma_mask)

        def mask(
            batch_logits, batch_lemmas
        ):  # Masks out the senses that do not belong to the specified lemma
            for batch_i in range(len(batch_logits)):
                lemma = batch_lemmas[batch_i].item()
                batch_logits[batch_i, :] *= lemma_mask[lemma]
            return batch_logits
    else:

        def mask(batch_logits, batch_lemmas):
            return batch_logits

    # Human-readable experiment identifier used for result, prediction and checkpoint file names
    classifier_input_tag = "" if model_name.startswith('ensemble-distil-') else classifier_input
    flags = ((" cls_token" if cls_token else "")
             + (" reduce_options" if reduce_options else "")
             + (" freeze_base_model" if freeze_base_model else ""))
    experiment_name = (
        f"{model_name} {classifier_input_tag} {classifier_hidden_layers} ({flags}  ) "
        f"max_len={max_len} batch_size={batch_size} lr={lr} eps={eps}"
        + (" cache_embeddings" if cache_embeddings else ""))

    if model_name.startswith('ensemble-distil-'):
        model = WSDEnsembleModel(last_n_distil, last_n_albert, n_classes, mask,
                                 classifier_hidden_layers)
    else:
        model = WSDModel(base_model, n_classes, mask, use_n_last_layers,
                         model_name, classifier_hidden_layers,
                         cache_embeddings)
    history = None
    #if save_classifier:
    #    if model.load_classifier(experiment_name):
    #        # Existing saved model loaded
    #        # Also load the corresponding training history
    #        history = read_dict_file("results/"+experiment_name+".txt")

    model.cuda()

    print("Starting experiment  " + experiment_name)
    if test:
        tst = read_data(test_path, fields, max_len=512)
        tst_iter = Iterator(tst,
                            device=device,
                            batch_size=batch_size,
                            sort=False,
                            sort_within_batch=False,
                            repeat=False,
                            train=False)
        batch_predictions = []
        for batch in tst_iter:
            print('.', end='')
            sys.stdout.flush()
            text = batch.text.t()
            with torch.no_grad():
                outputs = model(text,
                                token_positions=batch.token_pos,
                                lemmas=batch.lemma,
                                example_ids=batch.example_id)
                scores = outputs[-1]
            batch_predictions.append(scores.argmax(dim=1))
        batch_preds = torch.cat(batch_predictions, 0).tolist()
        predicted_senses = [SENSE.vocab.itos[pred] for pred in batch_preds]
        with open("test_predictions/" + experiment_name + ".txt", "w") as out:
            out.write("\n".join(predicted_senses))
    else:
        no_decay = ['bias', 'LayerNorm.weight']
        decay = 0.01
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)

        def save_results(history):
            with open("results/" + experiment_name + ".txt", "w") as out:
                out.write(str(history))
            if save_classifier:
                if len(history['val_acc']) < 2 or history['val_acc'][-1] > max(
                        history['val_acc'][:-1]):
                    model.save_classifier(experiment_name, best=True)
                else:
                    model.save_classifier(experiment_name, best=False)

        train(model, optimizer, trn_iter, vld_iter, n_epochs, save_results,
              history)
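A hedged invocation sketch for the entry point above; it assumes wsd_train.txt and wsd_test_blind.txt are present in the working directory and a CUDA device is available, and the keyword values simply echo the defaults:

# Train the classifier on top of a frozen BERT encoder, then rerun with test=True
# to write predictions to test_predictions/<experiment_name>.txt.
wsd(model_name='bert-base-uncased',
    classifier_input='token-embedding-last-1-layers',
    freeze_base_model=True,
    n_epochs=50)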