Example #1
import os
import shutil

import numpy as np
import wget
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import models

# SBERTPredictor, ClassifierDataset, collate_fn, trainer, and
# remove_tokens_get_sentence_sbert are project-local helpers assumed in scope.


def train_sbert_model(model_name,
                      mancon_corpus=False,
                      med_nli=False,
                      multi_nli=False,
                      multi_nli_train_x: np.ndarray = None,
                      multi_nli_train_y: np.ndarray = None,
                      multi_nli_test_x: np.ndarray = None,
                      multi_nli_test_y: np.ndarray = None,
                      med_nli_train_x: np.ndarray = None,
                      med_nli_train_y: np.ndarray = None,
                      med_nli_test_x: np.ndarray = None,
                      med_nli_test_y: np.ndarray = None,
                      man_con_train_y: np.ndarray = None,
                      man_con_train_x: np.ndarray = None,
                      man_con_test_x: np.ndarray = None,
                      man_con_test_y: np.ndarray = None,
                      batch_size: int = 2,
                      num_epochs: int = 1,
                      ):
    """Train SBERT on any NLI dataset.

    :param model_name: model to be used, currently supported: deepset/covid_bert_base or biobert
    :param mancon_corpus: [description], defaults to False
    :type mancon_corpus: bool, optional
    :param med_nli: [description], defaults to False
    :type med_nli: bool, optional
    :param multi_nli: [description], defaults to False
    :type multi_nli: bool, optional
    :param multi_nli_train_x: [description], defaults to None
    :type multi_nli_train_x: np.ndarray, optional
    :param multi_nli_train_y: [description], defaults to None
    :type multi_nli_train_y: np.ndarray, optional
    :param multi_nli_test_x: [description], defaults to None
    :type multi_nli_test_x: np.ndarray, optional
    :param multi_nli_test_y: [description], defaults to None
    :type multi_nli_test_y: np.ndarray, optional
    :param batch_size: [description], defaults to 2
    :type batch_size: int, optional
    :param num_epochs: [description], defaults to 1
    :type num_epochs: int, optional
    :return: [description]
    :rtype: [type]
    """
    if model_name == "deepset/covid_bert_base":
        covid_bert_path = "covid_bert_path"
        model_save_path = covid_bert_path
        os.makedirs(model_save_path, exist_ok=True)
        wget.download("https://cdn.huggingface.co/deepset/covid_bert_base/vocab.txt",
                      out=f"{model_save_path}/")  # download the vocab file

    else:
        model_name = "allenai/biomed_roberta_base"
        model_save_path = "biobert_path"
        os.makedirs(model_save_path, exist_ok=True)
        wget.download("https://cdn.huggingface.co/allenai/biomed_roberta_base/merges.txt",
                      out=f"{model_save_path}/")
        wget.download("https://cdn.huggingface.co/allenai/biomed_roberta_base/vocab.json",
                      out=f"{model_save_path}/")  # download the vocab file

    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.save_pretrained(model_save_path)
    covid_bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
    del bert_model

    word_embedding_model = models.Transformer(model_save_path)
    shutil.rmtree(model_save_path)
    pooling_model = models.Pooling(768,
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=True,
                                   pooling_mode_max_tokens=True)
    # sentence embedding = concatenation of mean, CLS, and max pooled token vectors
    sbert_model = SBERTPredictor(word_embedding_model, pooling_model)
    if multi_nli:
        if multi_nli_train_x is not None:

            df_multi_train = remove_tokens_get_sentence_sbert(multi_nli_train_x, multi_nli_train_y)
            df_multi_val = remove_tokens_get_sentence_sbert(multi_nli_test_x, multi_nli_test_y)

            multi_train_dataset = ClassifierDataset(df_multi_train, tokenizer=covid_bert_tokenizer)
            multi_val_dataset = ClassifierDataset(df_multi_val, tokenizer=covid_bert_tokenizer)

            class_weights = multi_train_dataset.class_weights()

            train_loader = DataLoader(dataset=multi_train_dataset,
                                      batch_size=batch_size, collate_fn=collate_fn)
            val_loader = DataLoader(dataset=multi_val_dataset, batch_size=1, collate_fn=collate_fn)

            trainer(model=sbert_model, train_dataloader=train_loader, val_dataloader=val_loader,
                    class_weights=class_weights, epochs=num_epochs)

    if med_nli:
        if med_nli_train_x is not None:

            df_mednli_train = remove_tokens_get_sentence_sbert(med_nli_train_x, med_nli_train_y)
            df_mednli_val = remove_tokens_get_sentence_sbert(med_nli_test_x, med_nli_test_y)

            mednli_train_dataset = ClassifierDataset(df_mednli_train, tokenizer=covid_bert_tokenizer)
            mednli_val_dataset = ClassifierDataset(df_mednli_val, tokenizer=covid_bert_tokenizer)

            class_weights = mednli_train_dataset.class_weights()

            train_loader = DataLoader(dataset=mednli_train_dataset,
                                      batch_size=batch_size, collate_fn=collate_fn)
            val_loader = DataLoader(dataset=mednli_val_dataset, batch_size=1, collate_fn=collate_fn)

            trainer(model=sbert_model, train_dataloader=train_loader, val_dataloader=val_loader,
                    class_weights=class_weights, epochs=num_epochs)

    if mancon_corpus:
        if man_con_train_x is not None:

            df_mancon_train = remove_tokens_get_sentence_sbert(man_con_train_x, man_con_train_y)
            df_mancon_val = remove_tokens_get_sentence_sbert(man_con_test_x, man_con_test_y)

            mancon_train_dataset = ClassifierDataset(df_mancon_train, tokenizer=covid_bert_tokenizer)
            mancon_val_dataset = ClassifierDataset(df_mancon_val, tokenizer=covid_bert_tokenizer)

            class_weights = mancon_train_dataset.class_weights()

            train_loader = DataLoader(dataset=mancon_train_dataset,
                                      batch_size=batch_size, collate_fn=collate_fn)
            val_loader = DataLoader(dataset=mancon_val_dataset, batch_size=1, collate_fn=collate_fn)

            trainer(model=sbert_model, train_dataloader=train_loader, val_dataloader=val_loader,
                    class_weights=class_weights, epochs=num_epochs)

    return sbert_model
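
A minimal usage sketch for the function above, assuming the MultiNLI premise/hypothesis pairs and labels are already available as NumPy arrays (the file names below are placeholders):

import numpy as np

train_x = np.load("multi_nli_train_x.npy", allow_pickle=True)  # placeholder paths
train_y = np.load("multi_nli_train_y.npy", allow_pickle=True)
test_x = np.load("multi_nli_test_x.npy", allow_pickle=True)
test_y = np.load("multi_nli_test_y.npy", allow_pickle=True)

sbert = train_sbert_model("deepset/covid_bert_base",
                          multi_nli=True,
                          multi_nli_train_x=train_x,
                          multi_nli_train_y=train_y,
                          multi_nli_test_x=test_x,
                          multi_nli_test_y=test_y,
                          batch_size=16,
                          num_epochs=1)
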
Example #2
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from torch.nn import CrossEntropyLoss

os.environ["HF_HOME"] = "/scratch/huggingface_cache/"
os.makedirs(f'/scratch/devanshg27/{EXPERIMENT_ID}')

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)  #, use_fast=True)

config = AutoConfig.from_pretrained(model_checkpoint)
config.num_labels = 2
# hack to change num_labels of pretrained model (save without classification head, and then add new classification head while loading)
model = AutoModel.from_pretrained(model_checkpoint)
model.save_pretrained(f'/scratch/devanshg27_temp_{EXPERIMENT_ID}')
model = AutoModelForSequenceClassification.from_pretrained(
    f'/scratch/devanshg27_temp_{EXPERIMENT_ID}', config=config)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = torch.nn.DataParallel(model)

model = model.to(device)


class GLUECoSNLIProcessor(processors['xnli']):
    def get_labels(self):
        return ["contradiction", "entailment"]
Example #3
    if encoder_type == 'nmt':
        #model_name_or_dir = f'{exp_folder}/hf'

        BATCH_SIZE = 2000
        LAYER_ID = 4

        tokenizer_hf = FSMTTokenizer.from_pretrained(model_name_or_dir)
        model_hf = FSMTForConditionalGeneration.from_pretrained(
            model_name_or_dir)
        model_hf = model_hf.cuda()
        encoder_hf = model_hf.base_model.encoder
        encoder_hf.device = model_hf.device

    elif encoder_type == 'bert':
        #model_name_or_dir = 'xlm-roberta-base'

        BATCH_SIZE = 2000  # probably can do 512
        LAYER_ID = 7

        tokenizer_hf = AutoTokenizer.from_pretrained(model_name_or_dir)
        encoder_hf = AutoModel.from_pretrained(model_name_or_dir)
        encoder_hf = encoder_hf.cuda()

    encoded_sent = extract_reps_sent(data=data,
                                     tokenizer_hf=tokenizer_hf,
                                     encoder_hf=encoder_hf,
                                     batch_size=BATCH_SIZE,
                                     layer_id=LAYER_ID)

    pickle_dump_to_file(encoded_sent, savefile)
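
The helper extract_reps_sent is project-local and not shown. A minimal sketch consistent with how it is called here (batching sentences, running the encoder, and mean-pooling the hidden states of layer layer_id over non-padding tokens); this is an assumption, not the original code:

import torch

def extract_reps_sent(data, tokenizer_hf, encoder_hf, batch_size, layer_id):
    """Sketch: encode sentences and mean-pool one layer's hidden states."""
    reps = []
    with torch.no_grad():
        for i in range(0, len(data), batch_size):
            enc = tokenizer_hf(data[i:i + batch_size], padding=True,
                               truncation=True, return_tensors="pt")
            enc = {k: v.to(encoder_hf.device) for k, v in enc.items()}
            out = encoder_hf(**enc, output_hidden_states=True)
            hidden = out.hidden_states[layer_id]                # (B, T, H)
            mask = enc["attention_mask"].unsqueeze(-1).float()  # (B, T, 1)
            reps.append(((hidden * mask).sum(1) / mask.sum(1)).cpu())
    return torch.cat(reps).numpy()
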
Example #4
# Code adapted from: https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb

import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from torch import cuda
import sys
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel

LMTokenizer = AutoTokenizer.from_pretrained(sys.argv[1])
LMModel = AutoModel.from_pretrained(sys.argv[1])

device = 'cuda' if cuda.is_available() else 'cpu'

train_dataset = pd.read_csv('./train.csv',
                            sep=',',
                            names=['CGT', 'CDT', 'CC', 'label'])
testing_dataset = pd.read_csv('./validation.csv',
                              sep=',',
                              names=['CGT', 'CDT', 'CC', 'label'])

MAX_LEN = 512
TRAIN_BATCH_SIZE = int(sys.argv[2])
VALID_BATCH_SIZE = int(sys.argv[2])
LEARNING_RATE = float(sys.argv[3])
drop_out = float(sys.argv[4])
EPOCHS = 10
tokenizer = LMTokenizer
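
The script reads its hyperparameters from the command line; a typical invocation (the script name is a placeholder) would be:

python classify.py bert-base-uncased 16 2e-5 0.3

i.e., model name, batch size, learning rate, and dropout, in that order.
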
Example #5
    def __init__(self, config, device, num_genres=None):
        super().__init__()
        self.config = config
        self.device = device

        self.num_genres = num_genres if num_genres else len(config['genres'])
        self.max_seg_len = config['max_segment_len']
        self.max_span_width = config['max_span_width']
        assert config['loss_type'] in ['marginalized', 'hinge']
        if config['coref_depth'] > 1 or config[
                'higher_order'] == 'cluster_merging':
            assert config[
                'fine_grained']  # Higher-order is in slow fine-grained scoring

        # Model
        self.dropout = nn.Dropout(p=config['dropout_rate'])

        if config['hidden_dropout_prob'] >= 0:
            self.bert = AutoModel.from_pretrained(
                config['bert_pretrained_name_or_path'],
                hidden_dropout_prob=config['hidden_dropout_prob'])
        else:
            self.bert = AutoModel.from_pretrained(
                config['bert_pretrained_name_or_path'])

        self.bert_emb_size = self.bert.config.hidden_size
        self.span_emb_size = self.bert_emb_size * 3
        if config['use_features']:
            self.span_emb_size += config['feature_emb_size']
        self.pair_emb_size = self.span_emb_size * 3
        if config['use_metadata']:
            self.pair_emb_size += 2 * config['feature_emb_size']
        if config['use_features']:
            self.pair_emb_size += config['feature_emb_size']
        if config['use_segment_distance']:
            self.pair_emb_size += config['feature_emb_size']

        self.emb_span_width = self.make_embedding(
            self.max_span_width) if config['use_features'] else None
        self.emb_span_width_prior = self.make_embedding(
            self.max_span_width) if config['use_width_prior'] else None
        self.emb_antecedent_distance_prior = self.make_embedding(
            10) if config['use_distance_prior'] else None
        self.emb_genre = self.make_embedding(self.num_genres)
        self.emb_same_speaker = self.make_embedding(
            2) if config['use_metadata'] else None
        self.emb_segment_distance = self.make_embedding(
            config['max_training_sentences']
        ) if config['use_segment_distance'] else None
        self.emb_top_antecedent_distance = self.make_embedding(10)
        self.emb_cluster_size = self.make_embedding(
            10) if config['higher_order'] == 'cluster_merging' else None

        self.mention_token_attn = self.make_ffnn(
            self.bert_emb_size, 0,
            output_size=1) if config['model_heads'] else None
        self.span_emb_score_ffnn = self.make_ffnn(
            self.span_emb_size, [config['ffnn_size']] * config['ffnn_depth'],
            output_size=1)
        self.span_width_score_ffnn = self.make_ffnn(
            config['feature_emb_size'], [config['ffnn_size']] *
            config['ffnn_depth'],
            output_size=1) if config['use_width_prior'] else None
        self.coarse_bilinear = self.make_ffnn(self.span_emb_size,
                                              0,
                                              output_size=self.span_emb_size)
        self.antecedent_distance_score_ffnn = self.make_ffnn(
            config['feature_emb_size'], 0,
            output_size=1) if config['use_distance_prior'] else None
        self.coref_score_ffnn = self.make_ffnn(
            self.pair_emb_size, [config['ffnn_size']] * config['ffnn_depth'],
            output_size=1) if config['fine_grained'] else None

        self.gate_ffnn = self.make_ffnn(
            2 * self.span_emb_size, 0, output_size=self.span_emb_size
        ) if config['coref_depth'] > 1 else None
        self.span_attn_ffnn = self.make_ffnn(
            self.span_emb_size, 0, output_size=1
        ) if config['higher_order'] == 'span_clustering' else None
        self.cluster_score_ffnn = self.make_ffnn(
            3 * self.span_emb_size +
            config['feature_emb_size'], [config['cluster_ffnn_size']] *
            config['ffnn_depth'],
            output_size=1
        ) if config['higher_order'] == 'cluster_merging' else None

        self.update_steps = 0  # Internal use for debug
        self.debug = False
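
make_embedding and make_ffnn are factory helpers defined elsewhere in this class. Plausible minimal versions, inferred from the call sites above (an assumption, not the original code; 0 or an empty hidden-size list degenerates to a plain linear layer):

    def make_embedding(self, dict_size, std=0.02):
        emb = nn.Embedding(dict_size, self.config['feature_emb_size'])
        nn.init.normal_(emb.weight, std=std)
        return emb

    def make_ffnn(self, feat_size, hidden_sizes, output_size):
        # 0 or an empty list of hidden sizes -> single linear layer
        if not hidden_sizes:
            return nn.Linear(feat_size, output_size)
        layers, prev = [], feat_size
        for hidden in hidden_sizes:
            layers += [nn.Linear(prev, hidden), nn.ReLU(), self.dropout]
            prev = hidden
        layers.append(nn.Linear(prev, output_size))
        return nn.Sequential(*layers)
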
Example #6
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from torch import cuda
import csv
import torch

device = 'cuda' if cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased').to(
    device)

file = open('train.csv')
cr = csv.reader(file)
lines = list(cr)
file.close()

xtrain = []
ytrain = []

with torch.no_grad():  # inference only, no gradients needed
    for i in lines:
        input_ids = torch.tensor(tokenizer.encode(i[-2])).unsqueeze(0)
        outputs = model(input_ids.to(device))
        last_hidden_states = outputs[0]
        xtrain.append(last_hidden_states.tolist()[0][0])  # [CLS] token embedding
        ytrain.append(int(i[-1]))

xtest = []
ytest = []
Example #7
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

data = pd.read_csv(
    r"E:\Projects\Emotion_detection_gihan\finbert_experiments\financial phrasebank\processed_fpbank.csv"
)

sentences = list(data["sentence"])
labels = list(data["sentiment_id"])
#Sentences we want sentence embeddings for
# sentences = ['This framework generates embeddings for each input sentence',
#              'Sentences are passed as a list of string.',
#              'The quick brown fox jumps over the lazy dog.']
model_name = "ProsusAI/finbert"
# model_name = "bert-base-uncased"
#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

#Tokenize sentences
encoded_input = tokenizer(sentences,
                          padding=True,
                          truncation=True,
                          max_length=128,
                          return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output,
                                   encoded_input['attention_mask'])
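
mean_pooling is not defined in this snippet; the standard helper from the sentence-transformers documentation, which averages token embeddings under the attention mask, is:

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element: per-token embeddings (B, T, H)
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
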
Example #8
def _compute_pytorch(model_names, dictionary, average_over, device,
                     torchscript, fp16):
    for c, model_name in enumerate(model_names):
        print(f"{c + 1} / {len(model_names)}")
        config = AutoConfig.from_pretrained(model_name,
                                            torchscript=torchscript)
        model = AutoModel.from_pretrained(model_name, config=config)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenized_sequence = tokenizer.encode(input_text,
                                              add_special_tokens=False)

        max_input_size = tokenizer.max_model_input_sizes[model_name]
        batch_sizes = [1, 2, 4, 8]
        slice_sizes = [8, 64, 128, 256, 512, 1024]

        dictionary[model_name] = {
            "bs": batch_sizes,
            "ss": slice_sizes,
            "results": {}
        }
        dictionary[model_name]["results"] = {i: {} for i in batch_sizes}

        for batch_size in batch_sizes:
            if fp16:
                model.half()
            model.to(device)
            model.eval()
            for slice_size in slice_sizes:
                if max_input_size is not None and slice_size > max_input_size:
                    dictionary[model_name]["results"][batch_size][
                        slice_size] = "N/A"
                else:
                    sequence = torch.tensor(tokenized_sequence[:slice_size],
                                            device=device).repeat(
                                                batch_size, 1)
                    try:
                        if torchscript:
                            print("Tracing model with sequence size",
                                  sequence.shape)
                            inference = torch.jit.trace(model, sequence)
                            inference(sequence)
                        else:
                            inference = model
                            inference(sequence)

                        print("Going through model with sequence of shape",
                              sequence.shape)
                        runtimes = timeit.repeat(lambda: inference(sequence),
                                                 repeat=average_over,
                                                 number=3)
                        average_time = sum(runtimes) / float(
                            len(runtimes)) / 3.0
                        dictionary[model_name]["results"][batch_size][
                            slice_size] = average_time
                    except RuntimeError as e:
                        print("Doesn't fit on GPU.", e)
                        torch.cuda.empty_cache()
                        dictionary[model_name]["results"][batch_size][
                            slice_size] = "N/A"
    return dictionary
Example #9
import pickle, os.path, pandas as pd
# Modules for Scraping
from importlib import import_module
# Modules for Running Predictions
import sys, time, os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from transformers import AutoModel, BertTokenizerFast
import torch
import torch.nn as nn
import numpy as np
import sqlite3 as sql

# These are constant variables
device = torch.device("cpu")
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


class BERT_Arch(nn.Module):
    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        # dropout layer
        self.dropout = nn.Dropout(0.2)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layer 1
        self.fc1 = nn.Linear(768, 512)
        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, 2)
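
The excerpt ends before BERT_Arch's forward method; a forward pass consistent with the layers defined above (a sketch, assuming the pooled [CLS] output feeds fc1, ReLU, dropout, then fc2) would be:

    def forward(self, sent_id, mask):
        # pooled [CLS] representation; return_dict=False restores tuple outputs
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.relu(self.fc1(cls_hs))
        x = self.dropout(x)
        return self.fc2(x)
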
Example #10
def load_pretrained_model(model_name, config, cache_dir):
    if model_name in ["gpt2", "distilgpt2", "gpt2-large"]:
        return GPT2ModelNoPastState.from_pretrained(model_name, config=config, cache_dir=cache_dir)
    return AutoModel.from_pretrained(model_name, config=config, cache_dir=cache_dir)
Example #11
    FairseqDropout,
    LayerDropModuleList,
    LayerNorm,
    PositionalEmbedding,
    SinusoidalPositionalEmbedding,
    TransformerDecoderLayer,
    TransformerEncoderLayer,
)
from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
from torch import Tensor

from transformers import AutoModel

DEFAULT_MAX_SOURCE_POSITIONS = 1024
DEFAULT_MAX_TARGET_POSITIONS = 1024
Pretrained_model = AutoModel.from_pretrained("lanwuwei/GigaBERT-v4-Arabic-and-English")
Pretrained_model.eval()


@register_model("transformer")
class TransformerModel(FairseqEncoderDecoderModel):
    """
    Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017)
    <https://arxiv.org/abs/1706.03762>`_.

    Args:
        encoder (TransformerEncoder): the encoder
        decoder (TransformerDecoder): the decoder

    The Transformer model provides the following named architectures and
    command-line arguments:
# print("nonzero_index", nonzero_index)

# nonzero_index = nonzero_index.squeeze(1)
# print()
# print("nonzero_index", list(nonzero_index.numpy()))

# exit()

### load the pretrained model
output_dir = "/tmp/test-mlm-wwm"

config_file = os.path.join(output_dir, "tokenizer_config.json")
model_path = os.path.join(output_dir)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained(output_dir)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

### load the data

### load sentences
dataset_path = "/p/reviewde/data/ratebeer/sentences"
id2sent_file = "id_to_sent.pickle"

max_corpus_size = 100
embedding_cache_path = 'ratebeer-embeddings-size-{}.pkl'.format(
    max_corpus_size)

# #Check if embedding cache path exists
Example #13
    args = parser.parse_args()

    # Set defaults ------------------------------------------------------------
    if torch.cuda.is_available():
        args.device = torch.device('cuda')
    else:
        args.device = torch.device('cpu')

    # Read MedMentions trn/dev/tst splits
    split_trn = [p.rstrip() for p in open(cfg.MM_PMIDS_TRN).readlines()]
    split_dev = [p.rstrip() for p in open(cfg.MM_PMIDS_DEV).readlines()]
    split_tst = [p.rstrip() for p in open(cfg.MM_PMIDS_TST).readlines()]

    # Load pretrained BERT model
    tokenizer = AutoTokenizer.from_pretrained(cfg.BERT_MODEL)
    encoder = AutoModel.from_pretrained(cfg.BERT_MODEL,
                                        output_hidden_states=True)
    encoder.to(args.device)
    encoder.eval()

    # Read CUIs
    UMLS = Entities()

    # Read and convert MedMentions annotation examples
    examples = read_mm_examples()

    # Exclude the CUIs that do not have any name associated with
    print('=> Deleting all the CUIs without a name')
    to_delete = []
    total = len(UMLS.cuis)
    for cui, e in UMLS.cuis.items():
        if len(e.names) == 0:
Example #14
 def __init__(self, model_name_or_path, temperature=0.05, pooling="mean"):
     super().__init__(temperature)
     self.bert_model = AutoModel.from_pretrained(model_name_or_path)
     self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
     self.pooling = pooling
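
The encode step is not shown; a sketch of how such a wrapper typically applies the pooling flag (assumed, not the original implementation):

 def encode(self, texts):
     # Sketch: tokenize, run the encoder, then pool per self.pooling.
     inputs = self.tokenizer(texts, padding=True, truncation=True,
                             return_tensors="pt")
     hidden = self.bert_model(**inputs).last_hidden_state      # (B, T, H)
     if self.pooling == "mean":
         mask = inputs["attention_mask"].unsqueeze(-1).float()
         return (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
     return hidden[:, 0]                                       # [CLS] vector
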
Example #15
from train_model import *

from pprint import pprint

model = "craftassist/agent/models/semantic_parser/ttad_bert_updated/caip_test_model.pth"
args_path = "craftassist/agent/models/semantic_parser/ttad_bert_updated/caip_test_model_args.pk"
args = pickle.load(open(args_path, "rb"))

tokenizer = AutoTokenizer.from_pretrained(args.pretrained_encoder_name)
full_tree, tree_i2w = json.load(open(args.tree_voc_file))
dataset = CAIPDataset(tokenizer,
                      args,
                      prefix="",
                      full_tree_voc=(full_tree, tree_i2w))

enc_model = AutoModel.from_pretrained(args.pretrained_encoder_name)
bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_config.is_decoder = True
bert_config.add_cross_attention = True
bert_config.vocab_size = len(tree_i2w) + 8
bert_config.num_hidden_layers = args.num_decoder_layers
dec_with_loss = DecoderWithLoss(bert_config, args, tokenizer)
encoder_decoder = EncoderDecoderWithLoss(enc_model, dec_with_loss, args)
map_location = None if torch.cuda.is_available() else torch.device("cpu")
encoder_decoder.load_state_dict(torch.load(model, map_location=map_location),
                                strict=False)
encoder_decoder = encoder_decoder.cuda()
_ = encoder_decoder.eval()


def get_beam_tree(chat, noop_thres=0.95, beam_size=5, well_formed_pen=1e2):
Example #16
    def __init__(self, config, args):
        super().__init__()

        self.encoder = AutoModel.from_pretrained(args.model_name)
Example #17
 def __init__(self, pretrained):
     super().__init__()
     self.model = AutoModel.from_pretrained(pretrained)
Example #18
parser.add_argument(
    '--random_init',
    action='store_true',
    default=False,
    help='Boolean indication whether to randomly initialize the model.')

args = parser.parse_args()
print(args)

print('Extracting Features')

tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_name)
config = AutoConfig.from_pretrained(args.pretrained_model_name,
                                    output_hidden_states=True)
if args.random_init:  # random initialization of the model
    model = AutoModel.from_config(config)
else:
    model = AutoModel.from_pretrained(args.pretrained_model_name,
                                      config=config)

manifold_vectors = defaultdict(dict)
with open(args.tag_file) as f:
    for tag in f:
        tag = tag.strip().lower()
        for layer in range(1, config.num_hidden_layers + 1):
            manifold_vectors[layer][tag] = None

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
Example #19
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer

torch.set_grad_enabled(False)

# Store the model we want to use
MODEL_NAME = "bert-base-cased"

# We need to create the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

############1. basic usage############
# Tokens comes from a process that splits the input into sub-entities with interesting linguistic properties.
tokens = tokenizer.tokenize("This is an input example")
print("Tokens: {}".format(tokens))

# This is not sufficient for the model, as it requires integers as input,
# not a problem, let's convert tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the required special tokens
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# We need to convert to a Deep Learning framework specific format, let's use PyTorch for now.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Now we're ready to go through BERT with our input
outputs, pooled = model(tokens_pt)
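
Note that the tuple unpacking above relies on the older transformers API; since v4, models return a ModelOutput object by default, so the equivalent is:

output = model(tokens_pt)
outputs = output.last_hidden_state   # per-token embeddings
pooled = output.pooler_output        # pooled [CLS] representation
# or keep the old tuple behavior explicitly:
outputs, pooled = model(tokens_pt, return_dict=False)
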
Example #20
}

model_name = st.selectbox(
    "Select the model",
    ('scibert-nli', 'biobert-nli', 'covidbert-nli', 'clinicalcovidbert-nli'),
    index=2)

'#### Selected model:', model_name
EMBEDDINGS_PATH = f'{model_name}-embeddings.pkl'

path = os.path.join(MODELS_DIR, model_name)
if not os.path.exists(path):
    os.makedirs(path)

tokenizer = AutoTokenizer.from_pretrained(MODELS[model_name])
model = AutoModel.from_pretrained(MODELS[model_name])
model.save_pretrained(path)
tokenizer.save_pretrained(path)

word_embedding_model = models.BERT(path,
                                   max_seq_length=512,
                                   do_lower_case=True)

pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
rmtree(path)
model.save(path)
Example #21
device = 'cuda' if torch.cuda.is_available() else 'cpu'

data_dir = 'data-aug-wsd/data/'
saved_model_dir = 'data-aug-wsd/saved_models/'
data_file = 'train_data_reduced_augmented_le5.jsonl'

experiment_dir = saved_model_dir + '/' + experiment_name + '-' + experiment_number + '/'

if __name__ == "__main__":

    if not os.path.exists(experiment_dir):
        os.mkdir(experiment_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    context_encoder = AutoModel.from_pretrained(model_name).to(device)
    sense_encoder = AutoModel.from_pretrained(model_name).to(device)

    context_encoder_optimizer = AdamW(context_encoder.parameters(),
                                      lr=context_encoder_lr,
                                      correct_bias=False)
    sense_encoder_optimizer = AdamW(sense_encoder.parameters(),
                                    lr=sense_encoder_lr,
                                    correct_bias=False)
    cos_sim = torch.nn.CosineSimilarity()

    with open(data_dir + data_file, 'r') as f:
        data = f.readlines()
    data = list(map(lambda x: json.loads(x.rstrip('\n')), data))
    data = np.array(data)
Example #22
LANG = "en"

try:
    os.remove(f"./all_{LANG}/content_{LANG}.txt")
except FileNotFoundError:
    pass

for file in glob.iglob(f"./{LANG}/*"):
    os.remove(file)

# model = "illuin/camembert-large-fquad"
model = "camembert-base"
# model = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(model)
# bertizer = AutoModelForQuestionAnswering.from_pretrained(model)
bertizer = AutoModel.from_pretrained(model)

with open("covid_raw.json", "r") as file:
    dico = json.load(file)

dico_splitted = {}
for source, sub_dic in track(dico.items(), description="Entries..."):
    try:
        raw_text_fr = sub_dic["content_fr"]
        title_fr = sub_dic["title_fr"]
    except KeyError:
        continue

    splited_words_fr = np.array(raw_text_fr.split(" "))
    splitted_chunk_words_fr = np.array_split(
        splited_words_fr, (len(splited_words_fr) // 200) + 1)
Example #23
 def build_transformer_base(self):
     """Build the transformer base model.
     """
     self.transformer = AutoModel.from_pretrained(
         self.config.transformer_base, config=self.transformer_config)
Example #24
from transformers import AutoTokenizer, AutoModel
import os

# define the name of the directory to be created
DIRECTORY = "./bert-base-cased"

try:
    os.mkdir(DIRECTORY)
except OSError:
    print("Creation of the directory %s failed" % DIRECTORY)
else:
    print("Successfully created the directory %s " % DIRECTORY)

AutoTokenizer.from_pretrained("bert-base-cased").save_pretrained(DIRECTORY)
AutoModel.from_pretrained("bert-base-cased").save_pretrained(DIRECTORY)
Example #25
 def build_model(self):
     config = AutoConfig.from_pretrained(self.hparams.pretrain,
                                         output_hidden_states=True)
     model = AutoModel.from_pretrained(self.hparams.pretrain, config=config)
     return model
Example #26
 def __init__(self, num_labels):
     super(TextClassifier, self).__init__()
     self.pretrained_model = AutoModel.from_pretrained(model_checkpoint)
     self.classifier = nn.Sequential(nn.Dropout(p=0.2),
                                     nn.Linear(256, num_labels), nn.ReLU())
Example #27
def main():
    # 1. Parse input arguments
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # 2. Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # 3. Detect the last checkpoint and eventually continue from it
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # 4. Load dataset
    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files this script will use the first column for the full image path and the second column for the
    # captions (unless you specify column names for this with the `image_column` and `caption_column` arguments).
    #
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        dataset = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            keep_in_memory=False,
            data_dir=data_args.data_dir,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # 5. Load pretrained model, tokenizer, and feature extractor
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    # Load feature_extractor, in this script we only use this to get the mean and std for normalization.
    feature_extractor = AutoFeatureExtractor.from_pretrained(
        model_args.feature_extractor_name or model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    model = AutoModel.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    config = model.config

    def _freeze_params(module):
        for param in module.parameters():
            param.requires_grad = False

    if model_args.freeze_vision_model:
        _freeze_params(model.vision_model)

    if model_args.freeze_text_model:
        _freeze_params(model.text_model)

    # set seed for torch dataloaders
    set_seed(training_args.seed)

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = dataset["train"].column_names
    elif training_args.do_eval:
        column_names = dataset["validation"].column_names
    elif training_args.do_predict:
        column_names = dataset["test"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
        return

    # 6. Get the column names for input/target.
    dataset_columns = dataset_name_mapping.get(data_args.dataset_name, None)
    if data_args.image_column is None:
        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
    else:
        image_column = data_args.image_column
        if image_column not in column_names:
            raise ValueError(
                f"--image_column' value '{data_args.image_column}' needs to be one of: {', '.join(column_names)}"
            )
    if data_args.caption_column is None:
        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
    else:
        caption_column = data_args.caption_column
        if caption_column not in column_names:
            raise ValueError(
                f"--caption_column' value '{data_args.caption_column}' needs to be one of: {', '.join(column_names)}"
            )

    # 7. Preprocessing the datasets.
    # Initialize torchvision transforms and jit it for faster processing.
    image_transformations = Transform(
        config.vision_config.image_size, feature_extractor.image_mean, feature_extractor.image_std
    )
    image_transformations = torch.jit.script(image_transformations)

    # Preprocessing the datasets.
    # We need to tokenize input captions and transform the images.
    def tokenize_captions(examples):
        captions = [caption for caption in examples[caption_column]]
        text_inputs = tokenizer(captions, max_length=data_args.max_seq_length, padding="max_length", truncation=True)
        examples["input_ids"] = text_inputs.input_ids
        examples["attention_mask"] = text_inputs.attention_mask
        return examples

    def transform_images(examples):
        images = [read_image(image_file, mode=ImageReadMode.RGB) for image_file in examples[image_column]]
        examples["pixel_values"] = [image_transformations(image) for image in images]
        return examples

    def filter_corrupt_images(examples):
        """remove problematic images"""
        valid_images = []
        for image_file in examples[image_column]:
            try:
                Image.open(image_file)
                valid_images.append(True)
            except Exception:
                valid_images.append(False)
        return valid_images

    if training_args.do_train:
        if "train" not in dataset:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = dataset["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

        train_dataset = train_dataset.filter(
            filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
        )
        train_dataset = train_dataset.map(
            function=tokenize_captions,
            batched=True,
            remove_columns=[col for col in column_names if col != image_column],
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on train dataset",
        )

        # Transform images on the fly as doing it on the whole dataset takes too much time.
        train_dataset.set_transform(transform_images)

    if training_args.do_eval:
        if "validation" not in dataset:
            raise ValueError("--do_eval requires a train validation")
        eval_dataset = dataset["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

        eval_dataset = eval_dataset.filter(
            filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
        )
        eval_dataset = eval_dataset.map(
            function=tokenize_captions,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[col for col in column_names if col != image_column],
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on validation dataset",
        )

        # Transform images on the fly as doing it on the whole dataset takes too much time.
        eval_dataset.set_transform(transform_images)

    if training_args.do_predict:
        if "test" not in dataset:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = dataset["test"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(test_dataset), data_args.max_eval_samples)
            test_dataset = test_dataset.select(range(max_eval_samples))

        test_dataset = test_dataset.filter(
            filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
        )
        test_dataset = test_dataset.map(
            function=tokenize_captions,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[col for col in column_names if col != image_column],
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on test dataset",
        )

        # Transform images on the fly as doing it on the whole dataset takes too much time.
        test_dataset.set_transform(transform_images)

    # 8. Initialize our trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        data_collator=collate_fn,
    )

    # 9. Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        trainer.save_state()

    # 10. Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate()
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # 11. Write Training Stats and push to hub.
    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "contrastive-image-text-modeling"}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
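
A typical invocation of a script built around these arguments (a sketch; the flag names follow the argument dataclasses parsed above, while the script name, model, and dataset are placeholders):

python run_contrastive.py \
    --model_name_or_path openai/clip-vit-base-patch32 \
    --dataset_name your_hub_dataset \
    --image_column image_path \
    --caption_column caption \
    --output_dir ./finetuned-model \
    --do_train --do_eval \
    --per_device_train_batch_size 64 \
    --overwrite_output_dir
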
Example #28
def transformers(
    path_to_senteval: str,
    pretrained_model_name_or_path: str,
    output_filepath: str = None,
    mean_pool: bool = False,
    cuda_device: int = -1,
    prototyping_config: bool = False,
    verbose: bool = False,
) -> None:
    """Evaluates a pre-trained model from the Transformers library against the SentEval benchmark."""

    from transformers import AutoModel, AutoTokenizer

    # SentEval prepare and batcher
    def prepare(params, samples):
        return

    @torch.no_grad()
    def batcher(params, batch):
        batch = _cleanup_batch(batch)
        # Re-tokenize the input text using the pre-trained tokenizer
        batch = [" ".join(tokens) for tokens in batch]
        # HACK (John): This will save us in the case of tokenizers with no default max_length
        # Why does this happen? Open an issue on Transformers.
        max_length = params.tokenizer.max_length if hasattr(
            params.tokenizer, "max_length") else 512
        inputs = params.tokenizer.batch_encode_plus(batch,
                                                    pad_to_max_length=True,
                                                    max_length=max_length,
                                                    return_tensors="pt")
        # Place all input tensors on same device as the model
        inputs = {
            name: tensor.to(params.device)
            for name, tensor in inputs.items()
        }

        sequence_output, pooled_output = model(**inputs)

        # If mean_pool, we take the average of the token-level embeddings, accounting for pads.
        # Otherwise, we take the pooled output for this specific model, which is typically the
        # embedding of a special tokens embedding, like [CLS] or <s>, which is prepended to the
        # input during tokenization.
        if mean_pool:
            embeddings = torch.sum(
                sequence_output * inputs["attention_mask"].unsqueeze(-1),
                dim=1) / torch.clamp(torch.sum(
                    inputs["attention_mask"], dim=1, keepdims=True),
                                     min=1e-9)
        else:
            embeddings = pooled_output
        embeddings = embeddings.cpu().numpy()

        return embeddings

    # Determine the torch device
    device = _get_device(cuda_device)

    # Load the Transformers tokenizer
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    typer.secho(
        (f"{SUCCESS} Tokenizer '{pretrained_model_name_or_path}' from Transformers loaded"
         " successfully."),
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Load the Transformers model
    model = AutoModel.from_pretrained(pretrained_model_name_or_path)
    model.to(device)
    model.eval()
    typer.secho(
        f'{SUCCESS} Model "{pretrained_model_name_or_path}" from Transformers loaded successfully.',
        fg=typer.colors.GREEN,
        bold=True,
    )

    # Performs a few setup steps and returns the SentEval params
    params_senteval = _setup_senteval(path_to_senteval, prototyping_config,
                                      verbose)
    params_senteval["tokenizer"] = tokenizer
    params_senteval["model"] = model
    params_senteval["device"] = device
    _run_senteval(params_senteval, path_to_senteval, batcher, prepare,
                  output_filepath)

    return
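
Called, for example, as (paths are placeholders):

transformers(
    path_to_senteval="/path/to/SentEval",
    pretrained_model_name_or_path="bert-base-uncased",
    output_filepath="senteval_results.json",
    mean_pool=True,
    cuda_device=0,
)
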
Example #29
    def test_rag_sequence_from_pretrained(self):
        rag_config = self.get_rag_config()
        rag_decoder_tokenizer = BartTokenizer.from_pretrained(
            "facebook/bart-large-cnn")
        rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
            "facebook/dpr-question_encoder-single-nq-base")
        rag_retriever = RagRetriever(
            rag_config,
            question_encoder_tokenizer=rag_question_encoder_tokenizer,
            generator_tokenizer=rag_decoder_tokenizer,
        )

        input_ids = rag_question_encoder_tokenizer(
            "who sings does he love me with reba",
            return_tensors="pt").input_ids
        decoder_input_ids = rag_decoder_tokenizer(
            "Linda Davis", return_tensors="pt").input_ids

        input_ids = input_ids.to(torch_device)
        decoder_input_ids = decoder_input_ids.to(torch_device)

        with tempfile.TemporaryDirectory() as tmp_dirname:
            rag_sequence = RagSequenceForGeneration.from_pretrained_question_encoder_generator(
                "facebook/dpr-question_encoder-single-nq-base",
                "facebook/bart-large-cnn",
                retriever=rag_retriever,
                config=rag_config,
            ).to(torch_device)
            # check that the from pretrained methods work
            rag_sequence.save_pretrained(tmp_dirname)
            rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever)
            rag_sequence.to(torch_device)

            with torch.no_grad():
                output = rag_sequence(
                    input_ids,
                    labels=decoder_input_ids,
                )

            loss_pretrained = output.loss
            del rag_sequence

        question_encoder = AutoModel.from_pretrained(
            "facebook/dpr-question_encoder-single-nq-base")
        generator = AutoModelForSeq2SeqLM.from_pretrained(
            "facebook/bart-large-cnn")
        rag_sequence = RagSequenceForGeneration(
            config=rag_config,
            question_encoder=question_encoder,
            generator=generator,
            retriever=rag_retriever)
        rag_sequence.to(torch_device)

        with torch.no_grad():
            output = rag_sequence(
                input_ids,
                labels=decoder_input_ids,
            )

        loss_init = output.loss

        self.assertAlmostEqual(loss_pretrained.item(),
                               loss_init.item(),
                               places=4)
Example #30
    def __init__(self,
                 classifier_dims,
                 num_classes,
                 gaussian_noise,
                 dropout,
                 internal_dims,
                 n_layers,
                 featurizer,
                 n_tokens_in=64,
                 n_tokens_out=16,
                 use_as_super=False,
                 **kwargs):
        embedding_dims = 768
        super(AlbertClassifer,
              self).__init__(classifier_dims, num_classes, embedding_dims,
                             gaussian_noise, dropout, internal_dims, n_layers,
                             featurizer, final_layer_builder, n_tokens_in,
                             n_tokens_out, True, **kwargs)
        self.word_masking_proba = kwargs.get("word_masking_proba", 0.0)

        if not use_as_super:
            model = kwargs["model"] if "model" in kwargs else 'albert-base-v2'
            global_dir = get_global("models_dir")
            model = os.path.join(
                global_dir,
                model) if model in os.listdir(global_dir) else model
            self.tokenizer = AutoTokenizer.from_pretrained(model)
            self.model = AutoModel.from_pretrained(model)
            print("Pick stored Model", model, "Model Class = ",
                  type(self.model), "Tokenizer Class = ", type(self.tokenizer))
            if featurizer == "cnn":
                self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims,
                                                  n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "gru":
                self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims,
                                                n_tokens_out, classifier_dims,
                                                internal_dims, n_layers,
                                                gaussian_noise, dropout)
            elif featurizer == "basic":
                self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims,
                                                  n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "transformer":
                self.attention_drop_proba = kwargs.get(
                    "attention_drop_proba", 0.0)
                n_encoders = kwargs.pop("n_encoders", n_layers)
                n_decoders = kwargs.pop("n_decoders", n_layers)
                self.featurizer = TransformerFeaturizer(
                    n_tokens_in, embedding_dims, n_tokens_out, classifier_dims,
                    internal_dims, n_encoders, n_decoders, gaussian_noise,
                    dropout, self.attention_drop_proba)
            else:
                raise NotImplementedError()

            self.final_layer = fb_1d_loss_builder(classifier_dims,
                                                  n_tokens_out, num_classes,
                                                  dropout, **kwargs)
        if "stored_model" in kwargs:
            load_stored_params(self, kwargs["stored_model"])
        self.word_masking = WordMasking(tokenizer=self.tokenizer, **kwargs)
        self.reg_layers = get_regularization_layers(self)