Code Example #1
 def __init__(self, auto_model: str, auto_path: str):
     super().__init__()
     if "camembert" in auto_model:
         from transformers import CamembertModel, CamembertTokenizer
         self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
         self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
     elif "flaubert" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
         self.auto_tokenizer.do_lowercase_and_remove_accent = False
     elif "xlm" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
     elif "bert" in auto_model:
         from transformers import BertModel, BertTokenizer
         self.auto_embeddings = BertModel.from_pretrained(auto_path)
         self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
     else:
         from transformers import AutoModel, AutoTokenizer, XLMTokenizer
         self.auto_embeddings = AutoModel.from_pretrained(auto_path)
         self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
         if isinstance(self.auto_tokenizer, XLMTokenizer):
             self.auto_tokenizer.do_lowercase_and_remove_accent = False
     for param in self.auto_embeddings.parameters():
         param.requires_grad = False
     self._is_fixed = True
     self._output_dim = self.auto_embeddings.config.hidden_size
     self._begin_special_token_count = self.get_begin_special_token_count()
     self._padding_id = self.auto_tokenizer.pad_token_id
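A note on Example #1: the elif chain dispatches on substrings, so order matters. "camembert" and "flaubert" both contain "bert" and would be swallowed by the generic "bert" branch if it came first. A minimal standalone sketch of the same dispatch (no transformers install required):

def pick_branch(auto_model: str) -> str:
    # same order-sensitive substring dispatch as Example #1
    if "camembert" in auto_model:
        return "camembert"
    elif "flaubert" in auto_model:
        return "flaubert"
    elif "xlm" in auto_model:
        return "xlm"
    elif "bert" in auto_model:
        return "bert"
    return "auto"

assert pick_branch("flaubert-base-cased") == "flaubert"  # not "bert"
assert pick_branch("camembert-base") == "camembert"      # not "bert"
assert pick_branch("xlm-mlm-en-2048") == "xlm"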
Code Example #2
    def load_model_tokenizer(self, pretrained):
        """ Load transformer model and tokenizer for given pre-trained name 
        
        :param pretrained: pre-trained name
        :return: model, tokenizer
        """
        
        model = None
        tokenizer = None
        
        if self.method == "T5":
            if pretrained in T5_PRETRAINED_MODELS:
                model = T5ForConditionalGeneration.from_pretrained(pretrained)
                tokenizer = T5Tokenizer.from_pretrained(pretrained)
        elif self.method == "BART":
            if pretrained in BART_PRETRAINED_MODELS:
                model = BartForConditionalGeneration.from_pretrained(pretrained)
                tokenizer = BartTokenizer.from_pretrained(pretrained)
        elif self.method == "GPT-2":
            if pretrained in GPT2_PRETRAINED_MODELS:
                model = GPT2LMHeadModel.from_pretrained(pretrained)
                model.config.max_length = self.max_length
                tokenizer = GPT2Tokenizer.from_pretrained(pretrained)
        elif self.method == "XLM":
            if pretrained in XLM_PRETRAINED_MODELS:
                model = XLMWithLMHeadModel.from_pretrained(pretrained)
                model.config.max_length = self.max_length
                tokenizer = XLMTokenizer.from_pretrained(pretrained)
        else:
            pass

        return model, tokenizer
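Example #2 returns (None, None) when the method or pretrained name is unsupported, so callers should check before use; a hedged sketch, where "loader" stands in for whatever object owns load_model_tokenizer:

model, tokenizer = loader.load_model_tokenizer(pretrained)
if model is None or tokenizer is None:
    raise ValueError(
        f"Unsupported method/pretrained combination: {loader.method} / {pretrained}")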
Code Example #3
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
Code Example #4
def get_attentions():
    model_name = request.args.get('model')
    source = request.args.get('source')
    target = request.args.get('target')

    if model_name == 'XLM':
        model_version = 'xlm-mlm-ende-1024'
        model = XLMModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = XLMTokenizer.from_pretrained(model_version)
    elif model_name == 'GPT-2':
        model_version = 'gpt2'
        model = GPT2Model.from_pretrained(model_version, output_attentions=True)
        tokenizer = GPT2Tokenizer.from_pretrained(model_version)
    else:
        # BERT
        model_version = 'bert-base-uncased'
        model = BertModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)

    inputs = tokenizer.encode_plus(source, target, return_tensors='pt', add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    attention = model(input_ids, token_type_ids=token_type_ids)[-1]
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    return {'attention': format_attention(attention)[0].tolist(), 'source': tokens, 'target': tokens}
Code Example #5
    def __init__(
        self,
        pretrained_embedding=None,
        architecture_function=None,
        text_input_column="clean_text",
        meta_input_list=("extension", "dayofweek", "hour", "min"),
        vocab_size=25000,
        seq_size=100,
        embedding_dim=200,
        loss="categorical_crossentropy",
        activation="softmax",
        batch_size=4096,
        n_epochs=15,
        bert_tokenizer="jplu/tf-camembert-base",
        bert_model="jplu/tf-camembert-base",
        **kwargs,
    ):
        self.architecture_function = architecture_function
        self.pretrained_embedding = pretrained_embedding
        if self.architecture_function.__name__ != "bert_model":
            self.tokenizer = Tokenizer(input_column=text_input_column)
        elif "camembert" in bert_tokenizer.lower():
            # Prevent the HuggingFace dependency
            try:
                from transformers import CamembertTokenizer

                self.tokenizer = CamembertTokenizer.from_pretrained(
                    bert_tokenizer)
            except ModuleNotFoundError:
                raise (
                    """Please install transformers 3.4.0 (only version currently supported)
                    pip install melusine[transformers]""")
        elif "flaubert" in bert_tokenizer.lower():
            # Prevent the HuggingFace dependency
            try:
                from transformers import XLMTokenizer

                self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
            except ModuleNotFoundError:
                raise (
                    """Please install transformers 3.4.0 (only version currently supported)
                    pip install melusine[transformers]""")
        else:
            raise NotImplementedError(
                "Bert tokenizer {} not implemented".format(bert_tokenizer))
        self.text_input_column = text_input_column
        self.meta_input_list = meta_input_list
        self.vocab_size = vocab_size
        self.seq_size = seq_size
        self.embedding_dim = embedding_dim
        self.loss = loss
        self.activation = activation
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.bert_model = bert_model
        self.nb_labels = 0
        self.nb_meta_features = 0
        self.vocabulary = []
        self.vocabulary_dict = {}
Code Example #6
 def _test_TFXLM(self, size, large=False):
     from transformers import TFXLMModel, XLMTokenizer
     tokenizer = XLMTokenizer.from_pretrained(size)
     model = TFXLMModel.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     spec, input_dict = self.spec_and_pad(input_dict)
     outputs = ["last_hidden_state"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large, atol=0.005)
Code Example #7
def test_model(modelname):
    model, log = XLMModel.from_pretrained(modelname, output_loading_info=True)
    tokenizer = XLMTokenizer.from_pretrained(modelname, do_lower_case=False)

    # this line is important: by default, XLMTokenizer removes diacritics, even with do_lower_case=False flag
    tokenizer.do_lowercase_and_remove_accent = False
    print("Dictionary values must be empty lists:")
    print(log)
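The flag in Example #7 can be checked directly; a sketch, assuming the public 'xlm-mlm-en-2048' checkpoint (running it downloads the vocabulary files):

from transformers import XLMTokenizer

tok = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
print(tok.tokenize("Déjà vu"))   # accents stripped and text lowercased by default

tok.do_lowercase_and_remove_accent = False
print(tok.tokenize("Déjà vu"))   # diacritics and case preserved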
Code Example #8
def test_space_tokenization_and_xlm_uncased_tokenization_normalization():
    text = "Jeff Immelt chose to focus on the incomprehensibility of accounting rules ."
    space_tokenized = text.split(" ")
    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    target_tokenized = tokenizer.tokenize(text)
    normed_space_tokenized, normed_target_tokenized = tn.normalize_tokenizations(
        space_tokenized, target_tokenized, tokenizer)
    assert "".join(normed_space_tokenized) == "".join(normed_target_tokenized)
Code Example #9
File: utils.py  Project: DS3Lab/multilingual-gaze
def create_tokenizer(bert_pretrained):
    """
    Wrapper function returning a tokenizer for BERT.
    """
    if bert_pretrained.startswith("xlm"):
        return XLMTokenizer.from_pretrained(bert_pretrained)
    else:
        return BertTokenizer.from_pretrained(bert_pretrained)
Code Example #10
File: utils.py  Project: dabbler0/global-model-repr
def get_model_and_tokenizer(model_name, device, random_weights=False):

    if model_name.startswith('xlnet'):
        model = XLNetModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        sep = u'▁'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('gpt2'):
        model = GPT2Model.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        sizes = {
            "gpt2": 768,
            "gpt2-medium": 1024,
            "gpt2-large": 1280,
            "gpt2-xl": 1600
        }
        emb_dim = sizes[model_name]
    elif model_name.startswith('xlm'):
        model = XLMModel.from_pretrained(model_name,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(model_name)
        sep = '</w>'
        # Without this, emb_dim would be unbound on the xlm branch;
        # XLM configs expose the embedding width as `emb_dim`.
        emb_dim = model.config.emb_dim
    elif model_name.startswith('bert'):
        model = BertModel.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('distilbert'):
        model = DistilBertModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 768
    elif model_name.startswith('roberta'):
        model = RobertaModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        emb_dim = 1024 if "large" in model_name else 768
    else:
        print('Unrecognized model name:', model_name)
        sys.exit()

    if random_weights:
        print('Randomizing weights')
        model.init_weights()

    return model, tokenizer, sep, emb_dim
Code Example #11
File: x.py  Project: zeta1999/CoSDA-ML
 def init(args):
     BERTTool.multi_bert = XLMModel.from_pretrained(
         args.multi_bert.location)
     BERTTool.multi_tokener = XLMTokenizer.from_pretrained(
         args.multi_bert.location)
     BERTTool.multi_pad = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["<pad>"])[0]
     BERTTool.multi_sep = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["</s>"])[0]
     BERTTool.multi_cls = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["<s>"])[0]
Code Example #12
 def __init__(self):
     self.device = torch.device(
         "cuda:0" if torch.cuda.is_available() else "cpu"
     )
     self.tokenizer = XLMTokenizer.from_pretrained(
         'allegro/herbert-klej-cased-tokenizer-v1'
     )
     self.model = RobertaModel.from_pretrained(
         'allegro/herbert-klej-cased-v1'
     )
     self.model = self.model.to(self.device)
Code Example #13
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if (tokenizer_name.startswith("bert-") or 'rubert' in tokenizer_name
            or '/bert-' in tokenizer_name):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith(
            "roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
Code Example #14
 def test_TFXLMForQuestionAnsweringSimple(self):
     from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
     pretrained_weights = 'xlm-mlm-enfr-1024'
     tokenizer = XLMTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFXLMForQuestionAnsweringSimple.from_pretrained(
         pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files))
Code Example #15
def get_model_and_tokenizer(model_name,
                            device="cpu",
                            random_weights=False,
                            model_path=None):
    """
    model_path: if given, initialize from path instead of official repo
    """

    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        # use init_model here too, so that model_path is honored like the
        # other branches
        model = RobertaModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep
Code Example #16
 def __init__(self, model_type):
     """Constructor
     :param model_type: which model is used, xlm or mbert
     """
     if model_type == 'xlm':
         self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
         model = XLMModel.from_pretrained('xlm-mlm-100-1280')
         self.embeddings = model.embeddings.weight
     elif model_type == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
         model = BertModel.from_pretrained('bert-base-multilingual-uncased')
         self.embeddings = model.embeddings.word_embeddings.weight
     self.emb_dim = self.embeddings.shape[1]
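A hedged usage sketch for Example #16; the snippet only shows __init__, so the class name StaticEmbedder below is hypothetical. The stored weight matrix can be indexed with token ids to get per-subword vectors:

emb = StaticEmbedder('xlm')  # hypothetical name for the class above
tokens = emb.tokenizer.tokenize("hello world")
ids = emb.tokenizer.convert_tokens_to_ids(tokens)
vectors = emb.embeddings[ids]      # shape: (num_subwords, emb_dim)
assert vectors.shape[1] == emb.emb_dim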
Code Example #17
 def test_TFXLMWithLMHeadModel(self):
     from transformers import XLMTokenizer, TFXLMWithLMHeadModel
     pretrained_weights = 'xlm-mlm-enfr-1024'
     tokenizer = XLMTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFXLMWithLMHeadModel.from_pretrained(pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name,
                          onnx_model,
                          inputs_onnx,
                          predictions,
                          self.model_files,
                          rtol=1.e-2,
                          atol=1.e-4))
Code Example #18
 def __init__(self, model_type):
     """Constructor
     :param model_type: whether an xlm or a bert model is used
     """
     # Instantiate model and tokenizers from pre-trained multilingual versions
     if model_type == 'xlm':
         self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
         self.model = XLMModel.from_pretrained('xlm-mlm-100-1280',
                                               output_hidden_states=True)
     elif model_type == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained(
             'bert-base-multilingual-uncased')
         self.model = BertModel.from_pretrained(
             'bert-base-multilingual-uncased', output_hidden_states=True)
     else:
         raise ValueError(
             'Unrecognized model type. Only bert and xlm supported')
Code Example #19
def get_embedding_for_text(text: str) -> (torch.tensor, torch.tensor):
    """
    For a given sentence the function return embedding generated by BERT
    :param text: Sentence for which u want to get an embedding
    :return: (tensor of embeddings for each token in sentnece, average embedding of a sentence)
    """
    tokenizer = XLMTokenizer.from_pretrained(
        join(dirname(realpath(__file__)), "models", "tokenizer"))
    bert_model = RobertaModel.from_pretrained(
        join(dirname(realpath(__file__)), "models", "bert"))

    encoded_input = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
    outputs = bert_model(encoded_input)

    sequence_tokens_embedding = outputs[0].squeeze(dim=0)
    sentence_embedding = outputs[1].squeeze(dim=0)
    return sequence_tokens_embedding, sentence_embedding
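Example #19 (and #24 below) reloads the tokenizer and model on every call, which dominates the runtime; one way to cache them, sketched under the same on-disk layout as above:

from functools import lru_cache
from os.path import dirname, join, realpath

from transformers import RobertaModel, XLMTokenizer

@lru_cache(maxsize=1)
def _load_tokenizer_and_model():
    # load once, reuse on every subsequent call
    base = dirname(realpath(__file__))
    tokenizer = XLMTokenizer.from_pretrained(join(base, "models", "tokenizer"))
    bert_model = RobertaModel.from_pretrained(join(base, "models", "bert"))
    return tokenizer, bert_model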
Code Example #20
def convert_id_to_token(indexed_tokens, model_name):

    if model_name == "bert":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif model_name == "xlnet":
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    elif model_name == "xlm":
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    elif model_name == "electra":
        tokenizer = ElectraTokenizer.from_pretrained(
            "google/electra-small-discriminator")
    elif model_name == "albert":
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

    word_tokens = [
        tokenizer.convert_ids_to_tokens(indexed_token)
        for indexed_token in indexed_tokens
    ]
    return word_tokens
Code Example #21
File: train.py  Project: nboitout/melusine
 def __init__(
     self,
     pretrained_embedding=None,
     architecture_function=None,
     text_input_column="clean_text",
     meta_input_list=["extension", "dayofweek", "hour", "min"],
     vocab_size=25000,
     seq_size=100,
     embedding_dim=200,
     loss="categorical_crossentropy",
     activation="softmax",
     batch_size=4096,
     n_epochs=15,
     bert_tokenizer="jplu/tf-camembert-base",
     bert_model="jplu/tf-camembert-base",
     **kwargs,
 ):
     self.architecture_function = architecture_function
     self.pretrained_embedding = pretrained_embedding
     if self.architecture_function.__name__ != "bert_model":
         self.tokenizer = Tokenizer(input_column=text_input_column)
     elif "camembert" in bert_tokenizer.lower():
         self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
     elif "flaubert" in bert_tokenizer.lower():
         self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
     else:
         raise NotImplementedError(
             "Bert tokenizer {} not implemented".format(bert_tokenizer))
     self.text_input_column = text_input_column
     self.meta_input_list = meta_input_list
     self.vocab_size = vocab_size
     self.seq_size = seq_size
     self.embedding_dim = embedding_dim
     self.loss = loss
     self.activation = activation
     self.batch_size = batch_size
     self.n_epochs = n_epochs
     self.bert_model = bert_model
     self.nb_labels = 0
     self.nb_meta_features = 0
     self.vocabulary = []
     self.vocabulary_dict = {}
Code Example #22
    def __init__(self,
                 from_pretrained=None,
                 tokenizer="allegro/herbert-klej-cased-tokenizer-v1",
                 embed_model="allegro/herbert-klej-cased-v1"):
        super().__init__()

        self.tokenizer = XLMTokenizer.from_pretrained(tokenizer)
        self.embed_model = RobertaModel.from_pretrained(embed_model,
                                                        return_dict=True)

        self.fc = nn.Sequential(nn.Dropout(0.5), nn.Linear(768, 256),
                                nn.LeakyReLU(), nn.Linear(256, 16),
                                nn.LeakyReLU(), nn.Linear(16, 1), nn.Tanh())

        if from_pretrained is not None:
            f = io.BytesIO(
                importlib.resources.read_binary(trained_models,
                                                f'{from_pretrained}.pth'))
            self.fc.load_state_dict(torch.load(f))
            self.eval()
Code Example #23
def xlm_convert_to_huggingface(args):
   """
   Given a FaceBook's XLM model checkpoint, a BPE merges file, create and save
   a HuggingFace XLMTokenizer and a XLMModel.
   """
   xlm_pth = torch.load(args.checkpoint, map_location=torch.device('cpu'))

   with NamedTemporaryFile() as tfile:
      tfile.write(b'{}')
      tfile.flush()
      tokenizer = XLMTokenizer(
         tfile.name,
         args.merges,
         do_lowercase_and_remove_accent=False)
   tokenizer.encoder = convert_vocab(xlm_pth['dico_word2id'])
   vocab_size = len(tokenizer)
      
   params = xlm_pth['params']
   xlm_config = XLMConfig(
      emb_dim=params['emb_dim'],
      vocab_size=params['n_words'],
      n_layers=params['n_layers'],
      n_heads=params['n_heads'],
      n_langs=params['n_langs'],
      sinusoidal_embeddings=params['sinusoidal_embeddings'],
      use_lang_emb=params['use_lang_emb'],
      is_encoder=params['encoder_only'],
      output_hidden_states=True,
      n_words = params['n_words'],
   )
   
   # Provide both config and state dict to model init
   model = XLMModel.from_pretrained(
      None,
      config=xlm_config,
      state_dict=xlm_pth['model'])

   # Save
   save_directory = Path(args.output_dir)
   if not save_directory.exists():
      save_directory.mkdir(parents=True, exist_ok=True)
   model.save_pretrained(str(save_directory))
   tokenizer.save_pretrained(str(save_directory))
   tokenizer.save_vocabulary(str(save_directory))
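The directory saved at the end of Example #23 round-trips through the standard from_pretrained API; a sketch (the path is whatever was passed as args.output_dir):

from transformers import XLMModel, XLMTokenizer

save_directory = "converted-xlm"  # hypothetical args.output_dir
model = XLMModel.from_pretrained(save_directory)
tokenizer = XLMTokenizer.from_pretrained(save_directory)
outputs = model(**tokenizer("hello world", return_tensors="pt"))
print(outputs[0].shape)  # last hidden state: (batch, seq_len, emb_dim)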
Code Example #24
def get_embedding_for_list_of_texts(
    list_of_texts: List[str], ) -> (torch.tensor, torch.tensor):
    """
    For a given list of sentences the function return embedding generated by BERT
    :param text: Sentence for which u want to get an embedding
    :return: (tensor of embeddings for each token in sentneces, average embedding of a sentences)
    """
    tokenizer = XLMTokenizer.from_pretrained(
        join(dirname(realpath(__file__)), "models", "tokenizer"))
    bert_model = RobertaModel.from_pretrained(
        join(dirname(realpath(__file__)), "models", "bert"))

    emote_to_text = {}
    with open(join(dirname(realpath(__file__)), "emote_to_text.json"),
              encoding='utf8') as file:
        emote_to_text = json.load(file)

    list_of_texts = starmap(
        _replace_emotes_with_text,
        zip(list_of_texts, [emote_to_text] * len(list_of_texts)))

    list_of_texts = map(_remove_urls_from_text, list_of_texts)

    list_of_sentence_embeddings = []
    list_of_sequence_embeddings = []

    for text in list_of_texts:
        encoded_input = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        outputs = bert_model(encoded_input)

        sequence_tokens_embedding = outputs[0].squeeze(dim=0)
        sentence_embedding = outputs[1].squeeze(dim=0)

        list_of_sequence_embeddings.append(sequence_tokens_embedding)
        list_of_sentence_embeddings.append(sentence_embedding)

    seq_embeddings_tensor = merge(list_of_sequence_embeddings)
    sentence_embeddings_tensor = torch.stack(list_of_sentence_embeddings,
                                             dim=0)

    return seq_embeddings_tensor, sentence_embeddings_tensor
Code Example #25
def build_tokenizer(model, add_cap_sign, textify_emoji, segment_hashtag, preprocess):
    if model == 'mbert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
    elif model == 'xlm':
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')

    tokenizer.add_tokens(['@USER']) # All Transformers models

    if add_cap_sign:
        tokenizer.add_tokens(['<has_cap>', '<all_cap>'])
    if textify_emoji:
        tokenizer.add_tokens(['<emoji>', '</emoji>'])
    if segment_hashtag:
        tokenizer.add_tokens(['<hashtag>', '</hashtag>'])

    #tokenizer.add_tokens([w.strip() for w in open('../resources/log_odds.txt').readlines()])

    # TODO: this is not saved when calling `save_pretrained`
    if preprocess is not None:
        tokenizer.tokenize = compose(preprocess, tokenizer.tokenize)

    return tokenizer
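Example #25 grows the tokenizer vocabulary but builds no model; whenever tokens are added, the matching model's embedding matrix must be resized as well or the new ids will be out of range. A sketch, assuming a checkpoint that matches the 'xlm' branch above:

from transformers import XLMWithLMHeadModel

tokenizer = build_tokenizer('xlm', add_cap_sign=True, textify_emoji=True,
                            segment_hashtag=True, preprocess=None)
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-100-1280')
model.resize_token_embeddings(len(tokenizer))  # cover the newly added ids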
Code Example #26
    def __init__(self, device):
        super().__init__()

        self.net = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 2),
            # nn.Linear(768, 2),
            nn.Tanh()
        )
        self.device = device
        self.tokenizer = XLMTokenizer.from_pretrained(
            # "models/politicalBERT")
            # "models/politicalHerBERT")
            "allegro/herbert-klej-cased-tokenizer-v1")
        self.model = RobertaModel.from_pretrained(
            # "models/politicalBERT",
            "models/politicalHerBERT",
            # "allegro/herbert-klej-cased-v1",
            return_dict=True)
Code Example #27
rw_vocab = get_vocab(filename, 10000)

filename2 = "SUBTLEX-US frequency list with PoS information text version.txt"
pos_dict = get_pos_dict(filename2)

GPT2 = ModelInfo(GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True),
                 GPT2Tokenizer.from_pretrained('gpt2'), "Ġ", vocab, "GPT2")

Roberta = ModelInfo(
    RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True),
    RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta")

XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024',
                                       return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert")

TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
                TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_",
                vocab, "TXL")

if __name__ == "__main__":
Code Example #28
def main():
    parser = argparse.ArgumentParser()
    # parser.add_argument('--dataset', default='txt', type=str, help='txt -> self-customized')
    # parser.add_argument('--src_lang', default='en', type=str, help='')
    # parser.add_argument('--tgt_lang', default='zh', type=str, help='')
    parser.add_argument(
        '--max_len_en',
        default=25,
        type=int,
        help='maximum length of English in **bilingual** corpus')
    parser.add_argument(
        '--max_len_zh',
        default=25,
        type=int,
        help='maximum length of Chinese in **bilingual** corpus')
    parser.add_argument("--src_file",
                        default='./.pkl',
                        type=str,
                        help="The input data file name.")

    # General
    parser.add_argument("--config_path",
                        default=None,
                        type=str,
                        help="Bert config file path.")
    parser.add_argument(
        "--bert_model",
        default="bert-base-cased",
        type=str,
        help=
        "Bert pre-trained model selected in the list: bert-base-cased, bert-large-cased."
    )
    parser.add_argument("--xml_vocab",
                        type=str,
                        default='./download_models/xml_vocab.json')
    parser.add_argument("--xml_merge",
                        type=str,
                        default='./download_models/xml_merges.txt')
    parser.add_argument("--model_recover_path",
                        default=None,
                        type=str,
                        help="The file of fine-tuned pretraining model.")
    parser.add_argument('--max_position_embeddings',
                        type=int,
                        default=512,
                        help="max position embeddings")

    # For decoding
    #parser.add_argument('--fp16', action='store_true',
    #                   help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--amp',
                        action='store_true',
                        help="Whether to use amp for fp16")
    parser.add_argument('--seed',
                        type=int,
                        default=123,
                        help="random seed for initialization")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument('--new_segment_ids',
                        action='store_true',
                        help="Use new segment ids for bi-uni-directional LM.")
    parser.add_argument('--batch_size',
                        type=int,
                        default=4,
                        help="Batch size for decoding.")
    parser.add_argument('--beam_size',
                        type=int,
                        default=1,
                        help="Beam size for searching")
    parser.add_argument('--length_penalty',
                        type=float,
                        default=0,
                        help="Length penalty for beam search")

    parser.add_argument('--forbid_duplicate_ngrams', action='store_true')
    parser.add_argument('--forbid_ignore_word',
                        type=str,
                        default=None,
                        help="Forbid the word during forbid_duplicate_ngrams")
    parser.add_argument("--min_len", default=None, type=int)
    parser.add_argument('--ngram_size', type=int, default=3)

    parser.add_argument('--drop_prob', default=0.1, type=float)
    parser.add_argument('--enable_butd',
                        action='store_true',
                        help='set to take in region features')
    parser.add_argument('--output_dir', default='./result', type=str)

    #useless
    parser.add_argument('--split', type=str, default='val')  #wmt?
    parser.add_argument('--len_vis_input',
                        type=int,
                        default=1,
                        help="The length of visual token input region 1")

    with open(
            '/data/private/chenyutong/dataset/concept_count/word_concept_count.pkl',
            'rb') as f:
        word_fre = pickle.load(f)
    word_fre = defaultdict(int, word_fre)

    args = parser.parse_args()

    assert args.batch_size == 1, 'only support batch_size=1'
    args.max_tgt_length = max(args.max_len_en, args.max_len_zh)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    # fix random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    tokenizer_en = BertTokenizer.from_pretrained(
        args.bert_model,
        do_lower_case=args.do_lower_case,
        cache_dir=args.output_dir + '/.pretrained_model')
    if args.max_position_embeddings:
        tokenizer_en.max_len = args.max_position_embeddings
    #tokenizer_en= WhitespaceTokenizer() if args.tokenized_input else tokenizer_en
    tokenizer_zh = XLMTokenizer(args.xml_vocab, args.xml_merge)
    tokenizer_zh.tokenize = lambda x: tokenizer_zh._tokenize(
        x, lang='zh', bypass_tokenizer=False)
    with open(args.xml_vocab, 'r') as f:
        tokenizer_zh.vocab = json.load(f)
    indexer = Indexer(
        [os.path.join(args.bert_model, 'vocab.txt'), args.xml_vocab])
    with open('full_vocab.json', 'w') as f:
        json.dump(indexer.ids_to_tokens, f)
    tokenizers = {'en': tokenizer_en, 'zh': tokenizer_zh}
    print('tokenizer created')

    assert '.pkl' in args.src_file
    with open(args.src_file, 'rb') as f:
        src_data = pickle.load(f)
    # list [pred_id, vocab, vis, pos, distribution]
    # dict {'vgid':{'en':,'zh':,'region_features':[img, conf, fea[i], pos[i],dist]}}
    amp_handle = None
    if args.amp:
        from apex import amp

    # Prepare model
    cls_num_labels = 2
    type_vocab_size = 12 if args.new_segment_ids else 12
    mask_word_id, eos_word_ids = indexer(["[MASK]", "[SEP]"])
    forbid_ignore_set = None  #default None
    relax_projection, task_idx_proj = 0, 3
    if args.forbid_ignore_word:
        w_list = []
        for w in args.forbid_ignore_word.split('|'):
            if w.startswith('[') and w.endswith(']'):
                w_list.append(w.upper())
            else:
                w_list.append(w)
        forbid_ignore_set = set(indexer(w_list))

    print(args.model_recover_path)
    for model_recover_path in glob.glob(args.model_recover_path.strip()):
        #logger.info("***** Recover model: %s *****", model_recover_path)
        model_recover = torch.load(model_recover_path)
        model = BertForSeq2SeqDecoder.from_pretrained(
            args.bert_model,
            max_position_embeddings=args.max_position_embeddings,
            config_path=args.config_path,
            state_dict=model_recover,
            num_labels=cls_num_labels,
            vocab_size=len(indexer),
            type_vocab_size=type_vocab_size,
            task_idx=3,
            mask_word_id=mask_word_id,  #img2txt
            search_beam_size=args.beam_size,
            length_penalty=args.length_penalty,
            eos_id=eos_word_ids,
            forbid_duplicate_ngrams=args.forbid_duplicate_ngrams,
            forbid_ignore_set=forbid_ignore_set,
            ngram_size=args.ngram_size,
            min_len=args.min_len,
            enable_butd=args.enable_butd,
            len_vis_input=args.len_vis_input)

        del model_recover

        model.to(device)

        if args.amp:
            model = amp.initialize(model, opt_level='O2')  #'02')
        torch.cuda.empty_cache()
        model.eval()

        fout = open(os.path.join(args.output_dir, 'region2txt_output.txt'),
                    'w')
        output_lines = []
        select_ids = [87, 120, 179, 297, 721, 852, 1025]
        for step_val, sd in enumerate(src_data.items()):
            # if step_val>=1:
            #     break
            vgid, input_item = sd
            en, zh = input_item['en'], input_item['zh']
            fout.writelines('\n' + '#' * 10 + '\n')
            fout.writelines('{}\n'.format(vgid))
            fout.writelines('{} coco: word_fre {}  vis_fre {} \n'.format(
                en, input_item['coco_fre']['word'],
                input_item['coco_fre']['vis']))
            fout.writelines('{} aic: word_fre {}  vis_fre {} \n'.format(
                zh, input_item['aic_fre']['word'],
                input_item['aic_fre']['vis']))
            print('step_val {} Process {}'.format(step_val, en))
            for rf in tqdm(input_item['region_features']):
                filename, conf, vis_feats, vis_pe, cls_label = rf
                vis_feats = torch.from_numpy(vis_feats).to(device)
                vis_feats = vis_feats.unsqueeze(0)
                vis_pe = torch.from_numpy(vis_pe).to(device)
                vis_pe = vis_pe.unsqueeze(0)
                cls_label = torch.from_numpy(cls_label).to(device)
                cls_label = cls_label.unsqueeze(0)  #
                # lazy normalization of the coordinates... copy from seq2seq
                w_est = torch.max(vis_pe[:, [0, 2]]) * 1. + 1e-5
                h_est = torch.max(vis_pe[:, [1, 3]]) * 1. + 1e-5
                vis_pe[:, [0, 2]] /= w_est
                vis_pe[:, [1, 3]] /= h_est
                assert h_est > 0, 'should greater than 0! {}'.format(h_est)
                assert w_est > 0, 'should greater than 0! {}'.format(w_est)
                rel_area = (vis_pe[:, 3] - vis_pe[:, 1]) * (vis_pe[:, 2] -
                                                            vis_pe[:, 0])
                rel_area.clamp_(0)

                vis_pe = torch.cat(
                    (vis_pe[:, :4], rel_area.view(-1, 1), vis_pe[:, 5:]),
                    -1)  # confident score
                normalized_coord = F.normalize(vis_pe.data[:, :5] - 0.5,
                                               dim=-1)
                vis_pe = torch.cat((F.layer_norm(vis_pe, [6]), \
                    F.layer_norm(cls_label, [1601])), dim=-1) # 1601 hard coded... #BL,H

                vis_feats = vis_feats.unsqueeze(0)
                vis_pe = vis_pe.unsqueeze(0)
                #print('input shape', vis_feats.shape, vis_pe.shape)
                assert args.new_segment_ids == False, 'only support 0 1 6 now'
                tokens = ['[CLS]', '[UNK]', '[SEP]']
                input_ids = indexer(tokens)
                input_ids = np.expand_dims(np.array(input_ids), axis=0)
                input_ids = torch.tensor(input_ids,
                                         dtype=torch.long,
                                         device=device)

                max_len_in_batch = len(tokens) + args.max_tgt_length
                _tril_matrix = torch.tril(
                    torch.ones((max_len_in_batch, max_len_in_batch),
                               dtype=torch.long))
                input_mask = torch.zeros(max_len_in_batch,
                                         max_len_in_batch,
                                         dtype=torch.long,
                                         device=device)
                input_mask[:, :len(tokens)].fill_(1)
                second_st, second_end = len(tokens), max_len_in_batch
                input_mask[second_st:second_end, second_st:second_end].copy_(
                    _tril_matrix[:second_end - second_st, :second_end -
                                 second_st])  #L,L
                input_mask = input_mask.unsqueeze(0)

                position_ids = torch.arange(max_len_in_batch,
                                            dtype=torch.long,
                                            device=device)  #L
                position_ids = position_ids.unsqueeze(0)  # B,L

                predictions = {
                    'en': None,
                    'zh': None,
                    'en2zh': None,
                    'zh2en': None
                }
                for tgt_lang, lang_id in zip(['en', 'zh'], [1, 6]):
                    token_type_ids = [0] * len(
                        tokens) + [lang_id] * args.max_tgt_length
                    token_type_ids = np.expand_dims(np.array(token_type_ids),
                                                    axis=0)
                    token_type_ids = torch.tensor(token_type_ids,
                                                  dtype=torch.long,
                                                  device=device)
                    with torch.no_grad():
                        # print(token_type_ids[0])
                        # print(position_ids[0])
                        # print(input_ids[0])
                        # print(input_mask[0])
                        # input()
                        traces = model(
                            vis_feats=vis_feats,
                            vis_pe=vis_pe,
                            input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            attention_mask=input_mask,
                            search_beam_size=args.beam_size,
                            task_idx=3,
                            mode='img2txt',
                            sample_mode='greedy')  #validation greedy

                    output_sequence = postprocess(traces, args.beam_size,
                                                  tgt_lang, indexer)
                    predictions[tgt_lang] = output_sequence  #truncate

                for langs, lang_ids in zip(['en2zh', 'zh2en'],
                                           [[1, 6], [6, 1]]):
                    src_lang = langs[:2]  #en,zh
                    tgt_lang = langs[-2:]
                    w = predictions[
                        src_lang]  # predictions['en']/ predictions['zh']
                    w_t = tokenizers[src_lang].tokenize(w)
                    tokens = ['[CLS]'] + w_t + ['[SEP]']
                    input_ids = indexer(tokens)
                    token_type_ids = [lang_ids[0]] * len(
                        input_ids) + [lang_ids[1]] * args.max_tgt_length
                    input_ids = np.expand_dims(np.array(input_ids), axis=0)
                    token_type_ids = np.expand_dims(np.array(token_type_ids),
                                                    axis=0)
                    input_ids = torch.tensor(input_ids,
                                             dtype=torch.long,
                                             device=device)
                    token_type_ids = torch.tensor(token_type_ids,
                                                  dtype=torch.long,
                                                  device=device)

                    max_len_in_batch = len(
                        tokens) + args.max_tgt_length  #2+64 = 66
                    position_ids = torch.arange(max_len_in_batch,
                                                dtype=torch.long,
                                                device=device)  #L
                    position_ids = position_ids.unsqueeze(0)  # B,L
                    _tril_matrix = torch.tril(
                        torch.ones((max_len_in_batch, max_len_in_batch),
                                   dtype=torch.long))
                    input_mask = torch.zeros(max_len_in_batch,
                                             max_len_in_batch,
                                             dtype=torch.long,
                                             device=device)
                    input_mask[:, :len(tokens)].fill_(1)
                    second_st, second_end = len(tokens), max_len_in_batch
                    input_mask[second_st:second_end,
                               second_st:second_end].copy_(
                                   _tril_matrix[:second_end -
                                                second_st, :second_end -
                                                second_st])  #L,L
                    input_mask = input_mask.unsqueeze(0)
                    with torch.no_grad():
                        traces = model(
                            vis_feats=None,
                            vis_pe=None,
                            input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            attention_mask=input_mask,
                            search_beam_size=args.beam_size,
                            task_idx=3,
                            mode='txt2txt',
                            sample_mode='greedy')  #validation greedy
                    output_sequence = postprocess(traces, args.beam_size,
                                                  tgt_lang, indexer)
                    predictions[langs] = output_sequence

                #print(predictions)
                fout.writelines(
                    'conf:{:.2f} en:{: <10} fre:{:<5d} en2zh:{: <10} zh:{: <10} fre:{:<5d} zh2en:{: <10} \n'
                    .format(conf, predictions['en'],
                            word_fre['coco'][predictions['en']],
                            predictions['en2zh'], predictions['zh'],
                            word_fre['aic'][predictions['zh']],
                            predictions['zh2en']))

        fout.close()
Code Example #29
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture

elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel

    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
    # XLM English-French model trained on the concatenation of English and French wikipedia

else:
    print('need to define LM from Bert,RoBerta,XLM')

print(model)

def freeze_layer_fun(freeze_layer):
    for name, param in model.named_parameters():
        if freeze_layer in name:
            print(name)
            param.requires_grad = False
        else:
Code Example #30
File: extraction.py  Project: ahmed451/aux_classifier
def get_model_and_tokenizer(
    model_name, device="cpu", random_weights=False, model_path=None
):
    """
    model_path: if given, initialize from path instead of official repo
    models typically cached in ~/.cache/torch/transformers/

    """

    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define QARiB https://huggingface.co/qarib/bert-base-qarib
    elif model_name.startswith("qarib"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define AraBERT https://huggingface.co/aubmindlab/bert-base-arabert
    elif model_name.startswith("aubmindlab"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define ArabicBERT  https://huggingface.co/asafaya/bert-base-arabic
    elif model_name.startswith("asafaya"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    #Define https://huggingface.co/UBC-NLP/MARBERT
    elif model_name.startswith("UBC-NLP"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("bert-base-multilingual"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"

    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True
        ).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        # use init_model here too, so that model_path is honored like the
        # other branches
        model = RobertaModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep