Example no. 1
 def __init__(self, auto_model: str, auto_path: str):
     super().__init__()
     if "camembert" in auto_model:
         from transformers import CamembertModel, CamembertTokenizer
         self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
         self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
     elif "flaubert" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
         self.auto_tokenizer.do_lowercase_and_remove_accent = False
     elif "xlm" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
     elif "bert" in auto_model:
         from transformers import BertModel, BertTokenizer
         self.auto_embeddings = BertModel.from_pretrained(auto_path)
         self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
     else:
         from transformers import AutoModel, AutoTokenizer, XLMTokenizer
         self.auto_embeddings = AutoModel.from_pretrained(auto_path)
         self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
         if isinstance(self.auto_tokenizer, XLMTokenizer):
             self.auto_tokenizer.do_lowercase_and_remove_accent = False
     for param in self.auto_embeddings.parameters():
         param.requires_grad = False
     self._is_fixed = True
     self._output_dim = self.auto_embeddings.config.hidden_size
     self._begin_special_token_count = self.get_begin_special_token_count()
     self._padding_id = self.auto_tokenizer.pad_token_id
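A standalone sketch (not from the original repo) of why the elif order above matters: "camembert" and "flaubert" both contain the substring "bert", so the generic "bert" branch must be checked last.

def pick_family(auto_model):
    # Same substring dispatch as above, shown in isolation.
    for family in ("camembert", "flaubert", "xlm", "bert"):
        if family in auto_model:
            return family
    return "auto"

assert pick_family("camembert-base") == "camembert"      # would also match "bert"
assert pick_family("flaubert_base_cased") == "flaubert"  # ditto
assert pick_family("xlm-mlm-en-2048") == "xlm"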
Example no. 2
def get_attentions():
    model_name = request.args.get('model')
    source = request.args.get('source')
    target = request.args.get('target')

    if model_name == 'XLM':
        model_version = 'xlm-mlm-ende-1024'
        model = XLMModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = XLMTokenizer.from_pretrained(model_version)
    elif model_name == 'GPT-2':
        model_version = 'gpt2'
        model = GPT2Model.from_pretrained(model_version, output_attentions=True)
        tokenizer = GPT2Tokenizer.from_pretrained(model_version)
    else:
        # BERT
        model_version = 'bert-base-uncased'
        model = BertModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)

    inputs = tokenizer.encode_plus(source, target, return_tensors='pt', add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    attention = model(input_ids, token_type_ids=token_type_ids)[-1]
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    return {'attention': format_attention(attention)[0].tolist(), 'source': tokens, 'target': tokens}
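For reference, a minimal sketch (assuming an older transformers release where the attentions are the last element of the output tuple, as the code above relies on) of the attention tensor shapes returned with output_attentions=True:

import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
inputs = tokenizer.encode_plus('The cat sat.', 'It purred.', return_tensors='pt')
with torch.no_grad():
    attention = model(**inputs)[-1]
# One tensor per layer, each of shape (batch, num_heads, seq_len, seq_len)
print(len(attention), attention[0].shape)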
Example no. 3
    def load_model_tokenizer(self, pretrained):
        """ Load transformer model and tokenizer for given pre-trained name 
        
        :param pretrained: pre-trained name
        :return: model, tokenizer
        """
        
        model = None
        tokenizer = None
        
        if self.method == "T5":
            if pretrained in T5_PRETRAINED_MODELS:
                model = T5ForConditionalGeneration.from_pretrained(pretrained)
                tokenizer = T5Tokenizer.from_pretrained(pretrained)
        elif self.method == "BART":
            if pretrained in BART_PRETRAINED_MODELS:
                model = BartForConditionalGeneration.from_pretrained(pretrained)
                tokenizer = BartTokenizer.from_pretrained(pretrained)
        elif self.method == "GPT-2":
            if pretrained in GPT2_PRETRAINED_MODELS:
                model = GPT2LMHeadModel.from_pretrained(pretrained)
                model.config.max_length = self.max_length
                tokenizer = GPT2Tokenizer.from_pretrained(pretrained)
        elif self.method == "XLM":
            if pretrained in XLM_PRETRAINED_MODELS:
                model = XLMWithLMHeadModel.from_pretrained(pretrained)
                model.config.max_length = self.max_length
                tokenizer = XLMTokenizer.from_pretrained(pretrained)
        else:
            pass

        return model, tokenizer
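Hypothetical caller (the *_PRETRAINED_MODELS whitelists are defined elsewhere in that repo; `generator` is an illustrative instance name): unknown names fall through to (None, None), so callers should guard against None.

model, tokenizer = generator.load_model_tokenizer("t5-base")
if model is None or tokenizer is None:
    raise ValueError("t5-base is not in the configured whitelist for this method")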
Example no. 4
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
Example no. 5
    def __init__(
        self,
        pretrained_embedding=None,
        architecture_function=None,
        text_input_column="clean_text",
        meta_input_list=("extension", "dayofweek", "hour", "min"),
        vocab_size=25000,
        seq_size=100,
        embedding_dim=200,
        loss="categorical_crossentropy",
        activation="softmax",
        batch_size=4096,
        n_epochs=15,
        bert_tokenizer="jplu/tf-camembert-base",
        bert_model="jplu/tf-camembert-base",
        **kwargs,
    ):
        self.architecture_function = architecture_function
        self.pretrained_embedding = pretrained_embedding
        if self.architecture_function.__name__ != "bert_model":
            self.tokenizer = Tokenizer(input_column=text_input_column)
        elif "camembert" in bert_tokenizer.lower():
            # Keep the HuggingFace transformers dependency optional
            try:
                from transformers import CamembertTokenizer

                self.tokenizer = CamembertTokenizer.from_pretrained(
                    bert_tokenizer)
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    """Please install transformers 3.4.0 (only version currently supported)
                    pip install melusine[transformers]""")
        elif "flaubert" in bert_tokenizer.lower():
            # Keep the HuggingFace transformers dependency optional
            try:
                from transformers import XLMTokenizer

                self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    """Please install transformers 3.4.0 (only version currently supported)
                    pip install melusine[transformers]""")
        else:
            raise NotImplementedError(
                "Bert tokenizer {} not implemented".format(bert_tokenizer))
        self.text_input_column = text_input_column
        self.meta_input_list = meta_input_list
        self.vocab_size = vocab_size
        self.seq_size = seq_size
        self.embedding_dim = embedding_dim
        self.loss = loss
        self.activation = activation
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.bert_model = bert_model
        self.nb_labels = 0
        self.nb_meta_features = 0
        self.vocabulary = []
        self.vocabulary_dict = {}
Example no. 6
 def _test_TFXLM(self, size, large=False):
     from transformers import TFXLMModel, XLMTokenizer
     tokenizer = XLMTokenizer.from_pretrained(size)
     model = TFXLMModel.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     spec, input_dict = self.spec_and_pad(input_dict)
     outputs = ["last_hidden_state"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large, atol=0.005)
Example no. 7
def test_model(modelname):
    model, log = XLMModel.from_pretrained(modelname, output_loading_info=True)
    tokenizer = XLMTokenizer.from_pretrained(modelname, do_lower_case=False)

    # this line is important: by default, XLMTokenizer removes diacritics, even with do_lower_case=False flag
    tokenizer.do_lowercase_and_remove_accent = False
    print("Dictionary values must be empty lists:")
    print(log)
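A quick illustration of the diacritics behaviour flagged in the comment above (a sketch; assumes the xlm-mlm-en-2048 vocabulary is downloadable):

from transformers import XLMTokenizer

tok = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
with_default = tok.tokenize("café")   # accent stripped: tokenized as if it were "cafe"
tok.do_lowercase_and_remove_accent = False
preserved = tok.tokenize("café")      # accent kept
print(with_default, preserved)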
Example no. 8
def test_space_tokenization_and_xlm_uncased_tokenization_normalization():
    text = "Jeff Immelt chose to focus on the incomprehensibility of accounting rules ."
    space_tokenized = text.split(" ")
    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    target_tokenized = tokenizer.tokenize(text)
    normed_space_tokenized, normed_target_tokenized = tn.normalize_tokenizations(
        space_tokenized, target_tokenized, tokenizer)
    assert "".join(normed_space_tokenized) == "".join(normed_target_tokenized)
Example no. 9
def create_tokenizer(bert_pretrained):
    """
    Wrapper function returning a tokenizer for the given pretrained name (XLM or BERT).
    """
    if bert_pretrained.startswith("xlm"):
        return XLMTokenizer.from_pretrained(bert_pretrained)
    else:
        return BertTokenizer.from_pretrained(bert_pretrained)
Example no. 10
 def init(args):
     BERTTool.multi_bert = XLMModel.from_pretrained(
         args.multi_bert.location)
     BERTTool.multi_tokener = XLMTokenizer.from_pretrained(
         args.multi_bert.location)
     BERTTool.multi_pad = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["<pad>"])[0]
     BERTTool.multi_sep = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["</s>"])[0]
     BERTTool.multi_cls = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["<s>"])[0]
Example no. 11
def get_model_and_tokenizer(model_name, device, random_weights=False):

    model_name = model_name

    if model_name.startswith('xlnet'):
        model = XLNetModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        sep = u'▁'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('gpt2'):
        model = GPT2Model.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        sizes = {
            "gpt2": 768,
            "gpt2-medium": 1024,
            "gpt2-large": 1280,
            "gpt2-xl": 1600
        }
        emb_dim = sizes[model_name]
    elif model_name.startswith('xlm'):
        model = XLMModel.from_pretrained(model_name,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(model_name)
        sep = '</w>'
        emb_dim = model.config.emb_dim  # XLM configs expose the hidden width as emb_dim
    elif model_name.startswith('bert'):
        model = BertModel.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('distilbert'):
        model = DistilBertModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 768
    elif model_name.startswith('roberta'):
        model = RobertaModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        emb_dim = 1024 if "large" in model_name else 768
    else:
        print('Unrecognized model name:', model_name)
        sys.exit()

    if random_weights:
        print('Randomizing weights')
        model.init_weights()

    return model, tokenizer, sep, emb_dim
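The sep strings returned above mark subword boundaries differently per family; a small check (the BERT example follows the HuggingFace tokenizer docs):

from transformers import BertTokenizer, GPT2Tokenizer

print(GPT2Tokenizer.from_pretrained("gpt2").tokenize("Hello world"))
# ['Hello', 'Ġworld']  -- 'Ġ' marks a word-initial piece
print(BertTokenizer.from_pretrained("bert-base-uncased").tokenize("I have a new GPU!"))
# ['i', 'have', 'a', 'new', 'gp', '##u', '!']  -- '##' marks a continuation piece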
Example no. 12
 def __init__(self):
     self.device = torch.device(
         "cuda:0" if torch.cuda.is_available() else "cpu"
     )
     self.tokenizer = XLMTokenizer.from_pretrained(
         'allegro/herbert-klej-cased-tokenizer-v1'
     )
     self.model = RobertaModel.from_pretrained(
         'allegro/herbert-klej-cased-v1'
     )
     self.model = self.model.to(self.device)
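A minimal forward pass for this HerBERT pairing (a sketch; `extractor` stands in for an instance of the class above):

import torch

inputs = extractor.tokenizer("Ala ma kota.", return_tensors="pt").to(extractor.device)
with torch.no_grad():
    outputs = extractor.model(**inputs)
print(outputs[0].shape)  # last hidden state: (1, seq_len, 768)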
Example no. 13
 def test_TFXLMForQuestionAnsweringSimple(self):
     from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
     pretrained_weights = 'xlm-mlm-enfr-1024'
     tokenizer = XLMTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFXLMForQuestionAnsweringSimple.from_pretrained(
         pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files))
Example no. 14
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if tokenizer_name.startswith(
            "bert-"
    ) or 'rubert' in tokenizer_name or '/bert-' in tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith(
            "roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
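A small check of the len(tokenizer) vs. tokenizer.vocab_size distinction noted in the comment above (assumes the gpt2 vocabulary is available):

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
base = len(tok)
tok.add_special_tokens({"cls_token": "<extract>"})
assert len(tok) == base + 1       # includes the newly added token
assert tok.vocab_size == base     # vocab_size does not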
Example no. 15
def get_model_and_tokenizer(model_name,
                            device="cpu",
                            random_weights=False,
                            model_path=None):
    """
    model_path: if given, initialize from path instead of official repo
    """

    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        model = RobertaModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep
Example no. 16
 def __init__(self, model_type):
     """Constructor
     :param model_type: which model is used, 'xlm' or 'bert' (multilingual BERT)
     """
     if model_type == 'xlm':
         self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
         model = XLMModel.from_pretrained('xlm-mlm-100-1280')
         self.embeddings = model.embeddings.weight
     elif model_type == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
         model = BertModel.from_pretrained('bert-base-multilingual-uncased')
         self.embeddings = model.embeddings.word_embeddings.weight
     self.emb_dim = self.embeddings.shape[1]
Example no. 17
 def test_TFXLMWithLMHeadModel(self):
     from transformers import XLMTokenizer, TFXLMWithLMHeadModel
     pretrained_weights = 'xlm-mlm-enfr-1024'
     tokenizer = XLMTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFXLMWithLMHeadModel.from_pretrained(pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name,
                          onnx_model,
                          inputs_onnx,
                          predictions,
                          self.model_files,
                          rtol=1.e-2,
                          atol=1.e-4))
Example no. 18
 def __init__(self, model_type):
     """Constructor
     :param model_type: whether an 'xlm' or 'bert' model is used
     """
     # Instantiate model and tokenizers from pre-trained multilingual versions
     if model_type == 'xlm':
         self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
         self.model = XLMModel.from_pretrained('xlm-mlm-100-1280',
                                               output_hidden_states=True)
     elif model_type == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained(
             'bert-base-multilingual-uncased')
         self.model = BertModel.from_pretrained(
             'bert-base-multilingual-uncased', output_hidden_states=True)
     else:
         raise ValueError(
             'Unrecognized model type. Only bert and xlm supported')
Example no. 19
def get_embedding_for_text(text: str) -> (torch.tensor, torch.tensor):
    """
    For a given sentence, the function returns the embedding generated by BERT.
    :param text: sentence for which you want to get an embedding
    :return: (tensor of embeddings for each token in the sentence, average embedding of the sentence)
    """
    tokenizer = XLMTokenizer.from_pretrained(
        join(dirname(realpath(__file__)), "models", "tokenizer"))
    bert_model = RobertaModel.from_pretrained(
        join(dirname(realpath(__file__)), "models", "bert"))

    encoded_input = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
    outputs = bert_model(encoded_input)

    sequence_tokens_embedding = outputs[0].squeeze(dim=0)
    sentence_embedding = outputs[1].squeeze(dim=0)
    return sequence_tokens_embedding, sentence_embedding
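Hypothetical usage of the helper above (DEVICE and the bundled model directories come from the surrounding module):

tokens_emb, sentence_emb = get_embedding_for_text("przykładowe zdanie")
print(tokens_emb.shape)    # (seq_len, hidden_size)
print(sentence_emb.shape)  # (hidden_size,)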
Example no. 20
def convert_id_to_token(indexed_tokens, model_name):

    if model_name == "bert":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif model_name == "xlnet":
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    elif model_name == "xlm":
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    elif model_name == "electra":
        tokenizer = ElectraTokenizer.from_pretrained(
            "google/electra-small-discriminator")
    elif model_name == "albert":
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

    word_tokens = [
        tokenizer.convert_ids_to_tokens(indexed_token)
        for indexed_token in indexed_tokens
    ]
    return word_tokens
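Hypothetical call: ids 101 and 102 are [CLS] and [SEP] in BERT's uncased vocabulary.

print(convert_id_to_token([[101, 102]], "bert"))
# [['[CLS]', '[SEP]']]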
Example no. 21
 def __init__(
     self,
     pretrained_embedding=None,
     architecture_function=None,
     text_input_column="clean_text",
     meta_input_list=["extension", "dayofweek", "hour", "min"],
     vocab_size=25000,
     seq_size=100,
     embedding_dim=200,
     loss="categorical_crossentropy",
     activation="softmax",
     batch_size=4096,
     n_epochs=15,
     bert_tokenizer="jplu/tf-camembert-base",
     bert_model="jplu/tf-camembert-base",
     **kwargs,
 ):
     self.architecture_function = architecture_function
     self.pretrained_embedding = pretrained_embedding
     if self.architecture_function.__name__ != "bert_model":
         self.tokenizer = Tokenizer(input_column=text_input_column)
     elif "camembert" in bert_tokenizer.lower():
         self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
     elif "flaubert" in bert_tokenizer.lower():
         self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
     else:
         raise NotImplementedError(
             "Bert tokenizer {} not implemented".format(bert_tokenizer))
     self.text_input_column = text_input_column
     self.meta_input_list = meta_input_list
     self.vocab_size = vocab_size
     self.seq_size = seq_size
     self.embedding_dim = embedding_dim
     self.loss = loss
     self.activation = activation
     self.batch_size = batch_size
     self.n_epochs = n_epochs
     self.bert_model = bert_model
     self.nb_labels = 0
     self.nb_meta_features = 0
     self.vocabulary = []
     self.vocabulary_dict = {}
Example no. 22
    def __init__(self,
                 from_pretrained=None,
                 tokenizer="allegro/herbert-klej-cased-tokenizer-v1",
                 embed_model="allegro/herbert-klej-cased-v1"):
        super().__init__()

        self.tokenizer = XLMTokenizer.from_pretrained(tokenizer)
        self.embed_model = RobertaModel.from_pretrained(embed_model,
                                                        return_dict=True)

        self.fc = nn.Sequential(nn.Dropout(0.5), nn.Linear(768, 256),
                                nn.LeakyReLU(), nn.Linear(256, 16),
                                nn.LeakyReLU(), nn.Linear(16, 1), nn.Tanh())

        if from_pretrained is not None:
            f = io.BytesIO(
                importlib.resources.read_binary(trained_models,
                                                f'{from_pretrained}.pth'))
            self.fc.load_state_dict(torch.load(f))
            self.eval()
Example no. 23
def get_embedding_for_list_of_texts(
    list_of_texts: List[str], ) -> (torch.tensor, torch.tensor):
    """
    For a given list of sentences, the function returns the embeddings generated by BERT.
    :param list_of_texts: sentences for which you want to get embeddings
    :return: (tensor of embeddings for each token in the sentences, average embedding per sentence)
    """
    tokenizer = XLMTokenizer.from_pretrained(
        join(dirname(realpath(__file__)), "models", "tokenizer"))
    bert_model = RobertaModel.from_pretrained(
        join(dirname(realpath(__file__)), "models", "bert"))

    emote_to_text = {}
    with open(join(dirname(realpath(__file__)), "emote_to_text.json"),
              encoding='utf8') as file:
        emote_to_text = json.load(file)

    list_of_texts = starmap(
        _replace_emotes_with_text,
        zip(list_of_texts, [emote_to_text] * len(list_of_texts)))

    list_of_texts = map(_remove_urls_from_text, list_of_texts)

    list_of_sentence_embeddings = []
    list_of_sequence_embeddings = []

    for text in list_of_texts:
        encoded_input = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        outputs = bert_model(encoded_input)

        sequence_tokens_embedding = outputs[0].squeeze(dim=0)
        sentence_embedding = outputs[1].squeeze(dim=0)

        list_of_sequence_embeddings.append(sequence_tokens_embedding)
        list_of_sentence_embeddings.append(sentence_embedding)

    seq_embeddings_tensor = merge(list_of_sequence_embeddings)
    sentence_embeddings_tensor = torch.stack(list_of_sentence_embeddings,
                                             dim=0)

    return seq_embeddings_tensor, sentence_embeddings_tensor
Example no. 24
def build_tokenizer(model, add_cap_sign, textify_emoji, segment_hashtag, preprocess):
    if model == 'mbert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
    elif model == 'xlm':
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
    else:
        raise ValueError("Unsupported model: {}".format(model))

    tokenizer.add_tokens(['@USER']) # All Transformers models

    if add_cap_sign:
        tokenizer.add_tokens(['<has_cap>', '<all_cap>'])
    if textify_emoji:
        tokenizer.add_tokens(['<emoji>', '</emoji>'])
    if segment_hashtag:
        tokenizer.add_tokens(['<hashtag>', '</hashtag>'])

    #tokenizer.add_tokens([w.strip() for w in open('../resources/log_odds.txt').readlines()])

    # TODO: this is not saved when calling `save_pretrained`
    if preprocess is not None:
        tokenizer.tokenize = compose(preprocess, tokenizer.tokenize)

    return tokenizer
Example no. 25
    def __init__(self, device):
        super().__init__()

        self.net = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 2),
            # nn.Linear(768, 2),
            nn.Tanh()
        )
        self.device = device
        self.tokenizer = XLMTokenizer.from_pretrained(
            # "models/politicalBERT")
            # "models/politicalHerBERT")
            "allegro/herbert-klej-cased-tokenizer-v1")
        self.model = RobertaModel.from_pretrained(
            # "models/politicalBERT",
            "models/politicalHerBERT",
            # "allegro/herbert-klej-cased-v1",
            return_dict=True)
Example no. 26
rw_vocab = get_vocab(filename, 10000)

filename2 = "SUBTLEX-US frequency list with PoS information text version.txt"
pos_dict = get_pos_dict(filename2)

GPT2 = ModelInfo(GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True),
                 GPT2Tokenizer.from_pretrained('gpt2'), "Ġ", vocab, "GPT2")

Roberta = ModelInfo(
    RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True),
    RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta")

XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024',
                                       return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert")

TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
                TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_",
                vocab, "TXL")

if __name__ == "__main__":
Example no. 27
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture

elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel

    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
    # XLM English-French model trained on the concatenation of English and French wikipedia

else:
    print('need to define LM from BERT, RoBERTa, or XLM')

print(model)

def freeze_layer_fun(freeze_layer):
    for name, param in model.named_parameters():
        if freeze_layer in name:
            print(name)
            param.requires_grad = False
        else:
Example no. 28
def get_model_and_tokenizer(
    model_name, device="cpu", random_weights=False, model_path=None
):
    """
    model_path: if given, initialize from path instead of official repo
    models typically cached in ~/.cache/torch/transformers/

    """

    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define QARiB https://huggingface.co/qarib/bert-base-qarib
    elif model_name.startswith("qarib"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define AraBERT https://huggingface.co/aubmindlab/bert-base-arabert
    elif model_name.startswith("aubmindlab"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define ArabicBERT  https://huggingface.co/asafaya/bert-base-arabic
    elif model_name.startswith("asafaya"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    #Define https://huggingface.co/UBC-NLP/MARBERT
    elif model_name.startswith("UBC-NLP"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("bert-base-multilingual"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"

    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True
        ).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        model = RobertaModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep
Example no. 29
from transformers import XLMTokenizer, RobertaModel
from sentimentpl.models import SentimentPLModel
import os
import pandas as pd
import pickle
import numpy as np
import tqdm
import torch

device = torch.device("cuda")

tokenizer = XLMTokenizer.from_pretrained("models/politicalBERT")
model = RobertaModel.from_pretrained("models/politicalBERT", return_dict=True)
model = model.to(device)


def text2vec(text):
    # Move the encoding to the model's device and bring the result back to
    # the CPU before converting to numpy, as texts2vec below already does.
    encoded = tokenizer.encode(text, return_tensors='pt').to(device)
    return model(encoded)['pooler_output'].detach().cpu().numpy()[0]


def texts2vec(text):
    encoded = tokenizer(text, return_tensors='pt', padding=True)
    encoded = {k: v.to(device) for k, v in encoded.items()}
    output = model(**encoded)
    return output['pooler_output'].detach().cpu().numpy(
    ), output['last_hidden_state'].detach().cpu().numpy(),


sentiment_model = SentimentPLModel(from_pretrained='latest').cuda()
Example no. 30
def test_xlm_embeddings():
    xlm_model: str = "xlm-mlm-en-2048"

    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #    0      1             2           3            4          5         6         7         8       9      10        11       12         13        14
    #
    #   <s>  'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>', '</s>
    #           |             |           |            |          |         |         |         \      |      /          |         |          |
    #         Berlin         and        Munich        have        a        lot        of           puppeteer             to       see         .
    #
    #           0             1           2            3          4         5          6               7                  8        9          10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = XLMEmbeddings(
            pretrained_model_name_or_path=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size