Example #1
 def __init__(self, auto_model: str, auto_path: str):
     super().__init__()
     if "camembert" in auto_model:
         from transformers import CamembertModel, CamembertTokenizer
         self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
         self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
     elif "flaubert" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
         self.auto_tokenizer.do_lowercase_and_remove_accent = False
     elif "xlm" in auto_model:
         from transformers import XLMModel, XLMTokenizer
         self.auto_embeddings = XLMModel.from_pretrained(auto_path)
         self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
     elif "bert" in auto_model:
         from transformers import BertModel, BertTokenizer
         self.auto_embeddings = BertModel.from_pretrained(auto_path)
         self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
     else:
         from transformers import AutoModel, AutoTokenizer, XLMTokenizer
         self.auto_embeddings = AutoModel.from_pretrained(auto_path)
         self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
         if isinstance(self.auto_tokenizer, XLMTokenizer):
             self.auto_tokenizer.do_lowercase_and_remove_accent = False
     for param in self.auto_embeddings.parameters():
         param.requires_grad = False
     self._is_fixed = True
     self._output_dim = self.auto_embeddings.config.hidden_size
     self._begin_special_token_count = self.get_begin_special_token_count()
     self._padding_id = self.auto_tokenizer.pad_token_id
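A minimal standalone sketch of the same dispatch-and-freeze pattern, assuming only the transformers package; the helper name and return values are illustrative, not part of the original class:

from transformers import AutoModel, AutoTokenizer, XLMTokenizer

def load_frozen_encoder(auto_path: str):
    # Load any encoder and its tokenizer, then freeze the encoder,
    # mirroring the logic of the snippet above.
    embeddings = AutoModel.from_pretrained(auto_path)
    tokenizer = AutoTokenizer.from_pretrained(auto_path)
    if isinstance(tokenizer, XLMTokenizer):
        # XLMTokenizer lowercases and strips accents unless told otherwise.
        tokenizer.do_lowercase_and_remove_accent = False
    for param in embeddings.parameters():
        param.requires_grad = False
    return embeddings, tokenizer, embeddings.config.hidden_size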
Example #2
    def __init__(self,
                 vocabs: Dict[str, Vocabulary],
                 config: Config,
                 pre_load_model: bool = True):
        super().__init__(config=config)

        if pre_load_model:
            self.xlm = XLMModel.from_pretrained(self.config.model_name,
                                                output_hidden_states=True)
        else:
            xlm_config = XLMConfig.from_pretrained(self.config.model_name,
                                                   output_hidden_states=True)
            self.xlm = XLMModel(xlm_config)

        self.source_lang_id = self.xlm.config.lang2id.get(
            self.config.source_language)
        self.target_lang_id = self.xlm.config.lang2id.get(
            self.config.target_language)

        if None in (self.source_lang_id, self.target_lang_id):
            raise ValueError(
                f'Invalid lang_id for XLM model.'
                f' Valid ids are: {self.xlm.config.lang2id.keys()}')

        self.mlp = None
        if self.config.use_mlp:
            self.mlp = nn.Sequential(
                nn.Linear(self.xlm.config.hidden_size,
                          self.config.hidden_size),
                nn.Tanh(),
            )
            output_size = self.config.hidden_size
        else:
            output_size = self.xlm.config.hidden_size

        self._sizes = {
            const.TARGET: output_size,
            const.TARGET_LOGITS: output_size,
            const.TARGET_SENTENCE: 2 * output_size,
            const.SOURCE: output_size,
            const.SOURCE_LOGITS: output_size,
        }

        self.vocabs = {
            const.TARGET: vocabs[const.TARGET],
            const.SOURCE: vocabs[const.SOURCE],
        }

        self.output_embeddings = self.xlm.embeddings

        if self.config.freeze:
            for param in self.xlm.parameters():
                param.requires_grad = False
Example #3
def get_attentions():
    model_name = request.args.get('model')
    source = request.args.get('source')
    target = request.args.get('target')

    if model_name == 'XLM':
        model_version = 'xlm-mlm-ende-1024'
        model = XLMModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = XLMTokenizer.from_pretrained(model_version)
    elif model_name == 'GPT-2':
        model_version = 'gpt2'
        model = GPT2Model.from_pretrained(model_version, output_attentions=True)
        tokenizer = GPT2Tokenizer.from_pretrained(model_version)
    else:
        # BERT
        model_version = 'bert-base-uncased'
        model = BertModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)

    inputs = tokenizer.encode_plus(source, target, return_tensors='pt', add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    attention = model(input_ids, token_type_ids=token_type_ids)[-1]
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    return {'attention': format_attention(attention)[0].tolist(), 'source': tokens, 'target': tokens}
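For reference, a self-contained sketch of the BERT branch above run outside the request handler (the sentence pair is illustrative); with output_attentions=True the last element of the model output is a tuple with one tensor per layer of shape (batch, heads, seq_len, seq_len):

from transformers import BertModel, BertTokenizer

model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

inputs = tokenizer.encode_plus('The cat sat on the mat.', 'It was tired.',
                               return_tensors='pt', add_special_tokens=True)
# The last element of the model output is the per-layer attention tuple.
attention = model(inputs['input_ids'], token_type_ids=inputs['token_type_ids'])[-1]
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
print(len(attention), attention[0].shape, tokens[:5])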
Example #4
 def __init__(self, config):
     super().__init__(config)
     self.num_labels = config.num_labels
     self.transformer = XLMModel(config)
     self.classifier = nn.Linear(config.hidden_size, config.num_labels)
     self.init_weights()
     self.dropout = nn.Dropout(0.1)
Example #5
def get_transformers_model(
    settings: Dict[str, Any],
    model_name: str,
    pretrained: bool = True,
    ckptdir: Optional[Path] = None,
) -> PreTrainedModel:
    model_path = model_name if pretrained else str(ckptdir)
    config = AutoConfig.from_pretrained(model_path)
    config.attention_probs_dropout_prob = settings.get(
        'encoder_attn_dropout_rate', 0.1)
    config.hidden_dropout_prob = settings.get('encoder_ffn_dropout_rate', 0.1)
    config.layer_norm_eps = settings.get('layer_norm_eps', 1e-5)

    if pretrained:
        model = AutoModel.from_pretrained(model_name, config=config)
        return model

    # If you want only the model structure (no pretrained parameters),
    # the concrete model class is needed for each architecture.
    # Note: this 'xlm' check also matches 'xlm-roberta-*' model names.
    if 'xlm' in model_name:
        model = XLMModel(config=config)
    elif 'albert' in model_name:
        model = AlbertModel(config=config)
    elif 'roberta' in model_name:
        model = RobertaModel(config=config)
    elif 'deberta-v2' in model_name:
        model = DebertaV2Model(config=config)
    elif 'deberta' in model_name:
        model = DebertaModel(config=config)
    elif 'bert' in model_name:
        model = BertModel(config=config)
    elif 'electra' in model_name:
        model = ElectraModel(config=config)
    else:
        model = BertModel(config=config)
    return model
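A hypothetical call, assuming the imports the snippet relies on (AutoConfig, AutoModel and the concrete model classes) are in scope; the settings keys are the ones read above:

settings = {
    'encoder_attn_dropout_rate': 0.1,
    'encoder_ffn_dropout_rate': 0.1,
    'layer_norm_eps': 1e-5,
}
# Pretrained weights with overridden dropout / layer-norm settings.
model = get_transformers_model(settings, 'bert-base-uncased', pretrained=True)
print(type(model).__name__, model.config.hidden_dropout_prob)

# Architecture only (randomly initialized), reading the config from a local
# checkpoint directory; 'ckpt/' is illustrative and must contain a config.json.
# model = get_transformers_model(settings, 'bert-base-uncased',
#                                pretrained=False, ckptdir=Path('ckpt/'))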
Example #6
def xlm_model():
    config = XLMConfig(
        vocab_size=93000,
        emb_dim=32,
        n_layers=5,
        n_heads=4,
        dropout=0.1,
        max_position_embeddings=512,
        lang2id={
            "ar": 0,
            "bg": 1,
            "de": 2,
            "el": 3,
            "en": 4,
            "es": 5,
            "fr": 6,
            "hi": 7,
            "ru": 8,
            "sw": 9,
            "th": 10,
            "tr": 11,
            "ur": 12,
            "vi": 13,
            "zh": 14,
        },
    )
    return XLMModel(config=config)
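A short forward-pass sketch using the fixture above (dummy inputs; language id 4 corresponds to "en" in the config):

import torch

model = xlm_model()
model.eval()

# Dummy batch: 2 sequences of length 7, avoiding the special token ids.
input_ids = torch.randint(5, 93000, (2, 7))
langs = torch.full_like(input_ids, 4)     # 4 == "en" in lang2id above
lengths = torch.tensor([7, 7])
with torch.no_grad():
    outputs = model(input_ids, lengths=lengths, langs=langs)
print(outputs[0].shape)                   # torch.Size([2, 7, 32]) -- emb_dim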
Example #7
def test_model(modelname):
    model, log = XLMModel.from_pretrained(modelname, output_loading_info=True)
    tokenizer = XLMTokenizer.from_pretrained(modelname, do_lower_case=False)

    # this line is important: by default, XLMTokenizer removes diacritics, even with do_lower_case=False flag
    tokenizer.do_lowercase_and_remove_accent = False
    print("Dictionary values must be empty lists:")
    print(log)
Example #8
    def __init__(self, config):
        super(XLMForMultiLabelSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLMModel(config)
        self.sequence_summary = SequenceSummary(config)

        self.init_weights()
Example #9
 def create_and_check_xlm_model(self, config, input_ids, token_type_ids,
                                input_lengths, sequence_labels,
                                token_labels, is_impossible_labels,
                                input_mask):
     model = XLMModel(config=config)
     model.eval()
     outputs = model(input_ids,
                     lengths=input_lengths,
                     langs=token_type_ids)
     outputs = model(input_ids, langs=token_type_ids)
     outputs = model(input_ids)
     sequence_output = outputs[0]
     result = {
         "sequence_output": sequence_output,
     }
     self.parent.assertListEqual(
         list(result["sequence_output"].size()),
         [self.batch_size, self.seq_length, self.hidden_size])
Example #10
def get_model_and_tokenizer(model_name, device, random_weights=False):

    if model_name.startswith('xlnet'):
        model = XLNetModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        sep = u'▁'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('gpt2'):
        model = GPT2Model.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        sizes = {
            "gpt2": 768,
            "gpt2-medium": 1024,
            "gpt2-large": 1280,
            "gpt2-xl": 1600
        }
        emb_dim = sizes[model_name]
    elif model_name.startswith('xlm'):
        model = XLMModel.from_pretrained(model_name,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(model_name)
        sep = '</w>'
        emb_dim = model.config.emb_dim  # otherwise emb_dim is undefined for XLM
    elif model_name.startswith('bert'):
        model = BertModel.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('distilbert'):
        model = DistilBertModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 768
    elif model_name.startswith('roberta'):
        model = RobertaModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        emb_dim = 1024 if "large" in model_name else 768
    else:
        print('Unrecognized model name:', model_name)
        sys.exit()

    if random_weights:
        print('Randomizing weights')
        model.init_weights()

    return model, tokenizer, sep, emb_dim
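Illustrative usage (the model name is an example; weights are downloaded on first call):

model, tokenizer, sep, emb_dim = get_model_and_tokenizer('bert-base-uncased', device='cpu')
print(emb_dim, sep)   # 768 '##'

ids = tokenizer.encode('puppeteer', add_special_tokens=False)
print(tokenizer.convert_ids_to_tokens(ids))   # subword pieces marked with '##'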
Example #11
 def init(args):
     BERTTool.multi_bert = XLMModel.from_pretrained(
         args.multi_bert.location)
     BERTTool.multi_tokener = XLMTokenizer.from_pretrained(
         args.multi_bert.location)
     BERTTool.multi_pad = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["<pad>"])[0]
     BERTTool.multi_sep = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["</s>"])[0]
     BERTTool.multi_cls = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["<s>"])[0]
Example #12
def get_model_and_tokenizer(model_name,
                            device="cpu",
                            random_weights=False,
                            model_path=None):
    """
    model_path: if given, initialize from path instead of official repo
    """

    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        model = RobertaModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep
Example #13
 def __init__(self, model_type):
     """Constructor
     :param model_type: which model is used, xlm or mbert
     """
     if model_type == 'xlm':
         self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
         model = XLMModel.from_pretrained('xlm-mlm-100-1280')
         self.embeddings = model.embeddings.weight
     elif model_type == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
         model = BertModel.from_pretrained('bert-base-multilingual-uncased')
         self.embeddings = model.embeddings.word_embeddings.weight
     self.emb_dim = self.embeddings.shape[1]
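Assuming the class above is called StaticEmbedder (the snippet does not show its name), a usage sketch that looks up the static embedding vectors of a word's subword pieces:

# Hypothetical class name; only the constructor is shown above.
embedder = StaticEmbedder('bert')

tokens = embedder.tokenizer.tokenize('puppeteer')
ids = embedder.tokenizer.convert_tokens_to_ids(tokens)
vectors = embedder.embeddings[ids]          # one row per subword piece
print(vectors.shape, embedder.emb_dim)      # (num_pieces, emb_dim)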
Example #14
def build_model(model, time_pooling, layer_pooling, layer, new_num_tokens,
                device, **kwargs):
    n_class = 2
    if model == 'mbert':
        base_model = BertModel.from_pretrained('bert-base-multilingual-uncased',
                                               output_hidden_states=True, **kwargs)
    elif model == 'xlm':
        base_model = XLMModel.from_pretrained('xlm-mlm-100-1280',
                                              output_hidden_states=True, **kwargs)
    else:
        raise ValueError(f"Unsupported model '{model}': expected 'mbert' or 'xlm'")
    base_model.resize_token_embeddings(new_num_tokens)  # all transformers models

    model = PoolClassifier(base_model, n_class, time_pooling, layer_pooling, layer)
    return model.to(device)
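The new_num_tokens argument typically comes from a tokenizer that gained special tokens, as in the sketch below (pooling values are illustrative; PoolClassifier is defined elsewhere in the original project):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokenizer.add_special_tokens({'additional_special_tokens': ['[USER]', '[SYSTEM]']})

model = build_model('mbert', time_pooling='mean', layer_pooling='last',
                    layer=-1, new_num_tokens=len(tokenizer), device='cpu')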
Example #15
    def __init__(self, config):
        # Initialize both parent classes with the instance, not the class.
        BertPreTrainedModel.__init__(self, config)
        XLMPreTrainedModel.__init__(self, config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.transformer = XLMModel(config)
        self.dropout = nn.Dropout(0.1)

        # Both encoders are built from the same config here, so the
        # concatenated BERT + XLM features have size 2 * hidden_size.
        self.classifier = nn.Linear(2 * config.hidden_size, config.num_labels)
        self.init_weights()
Example #16
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels_list

        self.transformer = XLMModel(config)
        self.dropout = nn.Dropout(config.dropout)

        self.pooler = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size), nn.Tanh())
        self.classifiers = nn.ModuleList([
            nn.Linear(config.hidden_size, num_label)
            for num_label in self.num_labels
        ])

        self.init_weights()
Example #17
 def __init__(self, model_type):
     """Constructor
     :param model_type: whether an xlm or a bert model is used
     """
     # Instantiate model and tokenizers from pre-trained multilingual versions
     if model_type == 'xlm':
         self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
         self.model = XLMModel.from_pretrained('xlm-mlm-100-1280',
                                               output_hidden_states=True)
     elif model_type == 'bert':
         self.tokenizer = BertTokenizer.from_pretrained(
             'bert-base-multilingual-uncased')
         self.model = BertModel.from_pretrained(
             'bert-base-multilingual-uncased', output_hidden_states=True)
     else:
         raise ValueError(
             'Unrecognized model type. Only bert and xlm supported')
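A usage sketch, assuming the class above is named MultilingualEncoder (the snippet omits its name); with output_hidden_states=True the last element of the model output is a tuple holding the embedding layer plus one tensor per transformer layer:

import torch

encoder = MultilingualEncoder('bert')   # hypothetical class name
ids = torch.tensor([encoder.tokenizer.encode('Hello world', add_special_tokens=True)])
with torch.no_grad():
    hidden_states = encoder.model(ids)[-1]
print(len(hidden_states), hidden_states[-1].shape)   # 13 tensors for a base model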
Example #18
    def __init__(self, config: Munch = None, **kwargs):
        """ Initialize a new XLM synapse module.

        Args:
            config (:obj:`munch.Munch`, `required`): 
                    munched config class.
        """
        super(XLMSynapse, self).__init__(config=config, **kwargs)
        if config is None:
            config = XLMSynapse.default_config()
        bittensor.config.Config.update_with_kwargs(config.synapse, kwargs)
        XLMSynapse.check_config(config)
        self.config = config

        # Build config.
        xlm_config = XLMConfig(
            vocab_size=bittensor.__vocab_size__,
            emb_dim=bittensor.__network_dim__,
            n_layers=config.synapse.n_layers,
            n_heads=config.synapse.n_heads,
            # More needed
        )

        # model layer: encodes tokenized sequences to network dim.
        self.xlm = XLMModel(xlm_config)

        # pooler layer: pools the hidden units for use by the pkm dendrite rpc query.
        self.pooler = XLMPooler(xlm_config)

        # router: (PKM layer) queries network using embeddings as context
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # hidden layer: transforms context and encoding to network dimension hidden units.
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # target layer: maps from hidden layer to vocab dimension for each token.
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__vocab_size__,
                                      bias=False)

        # Loss function
        self.loss_fct = nn.CrossEntropyLoss()

        self.to(self.device)
Example #19
def xlm_convert_to_huggingface(args):
   """
   Given a Facebook XLM model checkpoint and a BPE merges file, create and save
   a HuggingFace XLMTokenizer and an XLMModel.
   """
   xlm_pth = torch.load(args.checkpoint, map_location=torch.device('cpu'))

   with NamedTemporaryFile() as tfile:
      tfile.write(b'{}')
      tfile.flush()
      tokenizer = XLMTokenizer(
         tfile.name,
         args.merges,
         do_lowercase_and_remove_accent=False)
   tokenizer.encoder = convert_vocab(xlm_pth['dico_word2id'])
   vocab_size = len(tokenizer)
      
   params = xlm_pth['params']
   xlm_config = XLMConfig(
      emb_dim=params['emb_dim'],
      vocab_size=params['n_words'],
      n_layers=params['n_layers'],
      n_heads=params['n_heads'],
      n_langs=params['n_langs'],
      sinusoidal_embeddings=params['sinusoidal_embeddings'],
      use_lang_emb=params['use_lang_emb'],
      is_encoder=params['encoder_only'],
      output_hidden_states=True,
      n_words=params['n_words'],
   )
   
   # Provide both config and state dict to model init
   model = XLMModel.from_pretrained(
      None,
      config=xlm_config,
      state_dict=xlm_pth['model'])

   # Save
   save_directory = Path(args.output_dir)
   if not save_directory.exists():
      save_directory.mkdir(parents=True, exist_ok=True)
   model.save_pretrained(str(save_directory))
   tokenizer.save_pretrained(str(save_directory))
   tokenizer.save_vocabulary(str(save_directory))
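After conversion, the saved directory can be reloaded like any other checkpoint (the directory name is illustrative):

from transformers import XLMModel, XLMTokenizer

model = XLMModel.from_pretrained('converted_xlm/')
tokenizer = XLMTokenizer.from_pretrained('converted_xlm/')
print(model.config.emb_dim, len(tokenizer))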
Example #20
 def create_and_check_xlm_model(
     self,
     config,
     input_ids,
     token_type_ids,
     input_lengths,
     sequence_labels,
     token_labels,
     is_impossible_labels,
     choice_labels,
     input_mask,
 ):
     model = XLMModel(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, lengths=input_lengths, langs=token_type_ids)
     result = model(input_ids, langs=token_type_ids)
     result = model(input_ids)
     self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
Example #21
 def test_model_from_pretrained(self):
     for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
         model = XLMModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
Example #22
def get_model_and_tokenizer(
    model_name, device="cpu", random_weights=False, model_path=None
):
    """
    model_path: if given, initialize from path instead of official repo
    models typically cached in ~/.cache/torch/transformers/

    """

    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define QARiB https://huggingface.co/qarib/bert-base-qarib
    elif model_name.startswith("qarib"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define AraBERT https://huggingface.co/aubmindlab/bert-base-arabert
    elif model_name.startswith("aubmindlab"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    # Define ArabicBERT  https://huggingface.co/asafaya/bert-base-arabic
    elif model_name.startswith("asafaya"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    #Define https://huggingface.co/UBC-NLP/MARBERT
    elif model_name.startswith("UBC-NLP"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("bert-base-multilingual"):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"

    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True
        ).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        model = RobertaModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep
Example #23
 def test_model_from_pretrained(self):
     for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         model = XLMModel.from_pretrained(model_name)
         self.assertIsNotNone(model)
Example #24
def test_xlm_embeddings():
    xlm_model: str = "xlm-mlm-en-2048"

    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #    0      1             2           3            4          5         6         7         8       9      10        11       12         13        14
    #
    #   <s>  'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>', '</s>
    #           |             |           |            |          |         |         |         \      |      /          |         |          |
    #         Berlin         and        Munich        have        a        lot        of           puppeteer             to       see         .
    #
    #           0             1           2            3          4         5          6               7                  8        9          10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = XLMEmbeddings(
            pretrained_model_name_or_path=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
Example #25
 def test_model_from_pretrained(self):
     cache_dir = "/tmp/transformers_test/"
     for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
         shutil.rmtree(cache_dir)
         self.assertIsNotNone(model)
Example #26
def get_embedding(type_embedding, data):
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split(
            '_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            data = [x.lower() for x in data]
            path = 'bert-base-multilingual-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    #########
    #ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        #load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        #load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path,
                                                output_hidden_states=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        #load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        #load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True)

    # Set the device to GPU (cuda) if available, otherwise stick with CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    list_of_four_last_embeddings = []
    list_of_mean = []

    for l in data:
        # Convert the sentence to tokenized vocabulary IDs
        input_ids = tokenizer.encode(l)
        #print(input_ids)
        # Convert the list of IDs to a tensor of IDs
        input_ids = torch.LongTensor(input_ids)
        #print(input_ids)
        model = model.to(device)
        input_ids = input_ids.to(device)
        #print(input_ids)
        model.eval()

        # unsqueeze IDs to get batch size of 1 as added dimension
        input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            out = model(input_ids=input_ids)

        # we only want the hidden_states
        if type_embedding == 'xlm':
            hidden_states = out[1]
        else:
            hidden_states = out[2]
        #mean of layers
        sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        list_of_mean.append(sentence_embedding.tolist())

        # get last four layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cast layers to a tuple and concatenate over the last dimension
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)

        # take the mean of the concatenated vector over the token dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        list_of_four_last_embeddings.append(cat_sentence_embedding.tolist())

    #print('list of four last embeddings', np.array(list_of_four_last_embeddings).shape)
    #print('list of mean', np.array(list_of_mean).shape)

    return list_of_mean, list_of_four_last_embeddings
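An illustrative call using the English BERT option (the Portuguese paths above are machine-specific); each returned list has one vector per input sentence:

sentences = ['a short example sentence', 'another sentence']
list_of_mean, list_of_four_last = get_embedding('en_bert_base_uncased', sentences)

print(len(list_of_mean), len(list_of_mean[0]))   # 2, 768  (mean of the last layer)
print(len(list_of_four_last[0]))                 # 3072    (4 layers concatenated)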
Example #27
def generate_embedding(objectives,
                       model_name,
                       batch_size=100,
                       output_attention=False):
    """
    Takes in a pandas dataframe and generates embeddings for the text column using the hugging face implemented models
    - Inputs:
        pd_dataframe (pandas dataframe): The dataframe containing all text column and their ids
        model_name (str): name of the model to be used for generating embeddings
        batch_size (int): batch size to use when generating embeddings for sentences

    - Output:
        sentence_embedding (tensor): tensor of shape n by 1024 where n is the number of sentence

    """

    if model_name == "bert":
        # Load pre-trained bert model (weights)
        model = BertModel.from_pretrained("bert-base-uncased",
                                          output_attentions=output_attention)
    elif model_name == "xlnet":
        # Load pre-trained xlnet model (weights)
        model = XLNetModel.from_pretrained("xlnet-base-cased",
                                           output_attentions=output_attention)
    elif model_name == "xlm":
        # Load pre-trained xlm model (weights)
        model = XLMModel.from_pretrained("xlm-mlm-en-2048",
                                         output_attentions=output_attention)
    elif model_name == "electra":
        # Load pre-trained electra model (weights)
        model = ElectraModel.from_pretrained(
            "google/electra-small-discriminator",
            output_attentions=output_attention)
    elif model_name == "albert":
        # Load pre-trained albert model (weights)
        model = AlbertForMaskedLM.from_pretrained(
            "albert-base-v2", output_attentions=output_attention)
    else:
        print(
            "Please select an implemented model name. {} doesn't exist".format(
                model_name))
        return

    sentences_per_batch = batch_size

    # setting up the device
    if torch.cuda.is_available():
        dev = "cuda:0"
    else:
        dev = "cpu"
    device = torch.device(dev)
    print("using ", device)

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    model.to(device)
    num_sentences = len(objectives)
    sentence_embedding = []
    attention_layers = None

    if num_sentences > sentences_per_batch:
        num_batches = num_sentences // sentences_per_batch

        for i in range(num_batches):
            start = i * sentences_per_batch
            end = (i + 1) * sentences_per_batch
            if i == num_batches - 1:
                end = num_sentences
            mini_objective = list(objectives[start:end])

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor([mini_objective]).squeeze()
            tokens_tensor = tokens_tensor.to(device)

            # Predict hidden states features for each layer
            with torch.no_grad():
                encoded_layers = model(tokens_tensor)

            # taking embeddings of the last layer.
            # `token_vecs` is a tensor with shape [n x k x hidden_size]
            token_vecs = encoded_layers[0]

            # take the vector corresponding to the [CLS] token if the model has one.
            if model_name in ["bert", "albert", "electra"]:
                sentence_embedding += token_vecs[:, 0, :].tolist()
            # for models without a [CLS] token, average all k token vectors and add them to the main list
            else:
                sentence_embedding += torch.mean(token_vecs, dim=1).tolist()
            if output_attention is True:
                attention_layer = [al.tolist() for al in encoded_layers[-1]]
                attention_layer = np.array(attention_layer)
                if attention_layers is None:
                    attention_layers = attention_layer
                else:
                    attention_layers = np.concatenate(
                        (attention_layers, attention_layer), axis=1)

            print("Embedding for batch {} out of {} batches Completed.".format(
                i, num_batches))
    else:
        # Convert inputs to PyTorch tensors

        tokens_tensor = torch.tensor([objectives]).squeeze()
        tokens_tensor = tokens_tensor.to(device)

        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers = model(tokens_tensor)

        # taking embeddings of the last layer.
        # `token_vecs` is a tensor with shape [n x k x hidden_size]
        token_vecs = encoded_layers[0]

        # take the vector corresponding to the [CLS] token if the model has one.
        if model_name in ["bert", "albert", "electra"]:
            sentence_embedding = token_vecs[:, 0, :].tolist()
        # for models without a [CLS] token, average all k token vectors and use that as the embedding
        else:
            sentence_embedding = torch.mean(token_vecs, dim=1).tolist()

        if output_attention is True:
            attention_layers = [al.tolist() for al in encoded_layers[-1]]
            attention_layers = np.array(attention_layers)

    print(
        "Our final sentence embedding vector of shape:",
        len(sentence_embedding),
        len(sentence_embedding[0]),
    )
    if output_attention:
        print("And the corresponding attention vector of shape:",
              attention_layers.shape)
    return sentence_embedding, attention_layers
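The function expects objectives to be equal-length sequences of token ids, so pad or truncate during tokenization; a hedged sketch (tokenizer and lengths are illustrative):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Older transformers versions use pad_to_max_length=True instead of padding=.
objectives = [
    tokenizer.encode(text, max_length=32, padding='max_length', truncation=True)
    for text in ['first objective text', 'second objective text']
]

embeddings, attentions = generate_embedding(objectives, 'bert', batch_size=100)
print(len(embeddings), len(embeddings[0]))   # 2 sentences, 768-dim [CLS] vectors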
Example #28
class XLMEncoder(MetaModule):
    """XLM model using Hugging Face's transformers library.

    The following command was used to fine-tune XLM on the in-domain data (retrieved
    from .pth file)::

        python train.py --exp_name tlm_clm --dump_path './dumped/' \
            --data_path '/mnt/shared/datasets/kiwi/parallel/en_de_indomain' \
            --lgs 'ar-bg-de-el-en-es-fr-hi-ru-sw-th-tr-ur-vi-zh' \
            --clm_steps 'en-de,de-en' --mlm_steps 'en-de,de-en' \
            --reload_model 'models/mlm_tlm_xnli15_1024.pth' --encoder_only True \
            --emb_dim 1024 --n_layers 12 --n_heads 8 --dropout '0.1' \
            --attention_dropout '0.1' --gelu_activation true --batch_size 32 \
            --bptt 256 --optimizer
            'adam_inverse_sqrt,beta1=0.9,beta2=0.98,lr=0.0001,weight_decay=0' \
            --epoch_size 200000 --validation_metrics _valid_mlm_ppl --max_vocab 95000 \
            --tokens_per_batch 1200 --exp_id "5114"

    Old version was converted using hf-transformers util method::

        convert_xlm_checkpoint_to_pytorch(
            self.config.model_name / 'indomain.pth',
            self.config.model_name / 'finetuned_wmt_en-de'
        )

    Old settings in QE not really used for the best run and submission:

    .. code-block:: yaml

        fb-causal-lambda: 0.0
        fb-keep-prob: 0.1
        fb-mask-prob: 0.8
        fb-model: data/trained_models/fb_pretrain/xnli/indomain.pth
        fb-pred-prob: 0.15
        fb-rand-prob: 0.1
        fb-src-lang: en
        fb-tgt-lang: de
        fb-tlm-lambda: 0.0
        fb-vocab: data/trained_models/fb_pretrain/xnli/vocab_xnli_15.txt

    """
    class Config(BaseConfig):
        encode_source: bool = False

        model_name: Union[str, Path] = 'xlm-mlm-tlm-xnli15-1024'
        """Pre-trained XLM model to use."""

        source_language: str = 'en'
        target_language: str = 'de'

        use_mismatch_features: bool = False
        """Use Alibaba's mismatch features."""

        use_predictor_features: bool = False
        """Use features originally proposed in the Predictor model."""

        interleave_input: bool = False
        """Concatenate SOURCE and TARGET without internal padding
        (111222000 instead of 111002220)"""

        freeze: bool = False
        """Freeze XLM during training."""

        use_mlp: bool = True
        """Apply a linear layer on top of XLM."""

        hidden_size: int = 100
        """Size of the linear layer on top of XLM."""
        @validator('model_name', pre=True)
        def fix_relative_path(cls, v):
            if v not in XLM_PRETRAINED_MODEL_ARCHIVE_LIST:
                v = Path(v)
                if not v.is_absolute():
                    v = Path.cwd().joinpath(v)
            return v

        @validator('use_mismatch_features', 'use_predictor_features', pre=True)
        def no_implementation(cls, v):
            if v:
                raise NotImplementedError('Not yet implemented')
            return False

    def __init__(self,
                 vocabs: Dict[str, Vocabulary],
                 config: Config,
                 pre_load_model: bool = True):
        super().__init__(config=config)

        if pre_load_model:
            self.xlm = XLMModel.from_pretrained(self.config.model_name,
                                                output_hidden_states=True)
        else:
            xlm_config = XLMConfig.from_pretrained(self.config.model_name,
                                                   output_hidden_states=True)
            self.xlm = XLMModel(xlm_config)

        self.source_lang_id = self.xlm.config.lang2id.get(
            self.config.source_language)
        self.target_lang_id = self.xlm.config.lang2id.get(
            self.config.target_language)

        if None in (self.source_lang_id, self.target_lang_id):
            raise ValueError(
                f'Invalid lang_id for XLM model.'
                f' Valid ids are: {self.xlm.config.lang2id.keys()}')

        self.mlp = None
        if self.config.use_mlp:
            self.mlp = nn.Sequential(
                nn.Linear(self.xlm.config.hidden_size,
                          self.config.hidden_size),
                nn.Tanh(),
            )
            output_size = self.config.hidden_size
        else:
            output_size = self.xlm.config.hidden_size

        self._sizes = {
            const.TARGET: output_size,
            const.TARGET_LOGITS: output_size,
            const.TARGET_SENTENCE: 2 * output_size,
            const.SOURCE: output_size,
            const.SOURCE_LOGITS: output_size,
        }

        self.vocabs = {
            const.TARGET: vocabs[const.TARGET],
            const.SOURCE: vocabs[const.SOURCE],
        }

        self.output_embeddings = self.xlm.embeddings

        if self.config.freeze:
            for param in self.xlm.parameters():
                param.requires_grad = False

    def load_state_dict(
        self,
        state_dict: Union[Dict[str, Tensor], Dict[str, Tensor]],
        strict: bool = True,
    ):
        try:
            keys = super().load_state_dict(state_dict, strict)
        except RuntimeError as e:
            if "position_ids" in str(e):
                # FIXME: hack to get around Transformers 3.1 breaking changes
                # https://github.com/huggingface/transformers/issues/6882
                self.xlm.embeddings._non_persistent_buffers_set.add(
                    'position_ids')
                keys = super().load_state_dict(state_dict, strict)
                self.xlm.embeddings._non_persistent_buffers_set.discard(
                    'position_ids')
            else:
                raise e
        return keys

    @classmethod
    def input_data_encoders(cls, config: Config):
        return {
            const.SOURCE: XLMTextEncoder(tokenizer_name=config.model_name),
            const.TARGET: XLMTextEncoder(tokenizer_name=config.model_name),
        }

    def size(self, field=None):
        if field:
            return self._sizes[field]
        return self._sizes

    def forward(
        self,
        batch_inputs,
        *args,
        include_target_logits=False,
        include_source_logits=False,
    ):
        # XLM gets its input as a concatenation of both embeddings
        # or as an interleaving of the inputs
        if self.config.interleave_input:
            merge_input_fn = self.interleave_input
        else:
            merge_input_fn = self.concat_input

        input_ids, _, attention_mask, position_ids, lang_ids = merge_input_fn(
            batch_a=batch_inputs[const.SOURCE],
            batch_b=batch_inputs[const.TARGET],
            pad_id=self.vocabs[const.TARGET].pad_id,
            lang_a=self.source_lang_id,
            lang_b=self.target_lang_id,
        )

        # encoded_layers also includes the embedding layer
        # encoded_layers[-1] is the last layer
        last_layer, encoded_layers = self.xlm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            langs=lang_ids,
            position_ids=position_ids,
        )

        # TODO: select one of these strategies via cli
        # TODO: get a BETTER strategy
        # features = sum(encoded_layers[-5:-1])
        # features = encoded_layers[-2]
        features = last_layer

        if self.config.use_mlp:
            features = self.mlp(features)

        # Build the feature dictionary to be returned to the system
        output_features = self.split_outputs(
            features,
            batch_inputs,
            interleaved=self.config.interleave_input,
            label_a=const.SOURCE,
            label_b=const.TARGET,
        )

        # Convert pieces to tokens
        output_features[const.TARGET] = pieces_to_tokens(
            output_features[const.TARGET], batch_inputs[const.TARGET])
        output_features[const.SOURCE] = pieces_to_tokens(
            output_features[const.SOURCE], batch_inputs[const.SOURCE])
        source_len = batch_inputs[const.SOURCE].bounds_lengths
        target_len = batch_inputs[const.TARGET].bounds_lengths

        # NOTE: assuming here that features is already split into target and source
        source_features = output_features[const.SOURCE]
        target_features = output_features[const.TARGET]

        # Sentence-level features
        sentence_target_features = target_features[:, 0].unsqueeze(
            1) + select_positions(target_features,
                                  (target_len - 1).unsqueeze(1))
        sentence_source_features = source_features[:, 0].unsqueeze(
            1) + select_positions(source_features,
                                  (source_len - 1).unsqueeze(1))
        sentence_features = torch.cat(
            (sentence_target_features, sentence_source_features), dim=-1)

        output_features[const.TARGET_SENTENCE] = sentence_features
        output_features[const.TARGET] = target_features
        output_features[const.SOURCE] = source_features

        # Logits for multi-task fine-tuning
        if include_target_logits:
            output_features[const.TARGET_LOGITS] = torch.einsum(
                'vh,bsh->bsv',
                self.output_embeddings.weight,
                output_features[const.TARGET],
            )
        if include_source_logits:
            output_features[const.SOURCE_LOGITS] = torch.einsum(
                'vh,bsh->bsv',
                self.output_embeddings.weight,
                output_features[const.SOURCE],
            )

        # Additional features
        if self.config.use_mismatch_features:
            raise NotImplementedError

        return output_features

    @staticmethod
    def concat_input(batch_a, batch_b, pad_id, lang_a=None, lang_b=None):
        """Concatenate tensors of two batches into one tensor.

        Return:
            the concatenation, a mask of types (a as zeroes and b as ones)
                and concatenation of attention_mask.
        """
        ids_a = batch_a.tensor
        ids_b = batch_b.tensor
        attention_mask_a = retrieve_tokens_mask(batch_a)
        attention_mask_b = retrieve_tokens_mask(batch_b)
        types_a = torch.zeros_like(ids_a)
        types_b = torch.ones_like(ids_b)
        position_ids_a = torch.arange(ids_a.size(1),
                                      dtype=torch.long,
                                      device=ids_a.device)
        position_ids_a = position_ids_a.unsqueeze(0).expand(ids_a.size())
        position_ids_b = torch.arange(ids_b.size(1),
                                      dtype=torch.long,
                                      device=ids_b.device)
        position_ids_b = position_ids_b.unsqueeze(0).expand(ids_b.size())

        input_ids = torch.cat((ids_a, ids_b), dim=1)
        token_type_ids = torch.cat((types_a, types_b), dim=1)
        attention_mask = torch.cat((attention_mask_a, attention_mask_b), dim=1)
        position_ids = torch.cat((position_ids_a, position_ids_b), dim=1)

        if lang_a is not None and lang_b is not None:
            lang_id_a = torch.ones_like(ids_a) * lang_a
            lang_id_b = torch.ones_like(ids_b) * lang_b
            lang_ids = torch.cat((lang_id_a, lang_id_b), dim=1)
            # lang_ids *= attention_mask.unsqueeze(-1).to(lang_ids.dtype)
            lang_ids *= attention_mask.to(lang_ids.dtype)

            return input_ids, token_type_ids, attention_mask, position_ids, lang_ids

        return input_ids, token_type_ids, attention_mask, position_ids

    @staticmethod
    def interleave_input(batch_a, batch_b, pad_id, lang_a=None, lang_b=None):
        """Interleave the source + target embeddings into one tensor.

        This means making the input as [batch, target [SEP] source].

        Return:
            interleaving of embeddings, mask of target (as zeroes) and source (as ones)
                and concatenation of attention_mask.
        """
        ids_a = batch_a.tensor
        ids_b = batch_b.tensor

        batch_size = ids_a.size(0)

        lengths_a = batch_a.lengths
        lengths_b = batch_b.lengths

        # max_pair_length = ids_a.size(1) + ids_b.size(1)
        max_pair_length = lengths_a + lengths_b

        input_ids = torch.full(
            (batch_size, max_pair_length),
            pad_id,
            dtype=ids_a.dtype,
            device=ids_a.device,
        )
        token_type_ids = torch.zeros_like(input_ids)
        attention_mask = torch.zeros_like(input_ids)

        for i in range(batch_size):
            # <s> and </s> are included in the mask (=1)
            len_a = lengths_a[i].item()
            len_b = lengths_b[i].item()

            input_ids[i, :len_b] = ids_b[i, :len_b]
            token_type_ids[i, :len_b] = 0
            attention_mask[i, :len_b] = 1

            input_ids[i, len_b:len_b + len_a] = ids_a[i, :len_a]
            token_type_ids[i, len_b:len_b + len_a] = 1
            attention_mask[i, len_b:len_b + len_a] = 1

        # TODO, why is attention mask 1 for all positions?
        return input_ids, token_type_ids, attention_mask

    @staticmethod
    def split_outputs(
        features: torch.Tensor,
        batch_inputs,
        interleaved: bool = False,
        label_a: str = const.SOURCE,
        label_b: str = const.TARGET,
    ):
        """Split contexts to get tag_side outputs.

        Arguments:
            features (tensor): XLM output: <s> source </s> </s> target </s>
                Shape of (bs, 1 + source_len + 2 + target_len + 1, 2)
            batch_inputs:
            interleaved (bool): whether the concat strategy was 'interleaved'.
            label_a: dictionary key for sequence A in ``features``.
            label_b: dictionary key for sequence B in ``features``.

        Return:
            dict of tensors, one per tag side.
        """
        outputs = OrderedDict()

        if interleaved:
            raise NotImplementedError('interleaving not supported.')
            # TODO: fix code below to use the lengths information and not bounds
            # if interleaved, shift each source sample by its correspondent length
            lengths_a = batch_inputs[const.TARGET].lengths
            shift = lengths_a.unsqueeze(-1)

            range_vector = torch.arange(features.size(0),
                                        device=features.device).unsqueeze(1)

            target_bounds = batch_inputs[const.TARGET].bounds
            features_a = features[range_vector, target_bounds]
            # Shift bounds by target length and preserve padding
            source_bounds = batch_inputs[const.SOURCE].bounds
            m = (source_bounds != -1).long()  # for masking out padding (which is -1)
            shifted_bounds = (source_bounds + shift) * m + source_bounds * (1 - m)
            features_b = features[range_vector, shifted_bounds]
        else:
            # otherwise, shift all by max_length
            lengths_a = batch_inputs[label_a].lengths
            # if we'd like to maintain the word pieces we merely select all
            features_a = features[:, :lengths_a.max()]
            features_b = features[:, lengths_a.max():]

        outputs[label_a] = features_a
        outputs[label_b] = features_b

        return outputs
Example #29
def tokenizer_and_model(type_embedding):
    #########
    #PORTUGUESE
    #########
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split(
            '_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            path = 'bert-base-multilingual-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)
        special_tokens_dict = {
            'additional_special_tokens': ['[USER]', '[SYSTEM]']
        }
        orig_num_tokens = len(tokenizer)
        num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
        total_num_tokens = orig_num_tokens + num_added_tokens
        model.resize_token_embeddings(total_num_tokens)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path,
                                         output_hidden_states=True,
                                         return_dict=True)
    #########
    #ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        #load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        #load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path,
                                         output_hidden_states=True,
                                         return_dict=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        #load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        #load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        #load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path,
                                                output_hidden_states=True,
                                                return_dict=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        #load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path,
                                             output_hidden_states=True,
                                             return_dict=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        #load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path,
                                          output_hidden_states=True,
                                          return_dict=True)

    return tokenizer, model
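An illustrative call; because the models are loaded with return_dict=True and output_hidden_states=True, the outputs expose named fields:

import torch

tokenizer, model = tokenizer_and_model('en_bert_base_uncased')
inputs = tokenizer('hello world', return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape, len(outputs.hidden_states))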