Example #1
    def create_and_check_xlm_model(self, config, input_ids, token_type_ids,
                                   input_lengths, sequence_labels,
                                   token_labels, is_impossible_labels,
                                   input_mask):
        model = XLMModel(config=config)
        model.eval()
        outputs = model(input_ids,
                        lengths=input_lengths,
                        langs=token_type_ids)
        outputs = model(input_ids, langs=token_type_ids)
        outputs = model(input_ids)
        sequence_output = outputs[0]
        result = {
            "sequence_output": sequence_output,
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].size()),
            [self.batch_size, self.seq_length, self.hidden_size])
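The test above only checks the shape of the first output. For reference, a minimal stand-alone sketch of the same check, using a toy XLMConfig (the sizes below are made-up small values, not the defaults, and the imports assume the current transformers package):

import torch
from transformers import XLMConfig, XLMModel

config = XLMConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=4)  # toy sizes (assumption)
model = XLMModel(config)
model.eval()

input_ids = torch.randint(3, config.vocab_size, (2, 7))  # (batch_size, seq_length)
with torch.no_grad():
    sequence_output = model(input_ids)[0]  # last hidden state

assert sequence_output.shape == (2, 7, config.emb_dim)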
Example #2
    def __init__(self,
                 bert_config: str,
                 requires_grad: bool = False,
                 dropout: float = 0.1,
                 layer_dropout: float = 0.1,
                 combine_layers: str = "mix") -> None:
        # TODO: control for XLM configs
        model = XLMModel(XLMConfig.from_json_file(bert_config))

        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(xlm_model=model,
                         layer_dropout=layer_dropout,
                         combine_layers=combine_layers)

        self.model = model
        self.dropout = dropout
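Example #2 expects a path to an XLM config in JSON form. A minimal sketch of producing and reading back such a file (the file name and toy sizes are assumptions):

from transformers import XLMConfig

config = XLMConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=4)
config.to_json_file("xlm_config.json")

reloaded = XLMConfig.from_json_file("xlm_config.json")
assert reloaded.emb_dim == config.emb_dim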
Example #3
    def __init__(self,
                 chunck_size=64,
                 max_length=35,
                 device=torch.device('cuda:0')):
        super(XLMClient, self).__init__()
        self.chunck_size = chunck_size
        self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        self.max_length = max_length
        # load the model
        self.model = XLMModel.from_pretrained('xlm-mlm-en-2048')
        self.model.eval()
        self.device = device
        # move model to device
        self.model.to(self.device)
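A hedged sketch of how a client like this would typically encode a single sentence, mirroring the tokenize/convert/forward pattern used in the other examples (the sentence is arbitrary; the checkpoint is downloaded on first use):

import torch
from transformers import XLMTokenizer, XLMModel

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
model.eval()

tokens = tokenizer.tokenize('<s>' + 'Berlin is nice .' + '</s>')
token_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
with torch.no_grad():
    features = model(token_ids)[0]  # shape: (1, num_subwords, 2048)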
Example #4
    def __init__(self,
                 pretrained_model: str,
                 requires_grad: bool = False,
                 dropout: float = 0.1,
                 layer_dropout: float = 0.1,
                 add_lang: bool = False,
                 combine_layers: str = "mix") -> None:
        model = XLMModel.from_pretrained(pretrained_model,
                                         output_hidden_states=True,
                                         dropout=dropout,
                                         attention_dropout=dropout)

        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(xlm_model=model,
                         layer_dropout=layer_dropout,
                         combine_layers=combine_layers,
                         add_lang=add_lang)

        self.model = model
        self.dropout = dropout
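Example #4 asks the model for all hidden states via output_hidden_states=True. A small sketch of what that returns, built from a toy XLMConfig (toy sizes are assumptions) instead of the pretrained weights:

import torch
from transformers import XLMConfig, XLMModel

config = XLMConfig(vocab_size=100, emb_dim=32, n_layers=2, n_heads=4,
                   output_hidden_states=True)
model = XLMModel(config)
model.eval()

input_ids = torch.randint(3, config.vocab_size, (1, 5))
with torch.no_grad():
    hidden_states = model(input_ids)[-1]

# One entry for the embedding output plus one per transformer layer.
assert len(hidden_states) == config.n_layers + 1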
Example #5
def test_xlm_embeddings():
    xlm_model: str = "xlm-mlm-en-2048"

    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #    0      1             2           3            4          5         6         7         8       9      10        11       12         13        14
    #
    #   <s>  'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>', '</s>
    #           |             |           |            |          |         |         |         \      |      /          |         |          |
    #         Berlin         and        Munich        have        a        lot        of           puppeteer             to       see         .
    #
    #           0             1           2            3          4         5          6               7                  8        9          10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = XLMEmbeddings(
            model=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
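calculate_mean_embedding is used above but not shown in this listing; a plausible minimal implementation (an assumption, not necessarily the test suite's actual helper) would average the stacked subword vectors:

import torch

def calculate_mean_embedding(subword_embeddings):
    # Stack the per-subword vectors into (num_subwords, hidden_size) and average.
    return torch.mean(torch.stack(subword_embeddings), dim=0)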
Example #6
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
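The same download-and-check pattern can be written with a self-cleaning temporary cache; a minimal sketch (model name as in the other examples; the checkpoint is large and downloaded on first use):

import tempfile
from transformers import XLMModel

with tempfile.TemporaryDirectory() as cache_dir:
    model = XLMModel.from_pretrained('xlm-mlm-en-2048', cache_dir=cache_dir)
    assert model is not None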
Example #7
def test_xlm_embeddings():
    xlm_model = 'xlm-mlm-en-2048'
    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()
    s = 'Berlin and Munich have a lot of puppeteer to see .'
    with torch.no_grad():
        tokens = tokenizer.tokenize('<s>' + s + '</s>')
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]
    assert len(first_layer) == len(tokens)

    def embed_sentence(sentence: str,
                       pooling_operation,
                       layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = XLMEmbeddings(pretrained_model_name_or_path=xlm_model,
                                   layers=layers,
                                   pooling_operation=pooling_operation,
                                   use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation='first')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation='last')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation='first_last')
    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)
    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size