Exemple #1
0
        def create_and_check_xlnet_base_model(self, config, input_ids_1,
                                              input_ids_2, input_ids_q,
                                              perm_mask, input_mask,
                                              target_mapping, segment_ids,
                                              lm_labels, sequence_labels,
                                              is_impossible_labels):
            """Run XLNetModel forward passes and check output/mems shapes."""
            xlnet = XLNetModel(config)
            xlnet.eval()

            # Exercise the alternative mask/segment keyword arguments first.
            _, _ = xlnet(input_ids_1, input_mask=input_mask)
            _, _ = xlnet(input_ids_1, attention_mask=input_mask)
            _, _ = xlnet(input_ids_1, token_type_ids=segment_ids)
            last_hidden, mems = xlnet(input_ids_1)

            result = {"outputs": last_hidden, "mems_1": mems}

            expected_output_shape = [
                self.batch_size, self.seq_length, self.hidden_size
            ]
            self.parent.assertListEqual(list(result["outputs"].size()),
                                        expected_output_shape)

            expected_mem_shapes = (
                [[self.seq_length, self.batch_size, self.hidden_size]] *
                self.num_hidden_layers)
            self.parent.assertListEqual(
                [list(mem.size()) for mem in result["mems_1"]],
                expected_mem_shapes)
Exemple #2
0
 def __init__(self, temp_dir, load_pretrained, xlnet_config=None):
     """Wrap an XLNetModel: pretrained 'xlnet-base-cased' or a fresh config.

     :param temp_dir: cache directory for the downloaded checkpoint
     :param load_pretrained: when True, load 'xlnet-base-cased'
     :param xlnet_config: config used only when not loading pretrained weights
     """
     super(XLNet, self).__init__()
     if not load_pretrained:
         self.model = XLNetModel(xlnet_config)
     else:
         self.model = XLNetModel.from_pretrained('xlnet-base-cased',
                                                 cache_dir=temp_dir)
    def __init__(self, large, temp_dir, finetune=False):
        """Load a pretrained XLNet encoder.

        :param large: select 'xlnet-large-cased' instead of 'xlnet-base-cased'
        :param temp_dir: cache directory for downloaded weights
        :param finetune: whether downstream code should update encoder weights
        """
        super(XLNet, self).__init__()
        checkpoint = 'xlnet-large-cased' if large else 'xlnet-base-cased'
        self.model = XLNetModel.from_pretrained(checkpoint, cache_dir=temp_dir)

        self.finetune = finetune
Exemple #4
0
    def __init__(
            self,
            config: XLNetConfig,
            sentence_transformer_config: SentenceTransformerConfig = None):
        """Build an XLNet transformer plus the summary head `config` describes.

        :param config: XLNet configuration (d_model, summary_* options)
        :param sentence_transformer_config: optional sentence-transformer setup
        """
        XLNetPreTrainedModel.__init__(self, config)

        self.model_config = config
        self.model_hidden_size = config.d_model

        TransformerModel.__init__(self, sentence_transformer_config)

        self.transformer = XLNetModel(config)
        self.apply(self.init_weights)

        # Summary head: every component defaults to Identity and is only
        # replaced when the corresponding summary_* option is set on config.
        self.summary = nn.Identity()
        if getattr(config, 'summary_use_proj', False):
            self.summary = nn.Linear(config.d_model, config.d_model)

        self.activation = nn.Identity()
        if getattr(config, 'summary_activation', None) == 'tanh':
            self.activation = nn.Tanh()

        self.first_dropout = nn.Identity()
        if getattr(config, 'summary_first_dropout', 0) > 0:
            self.first_dropout = nn.Dropout(config.summary_first_dropout)

        self.last_dropout = nn.Identity()
        if getattr(config, 'summary_last_dropout', 0) > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)
    def __init__(
        self,
        pretrained_model_name_or_path: str = "xlnet-large-cased",
        layers: str = "1",
        pooling_operation: str = "first_last",
        use_scalar_mix: bool = False,
    ):
        """XLNet embeddings, as proposed in Yang et al., 2019.

        :param pretrained_model_name_or_path: name or path of XLNet model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = XLNetModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            output_hidden_states=True,
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = list(map(int, layers.split(",")))
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        # Embed a one-token dummy sentence to discover the embedding length
        # produced by the chosen layers/pooling configuration.
        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding())
Exemple #6
0
    def __init__(self, config):
        """NL2SQL multi-task heads on top of a frozen XLNet encoder.

        :param config: XLNet configuration (hidden_size, dropout)
        """
        super(XLNetNL2SQL, self).__init__(config)
        # Label-space sizes for the individual SQL sub-tasks.
        self.num_tag_labels = 2
        self.num_agg_labels = 6
        self.num_connection_labels = 3
        self.num_con_num_labels = 4
        self.num_type_labels = 3
        self.num_sel_num_labels = 4  # {1, 2, 3}
        self.num_where_num_labels = 5  # {1, 2, 3, 4}
        self.num_op_labels = 4
        hidden = config.hidden_size
        self.hidden_size = hidden

        self.pretrain_model = XLNetModel(config)
        # Freeze every parameter registered so far (i.e. the encoder).
        for param in self.parameters():
            param.requires_grad = False
        self.dropout = nn.Dropout(config.dropout)
        # Task-specific projection heads; input widths depend on how many
        # hidden vectors each task concatenates.
        self.linear_tag = nn.Linear(hidden * 3, self.num_tag_labels)
        self.linear_agg = nn.Linear(hidden * 2, self.num_agg_labels)
        self.linear_connection = nn.Linear(hidden, self.num_connection_labels)
        self.linear_con_num = nn.Linear(hidden * 2, self.num_con_num_labels)
        self.linear_type = nn.Linear(hidden * 2, self.num_type_labels)
        self.linear_sel_num = nn.Linear(hidden, self.num_sel_num_labels)
        self.linear_where_num = nn.Linear(hidden, self.num_where_num_labels)
        self.values_attention = TableAttention(hidden, hidden)
        self.head_attention = ValueAttention(hidden, hidden)
        self.linear_op = nn.Linear(hidden * 2, self.num_op_labels)
    def reload(self, bert_model, gpu):
        """(Re)load the XLNet tokenizer and encoder from `bert_model`.

        :param bert_model: model name or path; a '.tar.gz' archive implies a
            sibling '-vocab.txt' file for the tokenizer
        :param gpu: if truthy, move the encoder to CUDA
        """
        from pytorch_transformers import XLNetTokenizer, XLNetModel
        if bert_model.endswith('.tar.gz'):
            # Archive checkpoints keep their vocab file next to the archive.
            self.tokenizer = NoPickle(
                XLNetTokenizer.from_pretrained(bert_model.replace(
                    '.tar.gz', '-vocab.txt'),
                                               do_lower_case=self.lower))
        else:
            self.tokenizer = NoPickle(
                XLNetTokenizer.from_pretrained(bert_model,
                                               do_lower_case=self.lower))

        # NoPickle keeps these heavy objects out of pickling of self.
        self.xlnet = NoPickle(XLNetModel.from_pretrained(bert_model))
        if gpu:
            self.xlnet = self.xlnet.cuda()
        self.output_dim = self.xlnet.d_model
        # self.max_len = self.xlnet.embeddings.position_embeddings.num_embeddings

        # Freeze the whole encoder by default ...
        for p in self.xlnet.parameters():
            p.requires_grad = False

        # ... then optionally unfreeze the last N layers for fine-tuning.
        # NOTE(review): XLNetModel usually exposes its blocks as `.layer`
        # (an `.encoder.layer` path is BERT's layout) — confirm this
        # attribute exists in the pinned pytorch_transformers version.
        if self.finetune_tune_last_n > 0:
            self.finetune_layers = self.xlnet.encoder.layer[
                -self.finetune_tune_last_n:]
            for p in self.finetune_layers.parameters():
                p.requires_grad = True
def main():
    """Training entry point.

    Parses CLI args, loads the XLNet text encoder and the graph encoder,
    builds three pair classifiers on the combined representation, then
    optionally trains and checkpoints the model.
    """
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    ##Load Models
    config = XLNetConfig.from_pretrained(args.config_name)
    print('config: {}'.format(config))
    tokenizer = XLNetTokenizer.from_pretrained(
        args.text_encoder_checkpoint, do_lower_case=args.do_lower_case)
    text_encoder = XLNetModel.from_pretrained(args.text_encoder_checkpoint,
                                              config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)
    if args.graph_encoder_checkpoint:
        graph_encoder.gcnnet.load_state_dict(
            torch.load(args.graph_encoder_checkpoint))

    # Three task heads share the concatenated text+graph representation.
    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden,
                                            4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_type_classifier, config)
    model.to(args.device)

    # NOTE(review): overrides the device_count() value set above, forcing
    # single-GPU training — confirm this is intentional.
    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                args.task_name,
                                                tokenizer,
                                                evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        # Unwrap DataParallel-style containers before saving.
        model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(args.output_dir)
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, 'saved_model.pth'))
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
    def __init__(
        self,
        gpu=-1,
        check_for_lowercase=True,
        embeddings_dim=0,
        verbose=True,
        path_to_pretrained="xlnet-base-cased",
        model_frozen=True,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
    ):
        """Sequence indexer backed by a (by default frozen) XLNet encoder.

        :param gpu: GPU index, -1 for CPU
        :param path_to_pretrained: checkpoint name/path for tokenizer,
            config and model
        :param model_frozen: when False, the pooler parameters are unfrozen
            so they can be fine-tuned
        """
        SeqIndexerBaseEmbeddings.__init__(
            self,
            gpu=gpu,
            check_for_lowercase=check_for_lowercase,
            zero_digits=True,
            bos_token=bos_token,
            eos_token=eos_token,
            pad=pad_token,
            unk=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            load_embeddings=True,
            embeddings_dim=embeddings_dim,
            verbose=verbose,
            isBert=False,
            isXlNet=True)

        print("create seq indexer Transformers from Model {}".format(
            path_to_pretrained))

        self.xlnet = True

        self.path_to_pretrained = path_to_pretrained
        self.tokenizer = XLNetTokenizer.from_pretrained(path_to_pretrained)
        self.config = XLNetConfig.from_pretrained(path_to_pretrained)
        self.emb = XLNetModel.from_pretrained(path_to_pretrained)
        self.frozen = model_frozen
        # Freeze the whole encoder. This single pass already covers every
        # submodule (word_embedding, layers, dropout), so the original's
        # second per-submodule freezing loop was redundant and is removed.
        for param in self.emb.parameters():
            param.requires_grad = False

        if not self.frozen:
            # NOTE(review): assumes this XLNetModel version exposes a
            # `pooler` submodule — confirm for the pinned library version.
            for param in self.emb.pooler.parameters():
                param.requires_grad = True
        self.emb.eval()
        # Fixed typo in the status message ("succesifully").
        print("XLNET model loaded successfully")
Exemple #10
0
    def __init__(self, config, num_choices=1, num_docs_rank=30):
        """Multiple-choice model: XLNet encoder + summary + linear scorer.

        :param config: XLNet configuration (provides d_model)
        :param num_choices: number of answer choices scored per example
        :param num_docs_rank: accepted for caller compatibility; not used in
            this constructor
        """
        super(XLNetForMultipleChoice, self).__init__(config)

        self.num_choices = num_choices
        self.transformer = XLNetModel(config)
        self.sequence_summary = SequenceSummary(config)
        # One score per choice, projected from the summary vector.
        self.logits_proj = nn.Linear(config.d_model, num_choices)

        self.apply(self.init_weights)
    def __init__(self, config):
        """Token classification model: XLNet encoder + per-token projection.

        :param config: XLNet configuration (d_model, num_labels, dropout)
        """
        super(XLNetForTokenClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLNetModel(config)
        # Projects each token's hidden state to the label space.
        self.logits_proj = torch.nn.Linear(config.d_model, config.num_labels)
        self.dropout = torch.nn.Dropout(config.dropout)

        self.apply(self.init_weights)
    def __init__(self, config):
        """Multi-label classifier: XLNet encoder + summary + label projection.

        :param config: XLNet configuration (d_model, num_labels)
        """
        super(XLNetForMultiLabelSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.transformer = XLNetModel(config)
        self.sequence_summary = SequenceSummary(config)
        # One logit per label; multi-label, so no softmax is implied here.
        self.logits_proj = torch.nn.Linear(config.d_model, config.num_labels)

        self.apply(self.init_weights)
Exemple #13
0
    def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = False):
        """Sentence-embedding wrapper around a pretrained XLNet encoder.

        :param model_name_or_path: checkpoint for model and tokenizer
        :param max_seq_length: maximum input length in tokens
        :param do_lower_case: lowercase input before tokenizing
        """
        super(XLNet, self).__init__()
        # Keys serialized when saving this module's configuration.
        self.config_keys = ['max_seq_length', 'do_lower_case']
        self.max_seq_length = max_seq_length
        self.do_lower_case = do_lower_case

        self.xlnet = XLNetModel.from_pretrained(model_name_or_path)
        self.tokenizer = XLNetTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case)
        # Cache special-token ids used when assembling input sequences.
        cls_tok = self.tokenizer.cls_token
        sep_tok = self.tokenizer.sep_token
        self.cls_token_id = self.tokenizer.convert_tokens_to_ids([cls_tok])[0]
        self.sep_token_id = self.tokenizer.convert_tokens_to_ids([sep_tok])[0]
Exemple #14
0
    def __init__(self, model_path):
        """Load an XLNet encoder from `model_path` and pad its vocabulary.

        :param model_path: directory holding config.json and, optionally,
            pytorch_model.bin; without weights a randomly initialized model
            is built from the config.
        """
        super(OnmtXLNetEncoder, self).__init__()
        config = XLNetConfig.from_json_file(
            os.path.join(model_path, "config.json"))
        weights_file = os.path.join(model_path, "pytorch_model.bin")
        if not os.path.exists(weights_file):
            model = XLNetModel(config)
        else:
            model = XLNetModel.from_pretrained(
                pretrained_model_name_or_path=weights_file, config=config)
            print("init XLNet model with {} weights".format(
                len(model.state_dict())))

        # Grow the embedding table: 4 extra rows, then 2 more appended last.
        model.word_embedding = expandEmbeddingByN(model.word_embedding, 4)
        model.word_embedding = expandEmbeddingByN(model.word_embedding,
                                                  2,
                                                  last=True)
        self.encoder = model
        #print(model)
        print("***" * 20)
Exemple #15
0
    def load(cls,
             config_path: Path,
             model_path: Path,
             cache_model: bool = True) -> XLNetModel:
        """Load an XLNetModel, reusing a cached instance when available.

        :param config_path: path to the model configuration
        :param model_path: path to the model weights; its string form is the
            cache key
        :param cache_model: when True, store the loaded model in cls._cache
        :returns: the loaded (or cached) XLNetModel
        """
        # Bug fix: the cache is keyed by str(model_path) (see below), but the
        # original tested the raw Path object for membership, so the cache
        # never hit. Use one consistent key — and cls._cache consistently
        # instead of mixing in the hard-coded class name.
        key = str(model_path)
        if key in cls._cache:
            return cls._cache[key]

        config = XLNetConfig.from_pretrained(str(config_path))
        model = XLNetModel.from_pretrained(str(model_path), config=config)
        if cache_model:
            cls._cache[key] = model

        return model
 def __init__(self,
              chunck_size=64,
              max_length=35,
              device=torch.device('cuda:0')):
     """Inference client around a pretrained xlnet-large-cased encoder.

     :param chunck_size: number of items processed per batch
     :param max_length: maximum sequence length
     :param device: torch device the model is placed on
     """
     super(XLNetClient, self).__init__()
     self.chunck_size = chunck_size
     self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
     self.max_length = max_length
     # Load the encoder, switch to eval mode, and move it to the device.
     self.model = XLNetModel.from_pretrained('xlnet-large-cased')
     self.model.eval()
     self.device = device
     self.model.to(self.device)
Exemple #17
0
    def __init__(self, opt):
        """Build the tokenizer/model pair and the ABSA datasets.

        Selects a pretrained transformer (RoBERTa / BERT / XLNet) according
        to opt.pretrained_bert_name, or falls back to a custom embedding
        matrix, then prepares train/validation/test splits.

        :param opt: experiment options (model names, files, device, ratios)
        """
        self.opt = opt
        if 'roberta' in opt.pretrained_bert_name:
            tokenizer = RobertaTokenizer.from_pretrained(
                opt.pretrained_bert_name)
            transformer = RobertaModel.from_pretrained(
                opt.pretrained_bert_name, output_attentions=True)
        elif 'bert' in opt.pretrained_bert_name:
            tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
            transformer = BertModel.from_pretrained(opt.pretrained_bert_name,
                                                    output_attentions=True)
        elif 'xlnet' in opt.pretrained_bert_name:
            tokenizer = XLNetTokenizer.from_pretrained(
                opt.pretrained_bert_name)
            transformer = XLNetModel.from_pretrained(opt.pretrained_bert_name,
                                                     output_attentions=True)
        # Bug fix: the original condition `if 'bert' or 'xlnet' in
        # opt.model_name` was always True (the non-empty literal 'bert' is
        # truthy), so the embedding-matrix fallback below was unreachable.
        if 'bert' in opt.model_name or 'xlnet' in opt.model_name:
            tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
            self.model = opt.model_class(transformer, opt).to(opt.device)
        else:
            tokenizer = build_tokenizer(
                fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
                max_seq_len=opt.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
            embedding_matrix = build_embedding_matrix(
                word2idx=tokenizer.word2idx,
                embed_dim=opt.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                    str(opt.embed_dim), opt.dataset))
            self.model = opt.model_class(embedding_matrix, opt).to(opt.device)

        self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
        self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            # Carve the validation set off the end of the training set.
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
        else:
            self.valset = self.testset

        if opt.device.type == 'cuda':
            logger.info('cuda memory allocated: {}'.format(
                torch.cuda.memory_allocated(device=opt.device.index)))
        self._print_args()
Exemple #18
0
    def __init__(self, args, dictionary, embed_tokens, left_pad=True):
        """Encoder that loads a pretrained XLNet plus transformer layers.

        :param args: training arguments (distributed rank, layer counts,
            positional-embedding options, ...)
        :param dictionary: vocabulary passed to the base encoder
        :param embed_tokens: token embedding module (provides dim/padding idx)
        :param left_pad: whether source sequences are left-padded
        """
        super().__init__(dictionary)
        self.dropout = args.dropout
        self.n_gpu = torch.cuda.device_count()

        print('Distributed rank: ', args.distributed_rank)
        print('Number of used GPU: ', self.n_gpu)

        # if self.n_gpu > 1:
        #     torch.distributed.barrier()
        # Non-primary ranks wait here so only rank 0 downloads the weights.
        if args.distributed_rank not in [-1, 0]:
            torch.distributed.barrier(
            )  # Make sure only the first process in distributed training will download model & vocab

        # Load pre-trained model (weights)
        config = XLNetConfig.from_pretrained(args.xlnet_model)
        self.xlnet = XLNetModel.from_pretrained(args.xlnet_model,
                                                config=config)

        # Rank 0 releases the other ranks once the download is complete.
        if args.distributed_rank == 0:
            torch.distributed.barrier(
            )  # Make sure only the first process in distributed training will download model & vocab

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx
        self.max_source_positions = args.max_source_positions

        self.embed_tokens = embed_tokens
        # Scale token embeddings by sqrt(embed_dim).
        self.embed_scale = math.sqrt(embed_dim)
        self.embed_positions = PositionalEmbedding(
            args.max_source_positions,
            embed_dim,
            self.padding_idx,
            left_pad=left_pad,
            learned=args.encoder_learned_pos,
        ) if not args.no_token_positional_embeddings else None

        self.layers = nn.ModuleList([])
        self.layers.extend([
            TransformerEncoderLayer(args) for i in range(args.encoder_layers)
        ])
        # Version buffer stored in checkpoints.
        self.register_buffer('version', torch.Tensor([2]))
        self.normalize = args.encoder_normalize_before
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
Exemple #19
0
 def __init__(self, config):
     """Sentiment model: XLNet encoder + stacked BiGRU layers + classifier.

     :param config: XLNet configuration (hidden_size, num_labels)

     NOTE(review): reads LSTM/GRU hyper-parameters from a module-level
     `args` object rather than from `config` — confirm `args` is defined
     wherever this class is used.
     """
     super(XLNet_SenAnalysis, self).__init__(config)
     self.num_labels = config.num_labels
     self.dropout = nn.Dropout(args.lstm_dropout)
     self.transformer = XLNetModel(config)
     self.sequence_summary = SequenceSummary(config)
     # Final projection takes the bidirectional GRU output (2 * hidden).
     self.logits_proj = nn.Linear(args.lstm_hidden_size * 2,
                                  config.num_labels)
     self.W = []
     self.gru = []
     # First GRU layer consumes the encoder output; later layers consume
     # the doubled (bidirectional * 2) output of the previous layer.
     for i in range(args.lstm_layers):
         self.W.append(
             nn.Linear(args.lstm_hidden_size * 2,
                       args.lstm_hidden_size * 2))
         self.gru.append(nn.GRU(config.hidden_size if i ==0 else args.lstm_hidden_size*4,\
                                args.lstm_hidden_size, num_layers=1, bidirectional = True,\
                                batch_first = True).cuda())
     self.W = nn.ModuleList(self.W)
     self.gru = nn.ModuleList(self.gru)
     self.init_weights()
    def __init__(self, args, device, checkpoint):
        """Extractive summarizer: XLNet sentence encoder + extraction layer.

        :param args: configuration (encoder choice, layer sizes, init options)
        :param device: torch device the model is moved to
        :param checkpoint: optional dict with a 'model' state_dict to restore
        """
        super(ExtSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = XLNet(args.large, args.temp_dir, args.finetune_bert)

        self.ext_layer = ExtTransformerEncoder(
            self.bert.model.config.hidden_size, args.ext_ff_size,
            args.ext_heads, args.ext_dropout, args.ext_layers)
        if (args.encoder == 'baseline'):
            # Baseline: swap the pretrained encoder for a small randomly
            # initialized XLNet and a plain linear classifier.
            bert_config = XLNetConfig(self.bert.model.config.vocab_size,
                                      hidden_size=args.ext_hidden_size,
                                      num_hidden_layers=args.ext_layers,
                                      num_attention_heads=args.ext_heads,
                                      intermediate_size=args.ext_ff_size)
            self.bert.model = XLNetModel(bert_config)
            self.ext_layer = Classifier(self.bert.model.config.hidden_size)

        if (args.max_pos > 512):
            # Extend position embeddings past 512 by repeating the last
            # pretrained position vector for the extra slots.
            # NOTE(review): `embeddings.position_embeddings` is BERT's
            # layout; confirm the XLNet encoder used here actually exposes
            # it before relying on this branch.
            my_pos_embeddings = nn.Embedding(
                args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:
                                          512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[
                512:] = self.bert.model.embeddings.position_embeddings.weight.data[
                    -1][None, :].repeat(args.max_pos - 512, 1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings

        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            # Fresh model: optionally re-initialize the extraction layer.
            if args.param_init != 0.0:
                for p in self.ext_layer.parameters():
                    p.data.uniform_(-args.param_init, args.param_init)
            if args.param_init_glorot:
                for p in self.ext_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

        self.to(device)
 def __init__(self, num_labels=2, model_type='xlnet-base-cased',token_layer='token-cls',output_logits=True):
     """Word-sense-disambiguation classifier on top of XLNet.

     :param num_labels: number of output classes
     :param model_type: pretrained XLNet checkpoint name
     :param token_layer: 'token-cls', 'sent-cls' or 'sent-cls-ws'
     :param output_logits: when False, attach a softmax layer
     :raises ValueError: for an unrecognized token_layer
     """
     super(XLNetForWSD, self).__init__()

     self.config = XLNetConfig()
     self.token_layer = token_layer
     # Bug fix: was hard-coded to 2, silently ignoring the num_labels
     # argument that the classifier head below actually uses.
     self.num_labels = num_labels
     self.xlnet = XLNetModel.from_pretrained(model_type)
     # NOTE(review): XLNetConfig commonly names its dropout field
     # `dropout`, not `hidden_dropout_prob` — confirm for the pinned
     # library version.
     self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
     self.output_logits = output_logits

     # Select the token-selection layer variant.
     if token_layer == 'token-cls':
         self.tokenselectlayer = TokenClsLayer()
     elif token_layer in ['sent-cls', 'sent-cls-ws']:
         self.tokenselectlayer = SentClsLayer()
     else:
         raise ValueError("Unidentified parameter for token selection layer")

     self.classifier = nn.Linear(768, num_labels)
     if not output_logits:
         self.softmax = nn.Softmax(dim=1)  # to be checked!!!

     nn.init.xavier_normal_(self.classifier.weight)
Exemple #22
0
def test_xlnet_embeddings():
    """Check XLNetEmbeddings against raw xlnet-large-cased hidden states.

    Embeds a fixed sentence directly with XLNetModel, then verifies each
    pooling operation (first / last / first_last / mean) and the
    multi-layer / scalar-mix configurations reproduce those reference
    vectors.
    """
    xlnet_model = 'xlnet-large-cased'
    tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)
    model = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=xlnet_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()
    s = 'Berlin and Munich have a lot of puppeteer to see .'
    with torch.no_grad():
        tokens = tokenizer.tokenize((('<s>' + s) + '</s>'))
        print(tokens)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        # With output_hidden_states=True the last output element holds the
        # per-layer hidden states.
        hidden_states = model(tokens_tensor)[(-1)]
        first_layer = hidden_states[1][0]
    assert (len(first_layer) == len(tokens))

    def embed_sentence(sentence: str,
                       pooling_operation,
                       layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        # Helper: embed `sentence` with the given pooling/layer options.
        embeddings = XLNetEmbeddings(pretrained_model_name_or_path=xlnet_model,
                                     layers=layers,
                                     pooling_operation=pooling_operation,
                                     use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    # first-subword pooling
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation='first')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)
    # last-subword pooling
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation='last')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)
    # first+last concatenation pooling
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation='first_last')
    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)
    # mean pooling over subwords
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)
    # four stacked layers -> embedding size is 4 * d_model
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = (4 * model.d_model)
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)
    # scalar mix collapses the four layers back to a single d_model vector
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)
    ref_embedding_size = (1 * model.d_model)
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)
Exemple #23
0
def test_xlnet_embeddings():
    """End-to-end check of flair's ``XLNetEmbeddings``.

    Runs the raw pytorch-transformers XLNet model once to obtain reference
    hidden states, then verifies that each subword pooling strategy
    ("first", "last", "first_last", "mean") reproduces the expected vectors,
    and that multi-layer / scalar-mix configurations yield embeddings of the
    expected size.
    """
    model_name: str = "xlnet-large-cased"

    tokenizer = XLNetTokenizer.from_pretrained(model_name)
    model = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=model_name, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    # Reference forward pass: keep the hidden states of the first layer.
    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        print(tokens)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        batch = torch.tensor([token_ids]).to(flair.device)

        hidden_states = model(batch)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    # Subword-to-token alignment for the reference sentence:
    #
    #   0        1         2         3         4      5      6      7        8        9      10      11   12   13     14
    # '<s>', '▁Berlin', '▁and', '▁Munich', '▁have', '▁a', '▁lot', '▁of', '▁puppet', 'eer', '▁to', '▁see', '▁', '.', '</s>'
    #           |          |         |         |      |      |      |         \      /       |       |     \    /
    #         Berlin      and     Munich     have     a     lot     of       puppeteer       to     see       .
    #           0          1         2         3      4      5       6           7           8       9        10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        """Embed *sentence* with a freshly configured XLNetEmbeddings."""
        embedder = XLNetEmbeddings(
            model=model_name,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embedder.embed(flair_sentence)
        return flair_sentence

    def token_vector(flair_sentence: Sentence, index: int) -> list:
        """Return the embedding of token *index* as a plain Python list."""
        return flair_sentence.tokens[index].embedding.tolist()

    # "first" pooling: each token gets its first subword's vector.
    first_pooled = embed_sentence(sentence=s, pooling_operation="first")
    assert first_layer[1].tolist() == token_vector(first_pooled, 0)
    assert first_layer[8].tolist() == token_vector(first_pooled, 7)

    # "last" pooling: each token gets its last subword's vector.
    last_pooled = embed_sentence(sentence=s, pooling_operation="last")
    assert first_layer[1].tolist() == token_vector(last_pooled, 0)
    assert first_layer[9].tolist() == token_vector(last_pooled, 7)

    # "first_last" pooling: concatenation of first and last subword vectors
    # (a single-subword token contributes the same vector twice).
    first_last_pooled = embed_sentence(sentence=s,
                                       pooling_operation="first_last")
    assert torch.cat([first_layer[1],
                      first_layer[1]]).tolist() == token_vector(
                          first_last_pooled, 0)
    assert torch.cat([first_layer[8],
                      first_layer[9]]).tolist() == token_vector(
                          first_last_pooled, 7)

    # "mean" pooling: average over all of a token's subword vectors.
    mean_pooled = embed_sentence(sentence=s, pooling_operation="mean")
    assert calculate_mean_embedding(
        [first_layer[1]]).tolist() == token_vector(mean_pooled, 0)
    assert calculate_mean_embedding(
        [first_layer[8], first_layer[9]]).tolist() == token_vector(
            mean_pooled, 7)

    # Four stacked layers -> embedding is four model widths long.
    multi_layer = embed_sentence(sentence="Munich",
                                 pooling_operation="first",
                                 layers="1,2,3,4")
    assert len(multi_layer.tokens[0].embedding) == 4 * model.d_model

    # Scalar mix collapses the four layers back to one model width.
    scalar_mixed = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )
    assert len(scalar_mixed.tokens[0].embedding) == 1 * model.d_model
Exemple #24
0
'''
@Time   : 2019-06-16 11:34:23
@Author : su.zhu
@Desc   : Minimal pytorch-transformers XLNet usage example: load the
          tokenizer and model, run a forward pass, and prepare a
          tokenized sentence pair.
'''

import torch
from pytorch_transformers import XLNetModel, XLNetTokenizer

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')
# Encode a sample sentence and run a forward pass; the model's first output
# is the last-layer hidden states (printed below via .size()).
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
last_hidden_states = model(input_ids)[0]

print(model.config)
print(last_hidden_states.size())

# Tokenized input
text_a = "Who was Jim Henson ?"
text_b = "Jim Henson was a puppeteer"
tokens_a = tokenizer.tokenize(text_a)
tokens_b = tokenizer.tokenize(text_b)
# NOTE(review): '[CLS]'/'[SEP]' are BERT-style special tokens; XLNet's
# vocabulary uses '<cls>'/'<sep>' instead, so these literals (and the
# hard-coded '[SEP]' below) presumably would not map to XLNet's real
# special-token ids — confirm intent before relying on this snippet.
cls_token = '[CLS]'
sep_token = '[SEP]'
# NOTE(review): tokens_b is tokenized above but never appended here; the
# example may have been truncated mid-script.
tokens = tokens_a + ['[SEP]']
segment_ids = [0] * len(tokens)
    def __init__(self,
                 args,
                 device,
                 checkpoint=None,
                 bert_from_extractive=None):
        """Build the abstractive summarizer: an ``XLNet`` encoder wrapper
        plus a Transformer decoder and a tied generator head.

        Args:
            args: hyper-parameter namespace (reads large, temp_dir,
                finetune_bert, encoder, enc_*/dec_* sizes, max_pos,
                share_emb, use_bert_emb).
            device: torch device the whole model is moved to at the end.
            checkpoint: optional dict whose 'model' entry is a state_dict
                to restore verbatim.
            bert_from_extractive: optional state_dict of an extractive
                model whose 'bert.model.*' weights seed this encoder.
        """
        super(AbsSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = XLNet(args.large, args.temp_dir, args.finetune_bert)

        if bert_from_extractive is not None:
            # Strip the 'bert.model.' prefix (11 chars) from each key and
            # load the extractive encoder's weights into this encoder.
            self.bert.model.load_state_dict(dict([
                (n[11:], p) for n, p in bert_from_extractive.items()
                if n.startswith('bert.model')
            ]),
                                            strict=True)

        if (args.encoder == 'baseline'):
            # Replace the pretrained encoder with a randomly initialized one.
            # NOTE(review): hidden_size / num_hidden_layers / etc. are
            # BERT-style config kwargs; XLNetConfig's own fields are named
            # d_model / n_layer / n_head — confirm these are not silently
            # ignored by the config constructor.
            bert_config = XLNetConfig(
                self.bert.model.config.vocab_size,
                hidden_size=args.enc_hidden_size,
                num_hidden_layers=args.enc_layers,
                num_attention_heads=8,
                intermediate_size=args.enc_ff_size,
                hidden_dropout_prob=args.enc_dropout,
                attention_probs_dropout_prob=args.enc_dropout)
            self.bert.model = XLNetModel(bert_config)

        if (args.max_pos > 512):
            # Extend absolute position embeddings past 512 by repeating the
            # last learned row for every extra position.
            # NOTE(review): XLNetModel exposes no
            # `.embeddings.position_embeddings` attribute (that layout is
            # BERT's; elsewhere in this file XLNet's word table is accessed
            # as `model.word_embedding`), so this branch looks like a BERT
            # leftover that would raise AttributeError — verify.
            my_pos_embeddings = nn.Embedding(
                args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:
                                          512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[
                512:] = self.bert.model.embeddings.position_embeddings.weight.data[
                    -1][None, :].repeat(args.max_pos - 512, 1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings
        self.vocab_size = self.bert.model.config.vocab_size
        # Target-side (decoder) embedding table; index 0 is padding.
        tgt_embeddings = nn.Embedding(self.vocab_size,
                                      self.bert.model.config.hidden_size,
                                      padding_idx=0)
        if (self.args.share_emb):
            # Copy (not tie) the encoder's input embeddings into the decoder.
            # NOTE(review): accessed as `embeddings.word_embeddings` here but
            # as `word_embedding` in the use_bert_emb branch below — the two
            # spellings cannot both be right for the same model; verify.
            tgt_embeddings.weight = copy.deepcopy(
                self.bert.model.embeddings.word_embeddings.weight)

        self.decoder = TransformerDecoder(self.args.dec_layers,
                                          self.args.dec_hidden_size,
                                          heads=self.args.dec_heads,
                                          d_ff=self.args.dec_ff_size,
                                          dropout=self.args.dec_dropout,
                                          embeddings=tgt_embeddings)

        # Output projection; its weight is tied to the decoder embeddings.
        self.generator = get_generator(self.vocab_size,
                                       self.args.dec_hidden_size, device)
        self.generator[0].weight = self.decoder.embeddings.weight

        if checkpoint is not None:
            # Restore a full training checkpoint verbatim.
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            # Fresh run: BERT-style init for the decoder, Xavier for the
            # generator's matrices, zeros for its vectors.
            for module in self.decoder.modules():
                if isinstance(module, (nn.Linear, nn.Embedding)):
                    module.weight.data.normal_(mean=0.0, std=0.02)
                elif isinstance(module, nn.LayerNorm):
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
                if isinstance(module, nn.Linear) and module.bias is not None:
                    module.bias.data.zero_()
            for p in self.generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
                else:
                    p.data.zero_()
            if (args.use_bert_emb):
                # Re-seed decoder embeddings from the encoder's word table
                # and re-tie the generator weight to the new table.
                tgt_embeddings = nn.Embedding(
                    self.vocab_size,
                    self.bert.model.config.hidden_size,
                    padding_idx=0)
                tgt_embeddings.weight = copy.deepcopy(
                    self.bert.model.word_embedding.weight)
                self.decoder.embeddings = tgt_embeddings
                self.generator[0].weight = self.decoder.embeddings.weight
        self.to(device)
Exemple #26
0
 def test_model_from_pretrained(self):
     """Smoke-test that the first archived XLNet checkpoint can be
     downloaded and instantiated via ``from_pretrained``.

     Uses a unique temporary cache directory instead of a fixed
     ``/tmp/pytorch_transformers_test/`` path so concurrent runs (or other
     users on the same machine) cannot collide, and cleans it up in a
     ``finally`` block so a failed download no longer leaks the directory.
     """
     import shutil
     import tempfile

     for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
         cache_dir = tempfile.mkdtemp()
         try:
             model = XLNetModel.from_pretrained(model_name,
                                                cache_dir=cache_dir)
         finally:
             # ignore_errors: cleanup must never mask the real failure.
             shutil.rmtree(cache_dir, ignore_errors=True)
         self.assertIsNotNone(model)