def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
    """Exercise XLNetModel with several input variants and verify output/mems shapes."""
    model = XLNetModel(config)
    model.eval()
    # Smoke-test the alternative keyword arguments accepted by forward().
    _, _ = model(input_ids_1, input_mask=input_mask)
    _, _ = model(input_ids_1, attention_mask=input_mask)
    _, _ = model(input_ids_1, token_type_ids=segment_ids)
    outputs, mems_1 = model(input_ids_1)
    result = {"mems_1": mems_1, "outputs": outputs}
    # Hidden states: (batch, seq, hidden).
    self.parent.assertListEqual(
        list(result["outputs"].size()),
        [self.batch_size, self.seq_length, self.hidden_size])
    # One memory tensor per layer, each (seq, batch, hidden).
    expected_mem_shape = [self.seq_length, self.batch_size, self.hidden_size]
    self.parent.assertListEqual(
        [list(mem.size()) for mem in result["mems_1"]],
        [expected_mem_shape] * self.num_hidden_layers)
def __init__(self, temp_dir, load_pretrained, xlnet_config=None):
    """Wrap an XLNetModel: pretrained 'xlnet-base-cased' or a fresh model from config."""
    super(XLNet, self).__init__()
    self.model = (
        XLNetModel.from_pretrained('xlnet-base-cased', cache_dir=temp_dir)
        if load_pretrained
        else XLNetModel(xlnet_config))
def __init__(self, large, temp_dir, finetune=False):
    """Load the large or base cased XLNet checkpoint into self.model."""
    super(XLNet, self).__init__()
    checkpoint_name = 'xlnet-large-cased' if large else 'xlnet-base-cased'
    self.model = XLNetModel.from_pretrained(checkpoint_name, cache_dir=temp_dir)
    self.finetune = finetune
def __init__(
        self,
        config: XLNetConfig,
        sentence_transformer_config: SentenceTransformerConfig = None):
    """Initialize an XLNet-based sentence transformer.

    Runs both parent initializers (XLNetPreTrainedModel and TransformerModel),
    builds the XLNet backbone, then constructs the summary head components,
    each defaulting to nn.Identity() when the config does not enable them.
    """
    XLNetPreTrainedModel.__init__(self, config)
    self.model_config = config
    self.model_hidden_size = config.d_model
    TransformerModel.__init__(self, sentence_transformer_config)
    self.transformer = XLNetModel(config)
    # NOTE(review): init_weights is applied before the summary layers below are
    # created, so those layers keep PyTorch's default init — confirm intended.
    self.apply(self.init_weights)
    ## Code from summary: optional projection / activation / dropout around pooling.
    self.summary = nn.Identity()
    if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
        self.summary = nn.Linear(config.d_model, config.d_model)
    self.activation = nn.Identity()
    if hasattr(
            config,
            'summary_activation') and config.summary_activation == 'tanh':
        self.activation = nn.Tanh()
    self.first_dropout = nn.Identity()
    if hasattr(
            config,
            'summary_first_dropout') and config.summary_first_dropout > 0:
        self.first_dropout = nn.Dropout(config.summary_first_dropout)
    self.last_dropout = nn.Identity()
    if hasattr(config,
               'summary_last_dropout') and config.summary_last_dropout > 0:
        self.last_dropout = nn.Dropout(config.summary_last_dropout)
def __init__(
    self,
    pretrained_model_name_or_path: str = "xlnet-large-cased",
    layers: str = "1",
    pooling_operation: str = "first_last",
    use_scalar_mix: bool = False,
):
    """XLNet embeddings, as proposed in Yang et al., 2019.

    :param pretrained_model_name_or_path: name or path of XLNet model
    :param layers: comma-separated list of layers
    :param pooling_operation: defines pooling operation for subwords
    :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
    """
    super().__init__()
    self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path)
    self.model = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        output_hidden_states=True,
    )
    self.name = pretrained_model_name_or_path
    self.layers: List[int] = [int(layer_id) for layer_id in layers.split(",")]
    self.pooling_operation = pooling_operation
    self.use_scalar_mix = use_scalar_mix
    self.static_embeddings = True

    # Embed a throwaway one-token sentence once, purely to discover the
    # embedding length produced by the chosen layers/pooling configuration.
    probe: Sentence = Sentence()
    probe.add_token(Token("hello"))
    embedded_probe = self.embed(probe)
    self.__embedding_length: int = len(
        embedded_probe[0].get_token(1).get_embedding())
def __init__(self, config):
    """NL2SQL model: frozen XLNet backbone plus trainable task-specific heads
    (tagging, aggregation, connection, condition counts/types, operators)."""
    super(XLNetNL2SQL, self).__init__(config)
    # Label-space sizes for each sub-task head.
    self.num_tag_labels = 2
    self.num_agg_labels = 6
    self.num_connection_labels = 3
    self.num_con_num_labels = 4
    self.num_type_labels = 3
    self.num_sel_num_labels = 4  # {1, 2, 3}
    self.num_where_num_labels = 5  # {1, 2, 3, 4}
    self.num_op_labels = 4
    self.hidden_size = config.hidden_size
    self.pretrain_model = XLNetModel(config)
    # Freeze every parameter registered so far — i.e. only the XLNet backbone;
    # the heads constructed below remain trainable. Do not move this loop.
    for p in self.parameters():
        p.requires_grad = False
    self.dropout = nn.Dropout(config.dropout)
    self.linear_tag = nn.Linear(self.hidden_size * 3, self.num_tag_labels)
    self.linear_agg = nn.Linear(self.hidden_size * 2, self.num_agg_labels)
    self.linear_connection = nn.Linear(self.hidden_size, self.num_connection_labels)
    self.linear_con_num = nn.Linear(self.hidden_size * 2, self.num_con_num_labels)
    self.linear_type = nn.Linear(self.hidden_size * 2, self.num_type_labels)
    self.linear_sel_num = nn.Linear(self.hidden_size, self.num_sel_num_labels)
    self.linear_where_num = nn.Linear(self.hidden_size, self.num_where_num_labels)
    self.values_attention = TableAttention(self.hidden_size, self.hidden_size)
    self.head_attention = ValueAttention(self.hidden_size, self.hidden_size)
    self.linear_op = nn.Linear(self.hidden_size * 2, self.num_op_labels)
def reload(self, bert_model, gpu):
    """(Re)load the XLNet tokenizer and encoder named by ``bert_model``,
    freeze all parameters, and optionally unfreeze the last N layers for
    fine-tuning (controlled by ``self.finetune_tune_last_n``)."""
    from pytorch_transformers import XLNetTokenizer, XLNetModel
    if bert_model.endswith('.tar.gz'):
        # Archive checkpoints keep their vocab in a sibling '-vocab.txt' file.
        self.tokenizer = NoPickle(
            XLNetTokenizer.from_pretrained(bert_model.replace(
                '.tar.gz', '-vocab.txt'), do_lower_case=self.lower))
    else:
        self.tokenizer = NoPickle(
            XLNetTokenizer.from_pretrained(bert_model,
                                           do_lower_case=self.lower))
    self.xlnet = NoPickle(XLNetModel.from_pretrained(bert_model))
    if gpu:
        self.xlnet = self.xlnet.cuda()
    # NOTE(review): XLNetModel normally exposes d_model via .config.d_model,
    # not as a direct attribute — confirm this works for the library version used.
    self.output_dim = self.xlnet.d_model
    # self.max_len = self.xlnet.embeddings.position_embeddings.num_embeddings
    for p in self.xlnet.parameters():
        p.requires_grad = False
    if self.finetune_tune_last_n > 0:
        # NOTE(review): `.encoder.layer` is BERT's module layout; XLNet layers
        # usually live under `.layer` — verify against the loaded model class.
        self.finetune_layers = self.xlnet.encoder.layer[
            -self.finetune_tune_last_n:]
        for p in self.finetune_layers.parameters():
            p.requires_grad = True
def main():
    """Entry point: parse CLI args, build the XLNet text encoder + graph
    encoder multi-task model, optionally train it, and save the checkpoint."""
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    # Refuse to clobber a non-empty output directory unless explicitly allowed.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)
    ## Load Models
    config = XLNetConfig.from_pretrained(args.config_name)
    print('config: {}'.format(config))
    tokenizer = XLNetTokenizer.from_pretrained(
        args.text_encoder_checkpoint, do_lower_case=args.do_lower_case)
    text_encoder = XLNetModel.from_pretrained(args.text_encoder_checkpoint,
                                              config=config)
    graph_encoder = GraphEncoder(args.n_hidden, args.min_score)
    if args.graph_encoder_checkpoint:
        graph_encoder.gcnnet.load_state_dict(
            torch.load(args.graph_encoder_checkpoint))
    # Three task heads share the concatenated text+graph representation.
    medsts_classifier = PairClassifier(config.hidden_size + args.n_hidden, 1)
    medsts_c_classifier = PairClassifier(config.hidden_size + args.n_hidden, 5)
    medsts_type_classifier = PairClassifier(config.hidden_size + args.n_hidden, 4)
    model = MedstsNet(text_encoder, graph_encoder, medsts_classifier,
                      medsts_c_classifier, medsts_type_classifier, config)
    model.to(args.device)
    # NOTE(review): overrides the GPU count detected above — confirm intended.
    args.n_gpu = 1
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name,
                                                tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(
            global_step, tr_loss))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        # Unwrap DataParallel (if wrapped) before saving.
        model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(args.output_dir)
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, 'saved_model.pth'))
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
def __init__(
        self,
        gpu=-1,
        check_for_lowercase=True,
        embeddings_dim=0,
        verbose=True,
        path_to_pretrained="xlnet-base-cased",
        model_frozen=True,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
):
    """Sequence indexer backed by a pretrained (and by default frozen) XLNet.

    Fix: corrected the typo in the final status message
    ("succesifully" -> "successfully"); also dropped redundant parentheses.

    :param gpu: GPU id, or -1 for CPU.
    :param path_to_pretrained: model name or path passed to from_pretrained.
    :param model_frozen: when True (default) all XLNet weights stay frozen.
    """
    SeqIndexerBaseEmbeddings.__init__(
        self, gpu=gpu, check_for_lowercase=check_for_lowercase,
        zero_digits=True, bos_token=bos_token, eos_token=eos_token,
        pad=pad_token, unk=unk_token, sep_token=sep_token,
        cls_token=cls_token, mask_token=mask_token, load_embeddings=True,
        embeddings_dim=embeddings_dim, verbose=verbose, isBert=False,
        isXlNet=True)
    print("create seq indexer Transformers from Model {}".format(
        path_to_pretrained))
    self.xlnet = True
    self.path_to_pretrained = path_to_pretrained
    self.tokenizer = XLNetTokenizer.from_pretrained(path_to_pretrained)
    self.config = XLNetConfig.from_pretrained(path_to_pretrained)
    self.emb = XLNetModel.from_pretrained(path_to_pretrained)
    self.frozen = model_frozen
    # Freeze everything; the per-submodule loop below is redundant with this
    # blanket freeze but kept for safety.
    for param in self.emb.parameters():
        param.requires_grad = False
    for elem in [self.emb.word_embedding, self.emb.layer, self.emb.dropout]:
        for param in elem.parameters():
            param.requires_grad = False
    if not self.frozen:
        # NOTE(review): XLNetModel does not normally expose `.pooler` —
        # confirm this attribute exists before relying on the unfrozen path.
        for param in self.emb.pooler.parameters():
            param.requires_grad = True
    # Inference mode by default (disables dropout).
    self.emb.eval()
    print("XLNET model loaded successfully")
def __init__(self, config, num_choices=1, num_docs_rank=30):
    """Multiple-choice head on XLNet: sequence summary + linear projection.

    ``num_docs_rank`` is accepted but not used here — presumably consumed by
    callers or by code outside this view; TODO confirm.
    """
    super(XLNetForMultipleChoice, self).__init__(config)
    self.num_choices = num_choices
    self.transformer = XLNetModel(config)
    # Pools the token sequence into one vector (strategy chosen via config).
    self.sequence_summary = SequenceSummary(config)
    # One logit per candidate choice.
    self.logits_proj = nn.Linear(config.d_model, num_choices)
    self.apply(self.init_weights)
def __init__(self, config):
    """Token-classification head: per-token linear projection over XLNet
    hidden states, with dropout applied before the projection."""
    super(XLNetForTokenClassification, self).__init__(config)
    self.num_labels = config.num_labels
    self.transformer = XLNetModel(config)
    # Projects each token's d_model-dim hidden state to the label space.
    self.logits_proj = torch.nn.Linear(config.d_model, config.num_labels)
    self.dropout = torch.nn.Dropout(config.dropout)
    self.apply(self.init_weights)
def __init__(self, config):
    """Multi-label sequence classification head: sequence summary pooling
    followed by a linear projection to ``config.num_labels`` outputs."""
    super(XLNetForMultiLabelSequenceClassification, self).__init__(config)
    self.num_labels = config.num_labels
    self.transformer = XLNetModel(config)
    # Pools the sequence into a single vector per example.
    self.sequence_summary = SequenceSummary(config)
    self.logits_proj = torch.nn.Linear(config.d_model, config.num_labels)
    self.apply(self.init_weights)
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: bool = False):
    """Sentence-transformers XLNet module: loads model and tokenizer and
    caches the special-token ids used when building input sequences."""
    super(XLNet, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.max_seq_length = max_seq_length
    self.do_lower_case = do_lower_case
    self.xlnet = XLNetModel.from_pretrained(model_name_or_path)
    self.tokenizer = XLNetTokenizer.from_pretrained(
        model_name_or_path, do_lower_case=do_lower_case)
    # Resolve special-token ids once instead of per call.
    to_ids = self.tokenizer.convert_tokens_to_ids
    self.cls_token_id = to_ids([self.tokenizer.cls_token])[0]
    self.sep_token_id = to_ids([self.tokenizer.sep_token])[0]
def __init__(self, model_path):
    """Build an XLNet encoder from a local directory, loading pretrained
    weights when a checkpoint file is present, then expand its vocabulary."""
    super(OnmtXLNetEncoder, self).__init__()
    config = XLNetConfig.from_json_file(
        os.path.join(model_path, "config.json"))
    weights_path = os.path.join(model_path, "pytorch_model.bin")
    if os.path.exists(weights_path):
        model = XLNetModel.from_pretrained(
            pretrained_model_name_or_path=weights_path, config=config)
        print("init XLNet model with {} weights".format(
            len(model.state_dict())))
    else:
        # No checkpoint: random initialization from the config alone.
        model = XLNetModel(config)
    # Expand the embedding table: 4 extra rows, then 2 more at the end.
    model.word_embedding = expandEmbeddingByN(model.word_embedding, 4)
    model.word_embedding = expandEmbeddingByN(model.word_embedding, 2,
                                              last=True)
    self.encoder = model
    #print(model)
    print("***" * 20)
def load(cls, config_path: Path, model_path: Path, cache_model: bool = True) -> XLNetModel:
    """Load an XLNetModel from explicit config/model paths, with caching.

    Fix: the cache membership test previously checked the ``Path`` object
    itself (``model_path in cls._cache``) while entries are stored under
    ``str(model_path)``, so lookups never hit the cache. The key is now
    computed once and used consistently (also replacing the hard-coded
    class-name access with ``cls``).

    :param config_path: path to the model config.
    :param model_path: path to the model weights; also the cache key.
    :param cache_model: when True, store the loaded model for reuse.
    """
    key = str(model_path)
    if key in cls._cache:
        return cls._cache[key]
    config = XLNetConfig.from_pretrained(str(config_path))
    model = XLNetModel.from_pretrained(key, config=config)
    if cache_model:
        cls._cache[key] = model
    return model
def __init__(self, chunck_size=64, max_length=35,
             device=torch.device('cuda:0')):
    """Client wrapping a pretrained xlnet-large-cased model in eval mode on
    the requested device."""
    super(XLNetClient, self).__init__()
    # Plain configuration first.
    self.chunck_size = chunck_size
    self.max_length = max_length
    self.device = device
    # Then the heavyweight artifacts.
    self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
    self.model = XLNetModel.from_pretrained('xlnet-large-cased')
    self.model.eval()
    self.model.to(self.device)
def __init__(self, opt):
    """Build tokenizer, model, and datasets for the configured architecture.

    Fix: the branch guard was ``if 'bert' or 'xlnet' in opt.model_name:``,
    which is always truthy (the literal 'bert' is a non-empty string), so the
    else-branch building a plain tokenizer + embedding matrix was unreachable.
    It now tests membership of both substrings. Dead commented-out code was
    removed.
    """
    self.opt = opt
    if 'roberta' in opt.pretrained_bert_name:
        tokenizer = RobertaTokenizer.from_pretrained(
            opt.pretrained_bert_name)
        transformer = RobertaModel.from_pretrained(
            opt.pretrained_bert_name, output_attentions=True)
    elif 'bert' in opt.pretrained_bert_name:
        tokenizer = BertTokenizer.from_pretrained(opt.pretrained_bert_name)
        transformer = BertModel.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
    elif 'xlnet' in opt.pretrained_bert_name:
        tokenizer = XLNetTokenizer.from_pretrained(
            opt.pretrained_bert_name)
        transformer = XLNetModel.from_pretrained(opt.pretrained_bert_name,
                                                 output_attentions=True)
    if 'bert' in opt.model_name or 'xlnet' in opt.model_name:
        # Pretrained-transformer path: wrap the HF tokenizer and hand the
        # transformer to the model class.
        tokenizer = Tokenizer4Pretrain(tokenizer, opt.max_seq_len)
        self.model = opt.model_class(transformer, opt).to(opt.device)
    else:
        # Classic path: build a vocabulary and a static embedding matrix.
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        # Carve a validation split out of the training set.
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()
def __init__(self, args, dictionary, embed_tokens, left_pad=True):
    """Fairseq-style transformer encoder whose token representations are
    augmented by a pretrained XLNet model (downloaded once per node via
    distributed barriers)."""
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.n_gpu = torch.cuda.device_count()
    print('Distributed rank: ', args.distributed_rank)
    print('Number of used GPU: ', self.n_gpu)
    # if self.n_gpu > 1:
    #     torch.distributed.barrier()
    # Non-zero ranks wait here so only rank 0 downloads model & vocab.
    if args.distributed_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    # Load pre-trained model (weights)
    config = XLNetConfig.from_pretrained(args.xlnet_model)
    self.xlnet = XLNetModel.from_pretrained(args.xlnet_model, config=config)
    # Rank 0 releases the other ranks after its download completes.
    if args.distributed_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab
    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions
    self.embed_tokens = embed_tokens
    # Standard transformer embedding scale: sqrt(d_model).
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions,
        embed_dim,
        self.padding_idx,
        left_pad=left_pad,
        learned=args.encoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None
    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerEncoderLayer(args) for i in range(args.encoder_layers)
    ])
    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(self, config):
    """Sentiment analysis model: XLNet encoder topped by stacked bidirectional
    GRU layers with per-layer linear transforms.

    NOTE(review): reads a module-level ``args`` for the GRU/LSTM
    hyperparameters and calls ``.cuda()`` unconditionally — confirm ``args``
    is in scope and a GPU is required at construction time.
    """
    super(XLNet_SenAnalysis, self).__init__(config)
    self.num_labels = config.num_labels
    self.dropout = nn.Dropout(args.lstm_dropout)
    self.transformer = XLNetModel(config)
    self.sequence_summary = SequenceSummary(config)
    # Final projection consumes the bidirectional GRU output (2 * hidden).
    self.logits_proj = nn.Linear(args.lstm_hidden_size * 2, config.num_labels)
    self.W = []
    self.gru = []
    for i in range(args.lstm_layers):
        self.W.append(
            nn.Linear(args.lstm_hidden_size * 2, args.lstm_hidden_size * 2))
        # Layer 0 consumes transformer hidden states; deeper layers consume
        # the doubled (bidirectional) output of the previous GRU layer.
        self.gru.append(nn.GRU(config.hidden_size if i == 0 else args.lstm_hidden_size * 4,
                               args.lstm_hidden_size, num_layers=1, bidirectional=True,
                               batch_first=True).cuda())
    self.W = nn.ModuleList(self.W)
    self.gru = nn.ModuleList(self.gru)
    self.init_weights()
def __init__(self, args, device, checkpoint):
    """Extractive summarizer: XLNet sentence encoder plus a transformer-based
    extraction layer; supports a small randomly-initialized 'baseline' encoder
    and extending position embeddings beyond 512 tokens."""
    super(ExtSummarizer, self).__init__()
    self.args = args
    self.device = device
    self.bert = XLNet(args.large, args.temp_dir, args.finetune_bert)
    self.ext_layer = ExtTransformerEncoder(
        self.bert.model.config.hidden_size, args.ext_ff_size, args.ext_heads,
        args.ext_dropout, args.ext_layers)
    if (args.encoder == 'baseline'):
        # Swap the pretrained encoder for a small random one + simple classifier.
        bert_config = XLNetConfig(self.bert.model.config.vocab_size,
                                  hidden_size=args.ext_hidden_size,
                                  num_hidden_layers=args.ext_layers,
                                  num_attention_heads=args.ext_heads,
                                  intermediate_size=args.ext_ff_size)
        self.bert.model = XLNetModel(bert_config)
        self.ext_layer = Classifier(self.bert.model.config.hidden_size)
    if (args.max_pos > 512):
        # Extend position embeddings by repeating the last learned row.
        # NOTE(review): `.embeddings.position_embeddings` is BERT's module
        # layout; XLNetModel does not normally expose it — confirm this
        # branch is exercised with the wrapped model.
        my_pos_embeddings = nn.Embedding(
            args.max_pos, self.bert.model.config.hidden_size)
        my_pos_embeddings.weight.data[:
                                      512] = self.bert.model.embeddings.position_embeddings.weight.data
        my_pos_embeddings.weight.data[
            512:] = self.bert.model.embeddings.position_embeddings.weight.data[
                -1][None, :].repeat(args.max_pos - 512, 1)
        self.bert.model.embeddings.position_embeddings = my_pos_embeddings
    if checkpoint is not None:
        self.load_state_dict(checkpoint['model'], strict=True)
    else:
        # Fresh init for the extraction layer only.
        if args.param_init != 0.0:
            for p in self.ext_layer.parameters():
                p.data.uniform_(-args.param_init, args.param_init)
        if args.param_init_glorot:
            for p in self.ext_layer.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
    self.to(device)
def __init__(self, num_labels=2, model_type='xlnet-base-cased',
             token_layer='token-cls', output_logits=True):
    """XLNet-based word-sense-disambiguation classifier.

    Fix: ``self.num_labels`` was hard-coded to 2, silently ignoring the
    ``num_labels`` argument (which the classifier layer below *did* use,
    making the two inconsistent). It now honors the parameter; the default
    is unchanged.

    :param num_labels: number of output classes.
    :param model_type: pretrained checkpoint name for the backbone.
    :param token_layer: 'token-cls', 'sent-cls', or 'sent-cls-ws'.
    :param output_logits: when False, a softmax layer is attached.
    """
    super(XLNetForWSD, self).__init__()
    self.config = XLNetConfig()
    self.token_layer = token_layer
    self.num_labels = num_labels
    self.xlnet = XLNetModel.from_pretrained(model_type)
    # NOTE(review): XLNetConfig usually names its dropout `dropout`, not
    # `hidden_dropout_prob` — confirm this attribute exists on this version.
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.output_logits = output_logits
    # Define which token selection layer to use
    if token_layer == 'token-cls':
        self.tokenselectlayer = TokenClsLayer()
    elif token_layer in ['sent-cls', 'sent-cls-ws']:
        self.tokenselectlayer = SentClsLayer()
    else:
        raise ValueError("Unidentified parameter for token selection layer")
    # NOTE(review): 768 assumes the base model's hidden size; large
    # checkpoints use 1024 — consider deriving from the loaded config.
    self.classifier = nn.Linear(768, num_labels)
    if not output_logits:
        self.softmax = nn.Softmax(dim=1)  # to be checked!!!
    nn.init.xavier_normal_(self.classifier.weight)
def test_xlnet_embeddings():
    """Integration test: flair XLNetEmbeddings output must match the raw
    XLNetModel hidden states for every pooling operation, and multi-layer /
    scalar-mix configurations must produce the expected embedding sizes."""
    xlnet_model = 'xlnet-large-cased'
    tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)
    model = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=xlnet_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()
    s = 'Berlin and Munich have a lot of puppeteer to see .'
    with torch.no_grad():
        # Reference path: tokenize with explicit sentence boundary markers
        # and pull the per-layer hidden states directly from the model.
        tokens = tokenizer.tokenize((('<s>' + s) + '</s>'))
        print(tokens)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[(-1)]
        first_layer = hidden_states[1][0]
    assert (len(first_layer) == len(tokens))

    def embed_sentence(sentence: str, pooling_operation, layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        # Embed a single flair Sentence with the configuration under test.
        embeddings = XLNetEmbeddings(
            pretrained_model_name_or_path=xlnet_model, layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    # 'first' pooling: token embedding equals its first subword's embedding.
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation='first')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)
    # 'last' pooling: token embedding equals its last subword's embedding.
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation='last')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)
    # 'first_last' pooling: concatenation of first and last subword embeddings.
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation='first_last')
    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)
    # 'mean' pooling: mean of all subword embeddings of a token.
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)
    # Four stacked layers -> embedding size is 4 * d_model.
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = (4 * model.d_model)
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)
    # Scalar mix collapses the four layers into a single d_model vector.
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)
    ref_embedding_size = (1 * model.d_model)
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)
def test_xlnet_embeddings():
    """Integration test (typed variant): flair XLNetEmbeddings must reproduce
    the raw XLNetModel hidden states for each pooling operation and yield the
    expected sizes for multi-layer and scalar-mix configurations."""
    xlnet_model: str = "xlnet-large-cased"
    tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)
    model = XLNetModel.from_pretrained(
        pretrained_model_name_or_path=xlnet_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()
    s: str = "Berlin and Munich have a lot of puppeteer to see ."
    with torch.no_grad():
        # Reference path: explicit boundary markers, raw hidden states.
        tokens = tokenizer.tokenize("<s>" + s + "</s>")
        print(tokens)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]
    assert len(first_layer) == len(tokens)
    #      0        1          2       3          4       5     6      7       8          9      10     11      12   13    14
    #
    # '<s>', '▁Berlin', '▁and', '▁Munich', '▁have', '▁a', '▁lot', '▁of', '▁puppet', 'eer', '▁to', '▁see', '▁', '.', '</s>'
    #            |         |        |         |      |      |      |       \     /     |      |       \    /
    #          Berlin     and    Munich     have     a     lot     of     puppeteer    to    see        .
    #
    #            0         1        2         3      4      5       6         7         8     9         10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        # Embed a single flair Sentence with the configuration under test.
        embeddings = XLNetEmbeddings(
            model=xlnet_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)
    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)
    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")
    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)
    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")
    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)
    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")
    ref_embedding_size = 4 * model.d_model
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size
    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )
    ref_embedding_size = 1 * model.d_model
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size
'''
@Time    : 2019-06-16 11:34:23
@Author  : su.zhu
@Desc    : 
'''
import torch
from pytorch_transformers import XLNetModel, XLNetTokenizer

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')
input_ids = torch.tensor([tokenizer.encode("Here is some text to encode")])
# Forward pass: element 0 of the output tuple is the last-layer hidden states.
last_hidden_states = model(input_ids)[0]
print(model.config)
print(last_hidden_states.size())
# Tokenized input
text_a = "Who was Jim Henson ?"
text_b = "Jim Henson was a puppeteer"
tokens_a = tokenizer.tokenize(text_a)
tokens_b = tokenizer.tokenize(text_b)
# NOTE(review): '[CLS]'/'[SEP]' are BERT-style markers; XLNet's specials are
# '<cls>'/'<sep>' — confirm these literals are intended here.
cls_token = '[CLS]'
sep_token = '[SEP]'
# Only tokens_a is used below; tokens_b is prepared but not appended.
tokens = tokens_a + ['[SEP]']
segment_ids = [0] * len(tokens)
def __init__(self, args, device, checkpoint=None, bert_from_extractive=None):
    """Abstractive summarizer: XLNet encoder + transformer decoder with tied
    generator/embedding weights; optionally warm-started from an extractive
    checkpoint and with position embeddings extended past 512 tokens."""
    super(AbsSummarizer, self).__init__()
    self.args = args
    self.device = device
    self.bert = XLNet(args.large, args.temp_dir, args.finetune_bert)
    if bert_from_extractive is not None:
        # Copy encoder weights from an extractive model; its keys carry the
        # 'bert.model.' prefix (11 chars), which is stripped here.
        self.bert.model.load_state_dict(dict([
            (n[11:], p) for n, p in bert_from_extractive.items()
            if n.startswith('bert.model')
        ]), strict=True)
    if (args.encoder == 'baseline'):
        # Replace the pretrained encoder with a small random one.
        bert_config = XLNetConfig(
            self.bert.model.config.vocab_size,
            hidden_size=args.enc_hidden_size,
            num_hidden_layers=args.enc_layers,
            num_attention_heads=8,
            intermediate_size=args.enc_ff_size,
            hidden_dropout_prob=args.enc_dropout,
            attention_probs_dropout_prob=args.enc_dropout)
        self.bert.model = XLNetModel(bert_config)
    if (args.max_pos > 512):
        # Extend position embeddings by repeating the last learned row.
        # NOTE(review): `.embeddings.position_embeddings` is BERT's layout;
        # XLNetModel does not normally expose it — confirm this branch works
        # with the wrapped model.
        my_pos_embeddings = nn.Embedding(
            args.max_pos, self.bert.model.config.hidden_size)
        my_pos_embeddings.weight.data[:
                                      512] = self.bert.model.embeddings.position_embeddings.weight.data
        my_pos_embeddings.weight.data[
            512:] = self.bert.model.embeddings.position_embeddings.weight.data[
                -1][None, :].repeat(args.max_pos - 512, 1)
        self.bert.model.embeddings.position_embeddings = my_pos_embeddings
    self.vocab_size = self.bert.model.config.vocab_size
    tgt_embeddings = nn.Embedding(self.vocab_size,
                                  self.bert.model.config.hidden_size,
                                  padding_idx=0)
    if (self.args.share_emb):
        # NOTE(review): uses `.embeddings.word_embeddings` here but
        # `.word_embedding` in the use_bert_emb branch below — verify which
        # attribute this model actually exposes.
        tgt_embeddings.weight = copy.deepcopy(
            self.bert.model.embeddings.word_embeddings.weight)
    self.decoder = TransformerDecoder(self.args.dec_layers,
                                      self.args.dec_hidden_size,
                                      heads=self.args.dec_heads,
                                      d_ff=self.args.dec_ff_size,
                                      dropout=self.args.dec_dropout,
                                      embeddings=tgt_embeddings)
    self.generator = get_generator(self.vocab_size,
                                   self.args.dec_hidden_size, device)
    # Tie output projection to decoder embeddings.
    self.generator[0].weight = self.decoder.embeddings.weight
    if checkpoint is not None:
        self.load_state_dict(checkpoint['model'], strict=True)
    else:
        # Fresh decoder init: normal for linear/embedding weights; standard
        # LayerNorm init; zero biases.
        for module in self.decoder.modules():
            if isinstance(module, (nn.Linear, nn.Embedding)):
                module.weight.data.normal_(mean=0.0, std=0.02)
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        for p in self.generator.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)
            else:
                p.data.zero_()
        if (args.use_bert_emb):
            # Re-tie decoder embeddings to the encoder's word embeddings.
            tgt_embeddings = nn.Embedding(
                self.vocab_size,
                self.bert.model.config.hidden_size,
                padding_idx=0)
            tgt_embeddings.weight = copy.deepcopy(
                self.bert.model.word_embedding.weight)
            self.decoder.embeddings = tgt_embeddings
            self.generator[0].weight = self.decoder.embeddings.weight
    self.to(device)
def test_model_from_pretrained(self):
    """Download the first archived XLNet checkpoint into a throwaway cache
    directory and verify that it loads to a non-None model."""
    cache_dir = "/tmp/pytorch_transformers_test/"
    # Only exercise the first entry of the archive map to keep the test fast.
    for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)[:1]:
        model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir)
        # Clean the cache immediately so repeated runs start fresh.
        shutil.rmtree(cache_dir)
        self.assertIsNotNone(model)