def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
    model = TransfoXLModel(config)
    model.eval()
    hidden_states_1, mems_1 = model(input_ids_1)
    hidden_states_2, mems_2 = model(input_ids_2, mems_1)
    outputs = {
        "hidden_states_1": hidden_states_1,
        "mems_1": mems_1,
        "hidden_states_2": hidden_states_2,
        "mems_2": mems_2,
    }
    return outputs
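# A minimal sketch of how this test helper might be driven. The tiny config
# values and random inputs below are assumptions chosen to keep a unit test
# fast, not sizes used by any original test suite:
config = TransfoXLConfig(vocab_size_or_config_json_file=100, cutoffs=[50],
                         d_model=32, d_embed=32, n_head=4, d_head=8,
                         d_inner=64, div_val=2, n_layer=2, mem_len=30)
input_ids_1 = torch.randint(0, 100, (2, 10))  # (batch, seq_len)
input_ids_2 = torch.randint(0, 100, (2, 10))
outputs = self.create_transfo_xl_model(config, input_ids_1, input_ids_2, lm_labels=None)
# outputs["mems_1"] holds the per-layer memory that the helper feeds into the second call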
def test_model_from_pretrained(self):
    cache_dir = "/tmp/pytorch_pretrained_bert_test/"
    for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir)
        shutil.rmtree(cache_dir)
        self.assertIsNotNone(model)
def construct_encoder(self):
    model = TransfoXLModel.from_pretrained(self.model_name)
    model.cuda()
    model = torch.nn.DataParallel(model)
    model.eval()
    tokenizer = TransfoXLTokenizer.from_pretrained(self.model_name)
    print("Model and tokenizer are constructed!")
    return model, tokenizer
def __init__(self, name, **kwargs):
    super(TXLEmbeddings, self).__init__(name=name, **kwargs)
    global TXL_TOKENIZER
    self.dsz = kwargs.get('dsz')
    if TXL_TOKENIZER is None:
        TXL_TOKENIZER = TransfoXLTokenizer.from_pretrained(kwargs.get('embed_file'))
    self.model = TransfoXLModel.from_pretrained(kwargs.get('embed_file'))
    self.vocab = TXL_TOKENIZER.sym2idx
    self.vsz = len(TXL_TOKENIZER.sym2idx)
def get_xl_layer_representations(seq_len, text_array, remove_chars, word_ind_to_extract):
    model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model.eval()

    # get the token embeddings
    token_embeddings = []
    for word in text_array:
        current_token_embedding = get_xl_token_embeddings([word], tokenizer, model, remove_chars)
        token_embeddings.append(np.mean(current_token_embedding.detach().numpy(), 1))

    # where to store layer-wise xl embeddings of particular length
    XL = {}
    for layer in range(19):
        XL[layer] = []
    XL[-1] = token_embeddings

    if word_ind_to_extract < 0:
        # the index is specified from the end of the array, so invert the index
        from_start_word_ind_to_extract = seq_len + word_ind_to_extract
    else:
        from_start_word_ind_to_extract = word_ind_to_extract

    start_time = tm.time()

    # before we've seen enough words to make up the sequence length,
    # add the representation for the last word 'seq_len' times
    word_seq = text_array[:seq_len]
    for _ in range(seq_len):
        XL = add_avrg_token_embedding_for_specific_word(word_seq, tokenizer, model, remove_chars,
                                                        from_start_word_ind_to_extract, XL)

    # then add the embedding of the last word in a sequence as the embedding for the sequence
    for end_curr_seq in range(seq_len, len(text_array)):
        word_seq = text_array[end_curr_seq - seq_len + 1:end_curr_seq + 1]
        XL = add_avrg_token_embedding_for_specific_word(word_seq, tokenizer, model, remove_chars,
                                                        from_start_word_ind_to_extract, XL)
        if end_curr_seq % 100 == 0:
            print('Completed {} out of {}: {}'.format(end_curr_seq, len(text_array), tm.time() - start_time))
            start_time = tm.time()

    print('Done extracting sequences of length {}'.format(seq_len))
    return XL
def get_hidden(tokens_tensor):
    # Load pre-trained model (weights)
    model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
    model.eval()

    # If you have a GPU, put everything on cuda
    if torch.cuda.is_available():
        tokens_tensor = tokens_tensor.to('cuda')
        model.to('cuda')

    with torch.no_grad():
        # Predict hidden states features for each layer
        hidden_states, mems = model(tokens_tensor)
        # We can re-use the memory cells in a subsequent call to attend a longer context
        # hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
    return hidden_states
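# A usage sketch for get_hidden; the tokenizer setup is an assumption that
# mirrors the other snippets in this collection (any text works):
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Who was Jim Henson ?"))
tokens_tensor = torch.tensor([ids])   # shape (1, seq_len)
hidden = get_hidden(tokens_tensor)    # last-layer states, shape (1, seq_len, 1024)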
text_1 = "Who was Jim Henson ?" text_2 = "Jim Henson was a puppeteer" tokenized_text_1 = tokenizer.tokenize(text_1); print(tokenized_text_1) # ['Who', 'was', 'Jim', 'Henson', '?'] print(tokenizer.tokenize("who was jim henson ?")) tokenized_text_2 = tokenizer.tokenize(text_2) indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1); print(indexed_tokens_1) # [2517, 11, 1666, 12034, 788] print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("who was jim henson ?"))) # [52, 11, 24, 24, 788]; 也是 case sensitive indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) tokens_tensor_1 = torch.tensor([indexed_tokens_1]) tokens_tensor_2 = torch.tensor([indexed_tokens_2]) ################################################################## ## TransfoXLModel model = TransfoXLModel.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/transfo-xl-wt103') model.eval() with torch.no_grad(): hidden_states_1, mems_1 = model(tokens_tensor_1) # Predict hidden states features for each layer hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1) # We can re-use the memory cells in a subsequent call to attend a longer context ################################################################## ## TransfoXLLMHeadModel model = TransfoXLLMHeadModel.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/transfo-xl-wt103/') model.eval() with torch.no_grad(): predictions_1, mems_1 = model(tokens_tensor_1) # Predict all tokens predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1) # We can re-use the memory cells in a subsequent call to attend a longer context
text_1 = "Who was Jim Henson ?" text_2 = "Jim Henson was a puppeteer" tokenized_text_1 = tokenizer.tokenize(text_1) tokenized_text_2 = tokenizer.tokenize(text_2) # Convert token to vocabulary indices indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1) indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2) # Convert inputs to PyTorch tensors tokens_tensor_1 = torch.tensor([indexed_tokens_1]) tokens_tensor_2 = torch.tensor([indexed_tokens_2]) print('tokenized_text_1', tokenized_text_1) print('tokens_tensor_1', tokens_tensor_1) # Load pre-trained model (weights) model = TransfoXLModel.from_pretrained('transfo-xl-wt103') model.eval() # If you have a GPU, put everything on cuda if torch.cuda.is_available(): tokens_tensor_1 = tokens_tensor_1.to('cuda') tokens_tensor_2 = tokens_tensor_2.to('cuda') model.to('cuda') with torch.no_grad(): # Predict hidden states features for each layer hidden_states_1, mems_1 = model(tokens_tensor_1) print(hidden_states_1, mems_1) # We can re-use the memory cells in a subsequent call to attend a longer context hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1) print(hidden_states_2, mems_2)
def __init__(self, n_layers, in_size, out_size, embed_size, dropout=0.5,
             initialEmbW=None, rnn_type='lstm', attention=None, q_size=-1,
             embedding_init=None, weights_init=None, elmo_init=False,
             elmo_num_outputs=1, finetune_elmo=False, bert_init=False,
             bert_model=None, finetune_bert=False, add_word_emb=True):
    """Initialize encoder with structure parameters

    Args:
        n_layers (int): Number of layers.
        in_size (int): Dimensionality of input vectors.
        out_size (int): Dimensionality of hidden vectors to be output.
        embed_size (int): Dimensionality of word embedding.
        dropout (float): Dropout ratio.
    """
    # TODO
    conv_out_size = 512
    super(LSTMEncoder, self).__init__()
    self.embed = nn.Embedding(in_size, embed_size)
    if embedding_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(embedding_init))
    elif weights_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(weights_init['embed']))

    self.elmo_init = elmo_init
    self.bert_init = bert_init
    self.bert_model = bert_model
    self.add_word_emb = add_word_emb
    if elmo_init:
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        self.elmo = Elmo(options_file, weight_file, elmo_num_outputs,
                         requires_grad=finetune_elmo)
        elmo_layer = [nn.Linear(elmo_num_outputs * 1024, out_size), nn.ReLU()]
        self.elmo_layer = nn.Sequential(*elmo_layer)
    elif bert_init:
        if 'bert' in bert_model:
            self.bert = BertModel.from_pretrained(bert_model)
        elif 'openai-gpt' in bert_model:
            self.bert = OpenAIGPTModel.from_pretrained(bert_model)
        elif 'gpt2' in bert_model:
            self.bert = GPT2Model.from_pretrained(bert_model)
        elif 'transfo-xl' in bert_model:
            self.bert = TransfoXLModel.from_pretrained(bert_model)
        self.finetune_bert = finetune_bert
        if not finetune_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        if bert_model in ['bert-base-uncased', 'openai-gpt', 'gpt2']:
            bert_in = 768
        elif bert_model in ['bert-large-uncased', 'gpt2-medium', 'transfo-xl-wt103']:
            bert_in = 1024
        bert_layer = [nn.Linear(bert_in, out_size), nn.ReLU()]
        self.bert_layer = nn.Sequential(*bert_layer)

    if rnn_type == 'lstm':
        self.lstm = nn.LSTM(embed_size, out_size, n_layers,
                            batch_first=True, dropout=dropout)
    elif rnn_type == 'gru':
        self.lstm = nn.GRU(embed_size, out_size, n_layers,
                           batch_first=True, dropout=dropout)

    self.attention = attention
    if attention == 'conv' or attention == 'conv_sum':
        conv_in_size = out_size
        self.conv1 = nn.Conv1d(in_channels=conv_in_size, out_channels=conv_out_size,
                               kernel_size=1, padding=0)
        self.conv2 = nn.Conv1d(in_channels=conv_out_size, out_channels=2,
                               kernel_size=1, padding=0)
        if weights_init is not None:
            self.conv1.weight.data.copy_(torch.from_numpy(weights_init['conv1']))
            self.conv2.weight.data.copy_(torch.from_numpy(weights_init['conv2']))
    elif attention == 'c_conv_sum':
        hidden_size = 512
        conv_hidden_size = 256
        layers = [weight_norm(nn.Linear(out_size, hidden_size), dim=None), nn.ReLU()]
        self.c_fa = nn.Sequential(*layers)
        layers = [weight_norm(nn.Linear(q_size, hidden_size), dim=None), nn.ReLU()]
        self.q_fa = nn.Sequential(*layers)
        layers = [
            nn.Conv2d(in_channels=hidden_size, out_channels=conv_hidden_size, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=conv_hidden_size, out_channels=1, kernel_size=1)
        ]
        self.cq_att = nn.Sequential(*layers)
        if weights_init is not None:
            self.c_fa[0].weight.data.copy_(torch.from_numpy(weights_init['c_fa']))
            self.q_fa[0].weight.data.copy_(torch.from_numpy(weights_init['q_fa']))
            self.cq_att[0].weight.data.copy_(torch.from_numpy(weights_init['cq_att_conv1']))
            self.cq_att[2].weight.data.copy_(torch.from_numpy(weights_init['cq_att_conv2']))
        output = output.reshape(output.shape[0], -1, output.shape[1])
        print(output.shape)
        output = np.swapaxes(output, 0, 1)
        list_output.append(output)

        # ====== Construct Cache ====== #
        temp_cache = {}
        for i, sent in enumerate(mini_batch):
            hash_key = hashlib.sha256(sent.encode()).hexdigest()
            temp_cache[hash_key] = output[i]
        self.cache.update(temp_cache)
        idx += mini_batch_size
        self.count += mini_batch_size

    output = np.concatenate(list_output, 0)
    te = time.time()
    print('encoding with model', len(sentences), 'processed', self.count,
          'took', '{:4.1f}'.format(te - ts))
    te = time.time()
    embedding = self.get_multi_head_embedding(output, heads, head_size)
    return embedding


if __name__ == '__main__':
    # Note: this TransfoXLModel appears to be the wrapper class defined in this
    # file (it has prepare/construct_encoder methods), not the
    # pytorch_pretrained_bert class of the same name.
    model = TransfoXLModel('bert-base-uncased')
    model.prepare('Length')
    model.construct_encoder()
def __init__(self, args):
    super(QAxl, self).__init__()

    hidden_size = args['hidden_size']
    dropout = args['dropout']
    attention_size = args['attention_size']
    word_emb = np.array(read_json(args['data_dir'] + 'word_emb.json'), dtype=np.float32)
    word_size = word_emb.shape[0]
    word_dim = word_emb.shape[1]
    char_dim = args['char_dim']
    char_len = len(read_json(args['data_dir'] + 'char2id.json'))
    pos_dim = args['pos_dim']
    ner_dim = args['ner_dim']
    self.args = args
    self.train_loss = AverageMeter()
    self.use_cuda = args['use_cuda']
    self.use_xl = args['use_xl']

    if self.use_xl:
        self.xl = TransfoXLModel.from_pretrained('transfo-xl-wt103')
    xl_dim = 1024

    ## Embedding Layer
    print('Building embedding...')
    self.word_embeddings = nn.Embedding(word_emb.shape[0], word_dim, padding_idx=0)
    self.word_embeddings.weight.data = torch.from_numpy(word_emb)
    self.char_embeddings = nn.Embedding(char_len, char_dim, padding_idx=0)
    self.pos_embeddings = nn.Embedding(args['pos_size'], args['pos_dim'], padding_idx=0)
    self.ner_embeddings = nn.Embedding(args['ner_size'], args['ner_dim'], padding_idx=0)

    with open(args['data_dir'] + 'tune_word_idx.pkl', 'rb') as f:
        tune_idx = pkl.load(f)
    self.fixed_idx = list(set([i for i in range(word_size)]) - set(tune_idx))
    fixed_embedding = torch.from_numpy(word_emb)[self.fixed_idx]
    self.register_buffer('fixed_embedding', fixed_embedding)
    self.fixed_embedding = fixed_embedding

    low_p_dim = word_dim + word_dim + args['pos_dim'] + args['ner_dim'] + 4
    low_q_dim = word_dim + args['pos_dim'] + args['ner_dim']
    if self.use_xl:
        low_p_dim += xl_dim
        low_q_dim += xl_dim

    self.emb_char = Embedding(word_dim, char_dim, hidden_size)

    ## Forward Layers Declaration
    high_p_dim = 2 * hidden_size
    full_q_dim = 2 * high_p_dim
    attention_dim = word_dim + full_q_dim
    if self.use_xl:
        attention_dim += xl_dim

    self.word_attention_layer = WordAttention(word_dim, attention_size, dropout)

    self.low_rnn = StackedPaddedRNN(low_p_dim, hidden_size, 1, dropout=dropout)
    self.high_rnn = StackedPaddedRNN(high_p_dim, hidden_size, 1, dropout=dropout)
    self.full_rnn = StackedPaddedRNN(full_q_dim, hidden_size, 1, dropout=dropout)

    self.low_attention_layer = MultiAttention(attention_dim, attention_size, dropout)
    self.high_attention_layer = MultiAttention(attention_dim, attention_size, dropout)
    self.full_attention_layer = MultiAttention(attention_dim, attention_size, dropout)

    ## Fusion Layer and Final Attention + Final RNN
    fuse_dim = 10 * hidden_size
    self_attention_dim = 12 * hidden_size + word_dim + ner_dim + pos_dim + 1
    if self.use_xl:
        self_attention_dim += xl_dim

    self.fuse_rnn = StackedPaddedRNN(fuse_dim, hidden_size, 1, dropout=dropout)
    self.self_attention_layer = MultiAttention(self_attention_dim, attention_size, dropout)
    self.self_rnn = StackedPaddedRNN(4 * hidden_size, hidden_size, 1, dropout=dropout)

    ## Verifier and output
    self.summ_layer = PointerS(2 * hidden_size, dropout=dropout, use_cuda=self.use_cuda)
    self.summ_layer2 = PointerS(2 * hidden_size, dropout=dropout, use_cuda=self.use_cuda)
    self.pointer_layer = PointerNet(2 * hidden_size, use_cuda=self.use_cuda)
    self.has_ans = nn.Sequential(nn.Dropout(p=dropout), nn.Linear(6 * hidden_size, 2))
def tokens2ids(text):
    # context
    tokenized_text = [tokenizer.tokenize(sent) for sent in text]
    tokenized_text = paddingText(tokenized_text)
    # Convert token to vocabulary indices
    indexed_tokens = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_text]
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor(indexed_tokens)
    return tokens_tensor


model = TransfoXLModel.from_pretrained('./pretrained_model')
model.eval()
if torch.cuda.is_available():
    model = model.cuda()


def ids2embeddings(ids):
    if torch.cuda.is_available():
        ids = ids.cuda()
    with torch.no_grad():
        hidden_state, mems = model(ids)
    return hidden_state


def genBatch(mode='train', bsz=2, ismasked=True):
    if mode == 'train':
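# A minimal usage sketch for the two helpers above; the sample batch is an
# assumption, and paddingText is expected to pad every token list in the
# batch to a common length before conversion:
batch = ["Who was Jim Henson ?", "Jim Henson was a puppeteer"]
ids = tokens2ids(batch)      # LongTensor, shape (batch, padded_seq_len)
emb = ids2embeddings(ids)    # hidden states, shape (batch, padded_seq_len, d_model)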
def __init__(self, n_layers, in_size, out_size, embed_size, in_size_hier,
             hidden_size, proj_size, dropout=0.5, initialEmbW=None,
             independent=False, rnn_type='lstm', classifier='baseline',
             states_att=False, state_size=-1, embedding_init=None,
             weights_init=None, elmo_init=False, elmo_num_outputs=1,
             finetune_elmo=False, bert_init=False, bert_model=None,
             finetune_bert=False, add_word_emb=True, pretrained_all=True):
    """Initialize encoder with structure parameters

    Args:
        n_layers (int): Number of layers.
        in_size (int): Dimensionality of input vectors.
        out_size (int): Dimensionality of output vectors.
        embed_size (int): Dimensionality of word embedding.
        hidden_size (int): Dimensionality of hidden vectors.
        proj_size (int): Dimensionality of projection before softmax.
        dropout (float): Dropout ratio.
    """
    # TODO
    att_size = 128
    self.rnn_type = rnn_type
    self.classifier = classifier
    super(HLSTMDecoder, self).__init__()
    self.embed = nn.Embedding(in_size, embed_size)
    if embedding_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(embedding_init))
    elif weights_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(weights_init['embed']))

    if rnn_type == 'lstm':
        self.lstm = nn.LSTM(embed_size + in_size_hier, hidden_size, n_layers,
                            batch_first=True, dropout=dropout)
    elif rnn_type == 'gru':
        self.lstm = nn.GRU(embed_size + in_size_hier, hidden_size, n_layers,
                           batch_first=True, dropout=dropout)
    if weights_init is not None:
        lstm_wt = weights_init['lstm']
        for k, v in lstm_wt.items():
            self.lstm.__getattr__(k).data.copy_(torch.from_numpy(v))

    self.elmo_init = elmo_init
    self.bert_init = bert_init
    self.pretrained_all = pretrained_all
    self.bert_model = bert_model
    self.add_word_emb = add_word_emb
    if False:  # if pretrained_all and elmo_init:
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        self.elmo = Elmo(options_file, weight_file, elmo_num_outputs,
                         requires_grad=finetune_elmo)
        elmo_layer = [nn.Linear(elmo_num_outputs * 1024, out_size), nn.ReLU()]
        self.elmo_layer = nn.Sequential(*elmo_layer)
    elif False:  # elif pretrained_all and bert_init:
        if 'bert' in bert_model:
            self.bert = BertModel.from_pretrained(bert_model)
        elif 'openai-gpt' in bert_model:
            self.bert = OpenAIGPTModel.from_pretrained(bert_model)
        elif 'gpt2' in bert_model:
            self.bert = GPT2Model.from_pretrained(bert_model)
        elif 'transfo-xl' in bert_model:
            self.bert = TransfoXLModel.from_pretrained(bert_model)
        self.finetune_bert = finetune_bert
        if not finetune_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        if bert_model in ['bert-base-uncased', 'openai-gpt', 'gpt2']:
            bert_in = 768
        elif bert_model in ['bert-large-uncased', 'gpt2-medium', 'transfo-xl-wt103']:
            bert_in = 1024
        bert_layer = [nn.Linear(bert_in, out_size), nn.ReLU()]
        self.bert_layer = nn.Sequential(*bert_layer)

    self.n_layers = n_layers
    self.dropout = dropout
    self.independent = independent
    self.states_att = states_att
    if states_att:
        self.ecW = nn.Linear(state_size, att_size)
        self.ysW = nn.Linear(hidden_size, att_size)
        hidden_size += state_size

    if classifier == 'baseline':
        layers = [
            nn.Linear(hidden_size, proj_size),
            nn.Linear(proj_size, out_size)
        ]
        self.y_classifier = nn.Sequential(*layers)
    elif classifier == 'weighted_norm':
        layers = [
            weight_norm(nn.Linear(hidden_size, proj_size), dim=None),
            nn.ReLU(),
            weight_norm(nn.Linear(proj_size, out_size), dim=None)
        ]
        self.y_classifier = nn.Sequential(*layers)
    elif classifier == 'logit':
        layers = [
            weight_norm(nn.Linear(hidden_size, proj_size), dim=None),
            nn.ReLU(),
            nn.Linear(proj_size, out_size)
        ]
        self.classifier_txt = nn.Sequential(*layers)
        layers = [
            weight_norm(nn.Linear(hidden_size, 2048), dim=None),
            nn.ReLU(),
            nn.Linear(2048, out_size)
        ]
        self.classifier_ft = nn.Sequential(*layers)
        if weights_init is not None:
            self.classifier_txt[0].weight.data.copy_(
                torch.from_numpy(weights_init['classifier_txt']))
            self.classifier_ft[0].weight.data.copy_(
                torch.from_numpy(weights_init['classifier_ft']))
def Get_Transformer_Representation(self, examples_train, examples_test):
    train_rep_file = "./data/" + pb.dataset + "_train_" + "transformerXL"
    test_rep_file = "./data/" + pb.dataset + "_test_" + "transformerXL"

    if os.path.exists(train_rep_file) and os.path.exists(test_rep_file):
        with open(train_rep_file, 'rb') as file:
            examples_train_rep = pickle.load(file)
            for i, example in enumerate(examples_train):
                example.transformerXL_mat = examples_train_rep[i]
        with open(test_rep_file, 'rb') as file:
            examples_test_rep = pickle.load(file)
            for i, example in enumerate(examples_test):
                example.transformerXL_mat = examples_test_rep[i]
    else:
        examples = []
        for example in examples_train:
            examples.append(example)
        for example in examples_test:
            examples.append(example)
        for i, example in enumerate(examples):
            # example.transformerXL_mat = np.zeros((pb.fgt_maxlength, 20))
            # continue
            if self.transformer_tokenizer is None:
                self.transformer_tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
            text = example.fgt_channels[0]
            tokenized_text = self.transformer_tokenizer.tokenize(text)
            indexed_tokens = self.transformer_tokenizer.convert_tokens_to_ids(tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])

            if self.transformer is None:
                self.transformer = TransfoXLModel.from_pretrained('transfo-xl-wt103')
                self.transformer.eval()
            with torch.no_grad():
                hidden_states, _ = self.transformer(tokens_tensor)  # shape (1, seq_len, 1024)

            shape = np.array(hidden_states).shape
            # print(shape)
            representation, sum = [], 0
            a, b = shape[1], shape[2]
            representation = np.zeros((a, b))
            for layer in hidden_states:
                for words in layer.numpy():
                    representation += words
                    sum += 1
            if sum > 0:
                representation = representation * 1.0 / sum
            representation = list(representation)
            while len(representation) < pb.fgt_maxlength:
                representation.append(np.zeros(b))
            example.transformerXL_mat = representation[0:pb.fgt_maxlength]
            print("{:.2%}".format(i * 1.0 / len(examples)))
def __init__(self, n_wlayers, n_slayers, in_size, out_size, embed_size,
             hidden_size, dropout=0.5, ignore_label=None, initialEmbW=None,
             independent=False, rnn_type='lstm', embedding_init=None,
             weights_init=None, elmo_init=False, elmo_num_outputs=1,
             finetune_elmo=False, bert_init=False, bert_model=None,
             finetune_bert=False, add_word_emb=True, pretrained_all=True,
             concat_his=False):
    """Initialize encoder with structure parameters

    Args:
        n_layers (int): Number of layers.
        in_size (int): Dimensionality of input vectors.
        out_size (int): Dimensionality of hidden vectors to be output.
        embed_size (int): Dimensionality of word embedding.
        dropout (float): Dropout ratio.
    """
    super(HLSTMEncoder, self).__init__()
    self.embed = nn.Embedding(in_size, embed_size)
    if embedding_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(embedding_init))
    elif weights_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(weights_init['embed']))

    if rnn_type == 'lstm':
        self.wlstm = nn.LSTM(embed_size, hidden_size, n_wlayers,
                             batch_first=True, dropout=dropout)
        self.slstm = nn.LSTM(hidden_size, out_size, n_slayers,
                             batch_first=True, dropout=dropout)
    elif rnn_type == 'gru':
        self.wlstm = nn.GRU(embed_size, hidden_size, n_wlayers,
                            batch_first=True, dropout=dropout)
        self.slstm = nn.GRU(hidden_size, out_size, n_slayers,
                            batch_first=True, dropout=dropout)

    self.elmo_init = elmo_init
    self.bert_init = bert_init
    self.pretrained_all = pretrained_all
    self.concat_his = concat_his
    self.bert_model = bert_model
    self.add_word_emb = add_word_emb
    if pretrained_all and elmo_init:
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        self.elmo = Elmo(options_file, weight_file, elmo_num_outputs,
                         requires_grad=finetune_elmo)
        elmo_layer = [nn.Linear(elmo_num_outputs * 1024, out_size), nn.ReLU()]
        self.elmo_layer = nn.Sequential(*elmo_layer)
    elif pretrained_all and bert_init:
        if 'bert' in bert_model:
            self.bert = BertModel.from_pretrained(bert_model)
        elif 'openai-gpt' in bert_model:
            self.bert = OpenAIGPTModel.from_pretrained(bert_model)
        elif 'gpt2' in bert_model:
            self.bert = GPT2Model.from_pretrained(bert_model)
        elif 'transfo-xl' in bert_model:
            self.bert = TransfoXLModel.from_pretrained(bert_model)
        self.finetune_bert = finetune_bert
        if not finetune_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        if bert_model in ['bert-base-uncased', 'openai-gpt', 'gpt2']:
            bert_in = 768
        elif bert_model in ['bert-large-uncased', 'gpt2-medium', 'transfo-xl-wt103']:
            bert_in = 1024
        bert_layer = [nn.Linear(bert_in, out_size), nn.ReLU()]
        self.bert_layer = nn.Sequential(*bert_layer)

    self.independent = independent
    self.rnn_type = rnn_type