def __init__(
    self,
    config,
    output_attentions=False,
    keep_multihead_output=False,
    n_layers=2,
    activation='relu',
    beta=100,
):
    super(BertForCoQA, self).__init__(config)
    self.output_attentions = output_attentions
    self.bert = BertModel(config)
    hidden_size = config.hidden_size
    self.rational_l = Multi_linear_layer(n_layers, hidden_size, hidden_size, 1, activation)
    self.logits_l = Multi_linear_layer(n_layers, hidden_size, hidden_size, 2, activation)
    self.unk_l = Multi_linear_layer(n_layers, hidden_size, hidden_size, 1, activation)
    self.attention_l = Multi_linear_layer(n_layers, hidden_size, hidden_size, 1, activation)
    self.yn_l = Multi_linear_layer(n_layers, hidden_size, hidden_size, 2, activation)
    self.beta = beta
    self.init_weights()
def __init__(self, config):
    super(BertSentClassifier, self).__init__()
    self.num_labels = config.num_labels
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
    self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
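# A minimal forward-pass sketch for the classifier head defined above. It is an
# illustrative assumption rather than part of the original snippet: it presumes
# this BertModel returns a dict containing 'pooler_output' (as the sanity-check
# script later in this section shows) and that the head produces per-class logits.
def forward(self, input_ids, attention_mask):
    outputs = self.bert(input_ids, attention_mask)
    pooled = outputs['pooler_output']   # [CLS]-based sentence representation
    pooled = self.dropout(pooled)       # regularize before the linear head
    return self.classifier(pooled)      # (batch_size, num_labels) logits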
def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert_base/')
    if args.bert_freeze:
        for param in self.bert.parameters():
            param.requires_grad = False
    self.dropout = nn.Dropout(args.bert_dropout)
    self.linear = nn.Linear(args.bert_hidden_size, len(labels), bias=True)
def __init__(self, config, pretrained_weights):
    super(PretrainedBert, self).__init__()
    self.num_labels = config.num_labels
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
    self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)
    self.classifier.weight = torch.nn.Parameter(pretrained_weights['weights'])
    self.classifier.bias = torch.nn.Parameter(pretrained_weights['bias'])
def train(**kwargs):
    train_dataset = ClassifierDataset(kwargs["--train_path"])
    valid_dataset = ClassifierDataset(kwargs["--valid_path"])
    print("Dataset loaded successfully")
    train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
    valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
    model = BertModel()
    optimizer = optim.Adam(model.parameters(), lr)
    BertModel.trainer(model, optimizer, EPOCH)
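# Hypothetical invocation of the training entry point above; the file paths are
# placeholders, not taken from the original snippet.
if __name__ == '__main__':
    train(**{"--train_path": "data/train.tsv", "--valid_path": "data/valid.tsv"})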
def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert_large/')
    if args.bert_freeze:
        for param in self.bert.parameters():
            param.requires_grad = False
    self.context_dropout = nn.Dropout(args.context_dropout)
    self.mention_dropout = nn.Dropout(args.mention_dropout)
    self.layer_norm = nn.LayerNorm(args.bert_hidden_size)
    self.multi_head_atten = MultiHeadAttention(args.bert_hidden_size, num_heads=8, dropout=0.1)
    self.mention_char_atten = MultiHeadAttention(args.bert_hidden_size, num_heads=8, dropout=0.1)
    self.context_lstm = BiLSTM(input_size=args.bert_hidden_size,
                               hidden_size=args.rnn_hidden_size,
                               num_layers=args.rnn_num_layers,
                               dropout=args.rnn_dropout,
                               num_dirs=args.rnn_num_dirs)
    self.mention_lstm = BiLSTM(input_size=args.bert_hidden_size,
                               hidden_size=args.rnn_hidden_size,
                               num_layers=args.rnn_num_layers,
                               dropout=args.rnn_dropout,
                               num_dirs=args.rnn_num_dirs)
    self.context_attn_sum = SelfAttentiveSum(args.bert_hidden_size, 100)
    self.mention_attn_sum = SelfAttentiveSum(args.bert_hidden_size, 1)
    self.char_cnn = CharCNN(embedding_num=len(char_vocab),
                            embedding_dim=args.cnn_embedding_dim,
                            filters=eval(args.cnn_filters),
                            output_dim=args.cnn_output_dim)
    self.linear = nn.Linear(in_features=2 * args.bert_hidden_size + args.cnn_output_dim,
                            out_features=len(labels), bias=True)
    if args.interaction:
        self.mention_linear = nn.Linear(in_features=args.bert_hidden_size + args.cnn_output_dim,
                                        out_features=args.bert_hidden_size, bias=True)
        self.affinity_matrix = nn.Linear(args.bert_hidden_size, args.bert_hidden_size)
        self.fusion = Fusion(args.bert_hidden_size)
        self.normalize = Normalize()
        self.fusion_linear = nn.Linear(in_features=2 * args.bert_hidden_size,
                                       out_features=len(labels), bias=True)
def create_model(args,
                 pyreader_name,
                 bert_config,
                 num_labels,
                 paradigm_inst,
                 is_prediction=False):
    """create dialogue task model"""
    if args.task_name == 'atis_slot':
        label_dim = [-1, args.max_seq_len]
        lod_level = 1
    elif args.task_name in ['dstc2', 'dstc2_asr', 'multi-woz']:
        label_dim = [-1, num_labels]
        lod_level = 0
    else:
        label_dim = [-1, 1]
        lod_level = 0
    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], label_dim],
        dtypes=['int64', 'int64', 'int64', 'float32', 'int64'],
        lod_levels=[0, 0, 0, 0, lod_level],
        name=pyreader_name,
        use_double_buffer=True)
    (src_ids, pos_ids, sent_ids, input_mask, labels) = fluid.layers.read_file(pyreader)
    bert = BertModel(src_ids=src_ids,
                     position_ids=pos_ids,
                     sentence_ids=sent_ids,
                     input_mask=input_mask,
                     config=bert_config,
                     use_fp16=args.use_fp16)
    params = {
        'num_labels': num_labels,
        'src_ids': src_ids,
        'pos_ids': pos_ids,
        'sent_ids': sent_ids,
        'input_mask': input_mask,
        'labels': labels,
        'is_prediction': is_prediction
    }
    results = paradigm_inst.paradigm(bert, params)
    results['pyreader'] = pyreader
    return results
def load_model(model_path, num_hidden_layers=None):
    ckpt_reader = tf.train.load_checkpoint(
        os.path.join(model_path, 'bert_model.ckpt'))
    config = json.load(open(os.path.join(model_path, 'bert_config.json')))
    loaded_params = {k: config[k] for k in params.keys()}
    if num_hidden_layers is not None and num_hidden_layers > 0:
        loaded_params['num_hidden_layers'] = num_hidden_layers
    tfbert = BertModel(**loaded_params)
    tfbert([tf.constant([[1]]), tf.constant([[1]]), tf.constant([[1]])])
    tfbert_weights = {w.name: w for w in tfbert.weights}
    official_weights = set(ckpt_reader.get_variable_to_dtype_map().keys())
    skip_tensor = [
        'cls/predictions/transform/dense/kernel',
        'cls/seq_relationship/output_weights',
        'cls/predictions/transform/LayerNorm/beta',
        'cls/predictions/output_bias',
        'cls/predictions/transform/LayerNorm/gamma',
        'cls/seq_relationship/output_bias',
        'cls/predictions/transform/dense/bias',
    ]
    good = True
    for x in official_weights - set([x.split(':')[0] for x in tfbert_weights.keys()]):
        if 'adam' not in x and 'global_step' not in x:
            if x not in skip_tensor:
                print('diff offi', x)
                good = False
    for x in set([x.split(':')[0] for x in tfbert_weights.keys()]) - official_weights:
        if 'adam' not in x and 'global_step' not in x:
            print('diff ours', x)
            good = False
    assert good
    weight_tuples = []
    for k, v in tfbert_weights.items():
        name = k[:-2]
        if ckpt_reader.has_tensor(name):
            ckpt_value = ckpt_reader.get_tensor(name)
            weight_tuples.append((v, ckpt_value))
            assert v.shape == ckpt_value.shape, \
                f'{name} shape invalid {v.shape}, {ckpt_value.shape}'
        else:
            print(f'{name} weight not loaded')
    tf.keras.backend.batch_set_value(weight_tuples)
    return tfbert
def __init__(self, config, head_dropout=None):
    super(BertForQuestRegression, self).__init__(config)
    self.config = config
    self.num_labels = config.num_labels
    if head_dropout is None:
        head_dropout = config.hidden_dropout_prob
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(head_dropout)
    self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
    self.init_weights()
def __init__(self, config, num_tag, use_cuda):
    super(KBQA, self).__init__(config)
    # BERT
    self.bert = BertModel(config)
    # NER
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, num_tag)
    self.crf = CRF(num_tag, use_cuda)
    # relationship
    self.re_layer = nn.Linear(config.hidden_size, 1)  # yes/no
    self.apply(self.init_bert_weights)
def __init__(self, num_labels, bret_pretrainded_path):
    """
    When defining a task model, two attributes must be present: self.train_state and self.device.
    If either is missing, putting the task model into Training will raise an error.
    The model also needs two parts: the network structure itself and the loss function.
    :param num_labels:
    :param bret_pretrainded_path:
    """
    # initialization
    super().__init__()
    # build the network structure
    self.bert = BertModel.from_pretrained(bret_pretrainded_path)
    self.fc = nn.Linear(768, num_labels)
def __init__(self, config):
    super(BertSentClassifier, self).__init__()
    self.num_labels = config.num_labels
    self.bert = BertModel.from_pretrained('bert-base-uncased')

    # pretrain mode does not require updating bert parameters.
    for param in self.bert.parameters():
        if config.option == 'pretrain':
            param.requires_grad = False
        elif config.option == 'finetune':
            param.requires_grad = True

    # todo
    raise NotImplementedError
def __init__(self, config):
    config.output_hidden_states = True
    super(CustomBert, self).__init__(config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(p=0.2)
    self.high_dropout = nn.Dropout(p=0.5)
    n_weights = config.num_hidden_layers + 1
    weights_init = torch.zeros(n_weights).float()
    weights_init.data[:-1] = -3
    self.layer_weights = torch.nn.Parameter(weights_init)
    self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
    self.init_weights()
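# A hedged sketch of how the modules defined above could be combined in a forward
# pass; this is an illustrative assumption, not the original author's code. It
# presumes the tuple-returning transformers API where outputs[2] holds the per-layer
# hidden states when output_hidden_states=True. The learned layer_weights are
# softmax-normalized and used to average the [CLS] states of all layers (initializing
# all but the last logit to -3 biases the mixture toward the top layer).
def forward(self, input_ids, attention_mask):
    outputs = self.bert(input_ids, attention_mask=attention_mask)
    hidden_states = outputs[2]                                   # (num_hidden_layers + 1) tensors
    cls_states = torch.stack([h[:, 0] for h in hidden_states], dim=0)
    weights = torch.softmax(self.layer_weights, dim=0)
    mixed = (weights.unsqueeze(-1).unsqueeze(-1) * cls_states).sum(dim=0)
    return self.classifier(self.high_dropout(mixed))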
def __init__(self, config, phrase_size, metric, use_sparse):
    encoder = BertWrapper(BertModel(config))
    sparse_layer = None
    if use_sparse:
        sparse_layer = SparseAttention(config, num_sparse_heads=1)
    super(BertPhraseModel, self).__init__(encoder, sparse_layer, phrase_size, metric)

    def init_weights(module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=config.initializer_range)
        elif isinstance(module, BERTLayerNorm):
            module.beta.data.normal_(mean=0.0, std=config.initializer_range)
            module.gamma.data.normal_(mean=0.0, std=config.initializer_range)
        if isinstance(module, nn.Linear):
            module.bias.data.zero_()

    self.apply(init_weights)
import torch.optim as optim
import torchtext
import sys

from dataloader import get_chABSA_DataLoaders_and_TEXT
from bert import BertTokenizer
from bert import get_config, BertModel, BertForchABSA, set_learned_params

train_dl, val_dl, TEXT, dataloaders_dict = get_chABSA_DataLoaders_and_TEXT(
    max_length=256, batch_size=32)

# Load the model-configuration JSON file as an object
config = get_config(file_path="./weights/bert_config.json")

# Create the BERT model
net_bert = BertModel(config)

# Set the pretrained parameters on the BERT model
net_bert = set_learned_params(net_bert, weights_path="./weights/pytorch_model.bin")

# Build the full model
net = BertForchABSA(net_bert)

# Switch to training mode
net.train()

print('Network setup complete')

# Compute gradients only for the last BertLayer module and the added
# classification adapter (a sketch of this step follows below)
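# A hedged sketch of that gradient-freezing step. The attribute names
# (net.bert.encoder.layer and net.cls) are assumptions about this codebase,
# not taken from the snippet above.
for param in net.parameters():
    param.requires_grad = False                        # freeze everything first
for param in net.bert.encoder.layer[-1].parameters():
    param.requires_grad = True                         # unfreeze the last BertLayer
for param in net.cls.parameters():
    param.requires_grad = True                         # unfreeze the classification adapter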
class BertMapping(nn.Module):
    """ """

    def __init__(self, opt):
        super(BertMapping, self).__init__()
        bert_config = BertConfig.from_json_file(opt.bert_config_file)
        self.bert = BertModel(bert_config)
        self.bert.load_state_dict(
            torch.load(opt.init_checkpoint, map_location='cpu'))
        freeze_layers(self.bert)
        self.txt_stru = opt.txt_stru
        if opt.txt_stru == 'pooling':
            self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
            self.mapping = nn.Linear(bert_config.hidden_size, opt.final_dims)
        elif opt.txt_stru == 'cnn':
            Ks = [1, 2, 3]
            in_channel = 1
            out_channel = 512
            embedding_dim = bert_config.hidden_size
            self.convs1 = nn.ModuleList([
                nn.Conv2d(in_channel, out_channel, (K, embedding_dim))
                for K in Ks
            ])
            self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
            self.mapping = nn.Linear(len(Ks) * out_channel, opt.final_dims)
        elif opt.txt_stru == 'rnn':
            embedding_dim = bert_config.hidden_size
            self.bi_gru = opt.bi_gru
            self.rnn = nn.GRU(embedding_dim, opt.embed_size, opt.num_layers,
                              batch_first=True, bidirectional=opt.bi_gru)
            self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
            self.mapping = nn.Linear(opt.embed_size, opt.final_dims)
        elif opt.txt_stru == 'trans':
            bert_config = BertConfig.from_json_file(opt.img_trans_cfg)
            self.layer = bert.BERTLayer(bert_config)
            self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
            self.mapping = nn.Linear(768, opt.final_dims)

    def forward(self, input_ids, attention_mask, token_type_ids, lengths):
        all_encoder_layers, pooled_output = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)
        if self.txt_stru == 'pooling':
            output = self.mapping(all_encoder_layers[-1])
            output = torch.mean(output, 1)
            code = output
        elif self.txt_stru == 'cnn':
            x = all_encoder_layers[-1].unsqueeze(1)  # (batch_size, 1, token_num, embedding_dim)
            x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(batch_size, out_channel, W), ...] * len(Ks)
            x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...] * len(Ks)
            output = torch.cat(x, 1)
        elif self.txt_stru == 'rnn':
            x = all_encoder_layers[-1]  # (batch_size, token_num, embedding_dim)
            packed = pack_padded_sequence(x, lengths, batch_first=True)
            # Forward propagate RNN
            out, _ = self.rnn(packed)
            # Reshape *final* output to (batch_size, hidden_size)
            padded = pad_packed_sequence(out, batch_first=True)
            cap_emb, cap_len = padded
            if self.bi_gru:
                # average the forward and backward halves of the bidirectional output
                # (integer division keeps the slice indices as ints)
                cap_emb = (cap_emb[:, :, :cap_emb.size(2) // 2] +
                           cap_emb[:, :, cap_emb.size(2) // 2:]) / 2
            else:
                cap_emb = cap_emb
            output = torch.mean(cap_emb, 1)
        elif self.txt_stru == 'trans':
            hidden_states = self.mapping(all_encoder_layers[-1])
            extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
            extended_attention_mask = extended_attention_mask.float()
            extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
            hidden_states = self.layer(hidden_states, extended_attention_mask)
            # output = hidden_states[:, 0, :]
            output = torch.mean(hidden_states, 1)
        output = self.dropout(output)
        code = self.mapping(output)
        # code = F.tanh(code)
        code = F.normalize(code, p=2, dim=1)
        return code
import torch

from bert import BertModel

sanity_data = torch.load("./sanity_check.data")
# text_batch = ["hello world", "hello neural network for NLP"]
# tokenizer here
sent_ids = torch.tensor([[101, 7592, 2088, 102, 0, 0, 0, 0],
                         [101, 7592, 15756, 2897, 2005, 17953, 2361, 102]])
att_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0],
                         [1, 1, 1, 1, 1, 1, 1, 1]])

# load our model
bert = BertModel.from_pretrained('bert-base-uncased')
outputs = bert(sent_ids, att_mask)
for k in ['last_hidden_state', 'pooler_output']:
    assert torch.allclose(outputs[k], sanity_data[k], atol=1e-4, rtol=0)
def load_model(model_path, num_hidden_layers=None):
    ckpt_reader = tf.train.load_checkpoint(
        os.path.join(model_path, 'bert_model.ckpt'))
    config = json.load(open(os.path.join(model_path, 'bert_config.json')))
    loaded_params = {k: config[k] for k in params.keys() if k in config}
    if 'embedding_size' in config:
        loaded_params['embedding_size'] = config['embedding_size']
    if num_hidden_layers is not None and num_hidden_layers > 0:
        loaded_params['num_hidden_layers'] = num_hidden_layers
    tfbert = BertModel(**loaded_params)
    tfbert([tf.constant([[1]]), tf.constant([[1]]), tf.constant([[1]])])

    def convert_official_name(x):
        x = x.replace('electra/encoder', 'bert/encoder')
        x = x.replace('discriminator_predictions/dense/kernel', 'bert/pooler/dense/kernel')
        x = x.replace('discriminator_predictions/dense/bias', 'bert/pooler/dense/bias')
        x = x.replace('electra/embeddings_project/kernel',
                      'bert/encoder/embedding_hidden_mapping_in/kernel')
        x = x.replace('electra/embeddings_project/bias',
                      'bert/encoder/embedding_hidden_mapping_in/bias')
        x = x.replace('electra/embeddings', 'bert/embeddings')
        return x

    skip_tensor = [
        'discriminator_predictions/dense_1/kernel',
        'discriminator_predictions/dense_1/bias',
        'cls/seq_relationship/output_bias',
        'cls/predictions/output_bias',
        'cls/predictions/transform/dense/kernel',
        'cls/predictions/transform/LayerNorm/beta',
        'cls/predictions/transform/LayerNorm/gamma',
        'cls/predictions/transform/dense/bias',
        'cls/seq_relationship/output_weights',
    ]
    tfbert_weights = {
        w.name: w for w in tfbert.weights if 'generator' not in w.name
    }
    official_weights = {
        convert_official_name(k): ckpt_reader.get_tensor(k)
        for k in ckpt_reader.get_variable_to_dtype_map().keys()
    }
    good = True
    our_keys = set([x.split(':')[0] for x in tfbert_weights.keys()])
    for x in set(official_weights.keys()) - our_keys:
        if 'adam' not in x and 'global_step' not in x and 'generator' not in x:
            if x not in skip_tensor:
                print('diff offi', x, official_weights[x].shape)
                good = False
    for x in our_keys - set(official_weights.keys()):
        if 'adam' not in x and 'global_step' not in x and 'generator' not in x:
            if x not in skip_tensor:
                print('diff ours', x, tfbert_weights[x + ':0'].shape)
                good = False
    assert good
    weight_tuples = []
    for k, v in tfbert_weights.items():
        name = k[:-2]
        if name in skip_tensor:
            continue
        off_tensor = None
        for ok in ckpt_reader.get_variable_to_dtype_map().keys():
            if convert_official_name(ok) == name:
                off_tensor = ckpt_reader.get_tensor(ok)
        if off_tensor is not None:
            weight_tuples.append((v, off_tensor))
            assert v.shape == off_tensor.shape, \
                f'{name} shape invalid {v.shape}, {off_tensor.shape}'
        else:
            print(f'{name} weight not loaded')
    tf.keras.backend.batch_set_value(weight_tuples)
    return tfbert
def __init__(self, config):
    super(KBQA, self).__init__(config)
    self.bert = BertModel(config)
    self.ner_layer = nn.Linear(config.hidden_size, 2)  # head, tail
    self.re_layer = nn.Linear(config.hidden_size, 1)  # yes/no
    self.apply(self.init_bert_weights)
"""Test load official model, print difference.""" import tensorflow as tf import tensorflow_hub as hub from bert import BertModel, params tfbert = BertModel(**params) model_path = '../bert-embs/hub/chinese_L-12_H-768_A-12/' bert_layer = hub.KerasLayer(model_path, signature="tokens", signature_outputs_as_dict=True, trainable=False) assert len(tfbert.weights) == len(bert_layer.weights) weight_of_tfbert = set([x.name for x in tfbert.weights]) weight_of_official = set([x.name for x in bert_layer.weights]) fit_weight = len(weight_of_tfbert & weight_of_tfbert) assert fit_weight == len(bert_layer.weights) def get_name_values(model): names = [x.name for x in model.weights] values = tf.keras.backend.batch_get_value(model.weights) return dict(zip(names, values)) official_weights = get_name_values(bert_layer)