def create_model_for_multi_task(args, vocab_size, is_prediction=False):
    """Build the multi-task classification graph (main task + "reverse" task).

    A BERT encoder feeds one of three heads selected by ``args['mrc_layer']``
    (``"cls_fc"``, ``"capsNet"`` or ``"lstm"``).  A second, two-way fully
    connected classifier on the encoder's ``reverse_feats`` provides an
    auxiliary loss during training.

    :param args: dict-like configuration.  Keys read here: ``vocab_size``,
        ``max_seq_length``, ``mrc_layer``, ``freeze_pretrained_model``,
        ``num_labels``, ``use_fp16`` and, only when ``use_fp16`` is truthy,
        ``loss_scaling``.  ``args['vocab_size']`` is overwritten with the
        effective vocabulary size before ``args`` is passed to ``BertModel``.
    :param vocab_size: vocabulary size used when ``args['vocab_size']`` is
        not positive.
    :param is_prediction: build the inference graph; dropout is switched to
        test mode and no loss is computed.
    :return: ``(reader, probs, qas_ids)`` in prediction mode, otherwise
        ``(reader, loss, probs, accuracy, accuracy_for_reverse, qas_ids)``
        where ``loss`` is the sum of the main and the reverse-task loss.
    :raises ValueError: if ``args['mrc_layer']`` is not a supported head.
    """
    # A vocabulary size configured in ``args`` takes precedence.
    if args['vocab_size'] > 0:
        vocab_size = args['vocab_size']

    # Input placeholders.
    max_seq_length = args['max_seq_length']
    qas_ids = fluid.data(name='qas_ids', dtype='int64', shape=[-1, 1])
    src_ids = fluid.data(name='src_ids', dtype='int64',
                         shape=[-1, max_seq_length, 1])
    pos_ids = fluid.data(name='pos_ids', dtype='int64',
                         shape=[-1, max_seq_length, 1])
    sent_ids = fluid.data(name='sent_ids', dtype='int64',
                          shape=[-1, max_seq_length, 1])
    input_mask = fluid.data(name='input_mask', dtype='float32',
                            shape=[-1, max_seq_length, 1])
    labels = fluid.data(name='labels', dtype='int64', shape=[-1, 1])
    labels_for_reverse = fluid.data(name='labels_for_reverse', dtype='int64',
                                    shape=[-1, 1])

    # Prediction batches carry no labels, so they are left out of the feed.
    if is_prediction:
        feed_list = [qas_ids, src_ids, pos_ids, sent_ids, input_mask]
    else:
        feed_list = [
            qas_ids, src_ids, pos_ids, sent_ids, input_mask, labels,
            labels_for_reverse
        ]
    reader = fluid.io.DataLoader.from_generator(feed_list=feed_list,
                                                capacity=64,
                                                iterable=True)

    # Encoder: BERT followed by a task-specific head.
    config = args
    config['vocab_size'] = vocab_size
    bert = BertModel(src_ids=src_ids,
                     position_ids=pos_ids,
                     sentence_ids=sent_ids,
                     input_mask=input_mask,
                     config=config,
                     use_fp16=False,
                     is_prediction=is_prediction)

    mrc_layer = config['mrc_layer']
    freeze_pretrained_model = config['freeze_pretrained_model']

    cls_feats, reverse_feats = bert.get_pooled_outputs()
    bert_encode = bert.get_sequence_output()
    if freeze_pretrained_model:
        cls_feats.stop_gradient = True
        bert_encode.stop_gradient = True

    def _dropout(x):
        # Every head dropout honours prediction mode; the LSTM head used to
        # omit ``is_test`` and therefore kept dropping units at inference.
        return fluid.layers.dropout(
            x=x,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train",
            is_test=is_prediction)

    def _reverse_head():
        # Two-way classifier of the auxiliary "reverse" task.
        return fluid.layers.fc(
            input=reverse_feats,
            size=2,
            param_attr=fluid.ParamAttr(
                name="reverse_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="reverse_out_b",
                initializer=fluid.initializer.Constant(0.)))

    logits_for_reverse = None
    if mrc_layer == "cls_fc":
        # Classify from the pooled [CLS] feature through one FC layer.
        cls_feats = _dropout(cls_feats)
        logits = fluid.layers.fc(
            input=cls_feats,
            size=args['num_labels'],
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b",
                initializer=fluid.initializer.Constant(0.)))
        logits_for_reverse = _reverse_head()
    elif mrc_layer == "capsNet":
        # Feed the full BERT sequence output into a capsule network.
        conv_attr = fluid.ParamAttr(
            name='conv2d.weight',
            initializer=fluid.initializer.Xavier(uniform=False),
            learning_rate=0.001)
        bert_output = fluid.layers.unsqueeze(input=bert_encode, axes=[1])
        capsules = fluid.layers.conv2d(input=bert_output,
                                       num_filters=256,
                                       filter_size=32,
                                       stride=15,
                                       padding="VALID",
                                       act="relu",
                                       param_attr=conv_attr)
        # Shape notes below are the original author's: (batch_size, 256, 33, 50)
        primary_caps = CapsLayer(num_outputs=32, vec_len=8,
                                 with_routing=False, layer_type='CONV')
        caps1 = primary_caps(capsules, kernel_size=9, stride=2)
        # (batch_size, 8736, 8, 1)
        classifier_caps = CapsLayer(num_outputs=args['num_labels'],
                                    vec_len=16,
                                    with_routing=True,
                                    layer_type='FC')
        caps2 = classifier_caps(caps1)
        # (batch_size, 3, 16, 1)
        epsilon = 1e-9
        # The length of each class capsule vector is used as its logit.
        v_length = fluid.layers.sqrt(
            fluid.layers.reduce_sum(
                fluid.layers.square(caps2), -2, keep_dim=True) + epsilon)
        logits = fluid.layers.squeeze(v_length, axes=[2, 3])
    elif mrc_layer == "lstm":
        hidden_size = 128
        cell = fluid.layers.LSTMCell(hidden_size=hidden_size)
        cell_r = fluid.layers.LSTMCell(hidden_size=hidden_size)
        # Drop the first position of the sequence before the recurrent layers.
        encoded = _dropout(bert_encode[:, 1:, :])
        outputs = fluid.layers.rnn(cell, encoded)[0][:, -1, :]
        outputs_r = fluid.layers.rnn(cell_r, encoded,
                                     is_reverse=True)[0][:, -1, :]
        cls_feats = fluid.layers.concat(input=[outputs, outputs_r], axis=1)
        cls_feats = _dropout(cls_feats)
        logits = fluid.layers.fc(
            input=cls_feats,
            size=args['num_labels'],
            param_attr=fluid.ParamAttr(
                name="lstm_fc_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="lstm_fc_b",
                initializer=fluid.initializer.Constant(0.)))
    else:
        # Previously ``logits`` stayed None and failed later inside fluid.
        raise ValueError("Unsupported mrc_layer: %r" % (mrc_layer,))

    # Prediction only needs the reader and the per-label probabilities.
    if is_prediction:
        probs = fluid.layers.softmax(logits)
        return reader, probs, qas_ids

    # Training: the reverse head was only built by the "cls_fc" branch, so the
    # other heads hit an undefined ``logits_for_reverse`` here.  Build it now.
    if logits_for_reverse is None:
        logits_for_reverse = _reverse_head()

    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    ce_loss_for_reverse, probs_for_reverse = \
        fluid.layers.softmax_with_cross_entropy(
            logits=logits_for_reverse,
            label=labels_for_reverse,
            return_softmax=True)
    loss_for_reverse = fluid.layers.mean(x=ce_loss_for_reverse)

    # ``args`` is indexed like a dict everywhere else in this function, so
    # ``loss_scaling`` is read the same way.
    # NOTE(review): only the main-task loss is scaled, as before - confirm.
    if args['use_fp16'] and args['loss_scaling'] > 1.0:
        loss *= args['loss_scaling']

    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=labels,
                                     total=num_seqs)
    accuracy_for_reverse = fluid.layers.accuracy(input=probs_for_reverse,
                                                 label=labels_for_reverse,
                                                 total=num_seqs)

    # Reader, combined loss, predictions and both accuracies.
    return (reader, loss + loss_for_reverse, probs, accuracy,
            accuracy_for_reverse, qas_ids)
def __init__(self, args=None, detect_entities=False):
    """Set up the tokenizer, relation map and classifier used for inference.

    :param args: configuration object exposing ``model_no``, ``model_size``
        and ``num_classes``.  When ``None`` it is loaded with
        ``load_pickle("args.pkl")``.
    :param detect_entities: when true, the spaCy pipeline
        ``en_core_web_lg`` is loaded into ``self.nlp``; otherwise
        ``self.nlp`` is ``None``.
    :raises ValueError: if ``model_no`` is not 0, 1 or 2.
    """
    if args is None:
        # NOTE(review): presumably unpickles a local file - only point this
        # at trusted data.
        args = load_pickle("args.pkl")
    # From here on ``args`` and ``self.args`` are the same object.  The
    # original kept reading the raw parameter, which is None on this path and
    # crashed on ``args.model_size``.
    self.args = args
    self.cuda = torch.cuda.is_available()
    self.detect_entities = detect_entities

    self.nlp = spacy.load("en_core_web_lg") if self.detect_entities else None
    self.entities_of_interest = [
        "PERSON",
        "NORP",
        "FAC",
        "ORG",
        "GPE",
        "LOC",
        "PRODUCT",
        "EVENT",
        "WORK_OF_ART",
        "LAW",
        "LANGUAGE",
        "PER",
    ]

    logger.info("Loading tokenizer and model...")
    from .train_funcs import load_state

    if args.model_no == 0:
        from model.bert import BertModel as Model

        model_name = "BERT"
        self.net = Model.from_pretrained(
            args.model_size,  # e.g. 'bert-base-uncased'
            force_download=False,
            model_size=args.model_size,
            task="classification",
            n_classes_=args.num_classes,
        )
    elif args.model_no == 1:
        from model.albert.albert import AlbertModel as Model

        # NOTE(review): ALBERT reuses the "BERT" tokenizer pickle name, as in
        # the original code - confirm this is intended.
        model_name = "BERT"
        self.net = Model.from_pretrained(
            args.model_size,  # e.g. 'albert-base-v2'
            force_download=False,
            model_size=args.model_size,
            task="classification",
            n_classes_=args.num_classes,
        )
    elif args.model_no == 2:  # BioBert
        from model.bert import BertModel, BertConfig

        model_name = "BioBERT"
        config = BertConfig.from_pretrained(
            "./additional_models/biobert_v1.1_pubmed/bert_config.json"
        )
        self.net = BertModel.from_pretrained(
            pretrained_model_name_or_path="./additional_models/biobert_v1.1_pubmed/biobert_v1.1_pubmed.bin",
            config=config,
            force_download=False,
            model_size="bert-base-uncased",
            task="classification",
            n_classes_=args.num_classes,
        )
    else:
        # Previously an unknown id fell through to an unbound ``model_name``.
        raise ValueError(
            "Unsupported model_no: %r (expected 0, 1 or 2)" % (args.model_no,)
        )

    self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
    # Match the embedding table to the tokenizer's vocabulary size.
    self.net.resize_token_embeddings(len(self.tokenizer))
    if self.cuda:
        self.net.cuda()

    # Called for its effect on ``self.net``; the returned values were never
    # used.  NOTE(review): presumably restores the trained checkpoint.
    load_state(self.net, None, None, self.args, load_best=False)
    logger.info("Done!")

    self.e1_id = self.tokenizer.convert_tokens_to_ids("[E1]")
    self.e2_id = self.tokenizer.convert_tokens_to_ids("[E2]")
    self.pad_id = self.tokenizer.pad_token_id
    self.rm = load_pickle("relations.pkl")
def create_model(args, vocab_size, is_prediction=False, is_validate=False):
    """Build the classification model graph.

    Called directly by the training and prediction modules; returns the
    computed tensors together with the matching DataLoader object.

    :param args: dict-like configuration.  Keys read here: ``vocab_size``,
        ``max_seq_length``, ``use_engineer``, ``mrc_layer``,
        ``freeze_pretrained_model``, ``num_labels``, ``use_fp16``,
        ``lstm_hidden_size`` (``"lstm"`` head only) and ``loss_scaling``
        (only when ``use_fp16`` is truthy).  ``args['vocab_size']`` is
        overwritten with the effective vocabulary size.
    :param vocab_size: vocabulary size used to build the embedding layer.
        Ignored when ``args['vocab_size']`` is positive.
    :param is_prediction: prediction mode; disables dropout and omits the
        labels from the feed list.
    :param is_validate: validation mode; disables dropout but still returns
        loss and accuracy, so the input data must contain the labels.
    :return: ``(reader, probs, qas_ids)`` when ``is_prediction`` is set and
        ``is_validate`` is not; otherwise
        ``(reader, loss, probs, accuracy, qas_ids)``.
    :raises ValueError: if ``args['mrc_layer']`` is not a supported head.
    """
    # A vocabulary size configured in ``args`` takes precedence.
    if args['vocab_size'] > 0:
        vocab_size = args['vocab_size']

    # Input placeholders.
    max_seq_length = args['max_seq_length']
    qas_ids = fluid.data(name='qas_ids', dtype='int64', shape=[-1, 1])
    src_ids = fluid.data(name='src_ids', dtype='int64',
                         shape=[-1, max_seq_length, 1])
    pos_ids = fluid.data(name='pos_ids', dtype='int64',
                         shape=[-1, max_seq_length, 1])
    sent_ids = fluid.data(name='sent_ids', dtype='int64',
                          shape=[-1, max_seq_length, 1])
    input_mask = fluid.data(name='input_mask', dtype='float32',
                            shape=[-1, max_seq_length, 1])
    labels = fluid.data(name='labels', dtype='int64', shape=[-1, 1])
    engineer_ids = fluid.data(name='engineer_ids', dtype='int64',
                              shape=[-1, max_seq_length, 1])

    config = args
    # Prediction batches carry no labels, so they are left out of the feed.
    if is_prediction:
        feed_list = [qas_ids, src_ids, pos_ids, sent_ids, input_mask]
    else:
        feed_list = [qas_ids, src_ids, pos_ids, sent_ids, input_mask, labels]
    if config['use_engineer']:
        feed_list.append(engineer_ids)
    reader = fluid.io.DataLoader.from_generator(feed_list=feed_list,
                                                capacity=64,
                                                iterable=True)

    # Dropout is disabled for both prediction and validation.
    is_test = is_prediction or is_validate

    # Encoder: BERT followed by a task-specific head.
    config['vocab_size'] = vocab_size
    bert = BertModel(src_ids=src_ids,
                     position_ids=pos_ids,
                     sentence_ids=sent_ids,
                     input_mask=input_mask,
                     config=config,
                     use_fp16=False,
                     is_prediction=is_test)

    mrc_layer = config['mrc_layer']
    freeze_pretrained_model = config['freeze_pretrained_model']

    cls_feats = bert.get_pooled_output()
    bert_encode = bert.get_sequence_output()
    if freeze_pretrained_model:
        cls_feats.stop_gradient = True
        bert_encode.stop_gradient = True

    if config['use_engineer']:
        # Append an embedding of the engineered ids to every position.
        engineer_emb = fluid.layers.embedding(input=engineer_ids,
                                              size=[32, 8])
        bert_encode = fluid.layers.concat(input=[bert_encode, engineer_emb],
                                          axis=-1)

    def _dropout(x):
        return fluid.layers.dropout(
            x=x,
            is_test=is_test,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")

    def _bilstm_classifier(cell, cell_r, encoded):
        # Shared tail of the two LSTM heads: forward and reverse RNN, concat,
        # dropout, then the final FC layer.
        outputs = fluid.layers.rnn(cell, encoded)[0][:, -1, :]
        outputs_r = fluid.layers.rnn(cell_r, encoded,
                                     is_reverse=True)[0][:, -1, :]
        feats = fluid.layers.concat(input=[outputs, outputs_r], axis=1)
        feats = _dropout(feats)
        return fluid.layers.fc(
            input=feats,
            size=args['num_labels'],
            param_attr=fluid.ParamAttr(
                name="lstm_fc_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="lstm_fc_b",
                initializer=fluid.initializer.Constant(0.)))

    if mrc_layer == "cls_fc":
        # Classify from the pooled [CLS] feature through one FC layer.
        cls_feats = _dropout(cls_feats)
        logits = fluid.layers.fc(
            input=cls_feats,
            size=args['num_labels'],
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b",
                initializer=fluid.initializer.Constant(0.)))
    elif mrc_layer == "capsNet":
        # Feed the full BERT sequence output into a capsule network.
        conv_attr = fluid.ParamAttr(
            name='conv2d.weight',
            initializer=fluid.initializer.Xavier(uniform=False),
            learning_rate=0.001)
        bert_output = fluid.layers.unsqueeze(input=bert_encode, axes=[1])
        capsules = fluid.layers.conv2d(input=bert_output,
                                       num_filters=256,
                                       filter_size=32,
                                       stride=15,
                                       padding="VALID",
                                       act="relu",
                                       param_attr=conv_attr)
        # Shape notes below are the original author's: (batch_size, 256, 33, 50)
        primary_caps = CapsLayer(num_outputs=32, vec_len=8,
                                 with_routing=False, layer_type='CONV')
        caps1 = primary_caps(capsules, kernel_size=9, stride=2)
        # (batch_size, 8736, 8, 1)
        classifier_caps = CapsLayer(num_outputs=args['num_labels'],
                                    vec_len=16,
                                    with_routing=True,
                                    layer_type='FC')
        caps2 = classifier_caps(caps1)
        # (batch_size, 3, 16, 1)
        epsilon = 1e-9
        # The length of each class capsule vector is used as its logit.
        v_length = fluid.layers.sqrt(
            fluid.layers.reduce_sum(
                fluid.layers.square(caps2), -2, keep_dim=True) + epsilon)
        logits = fluid.layers.squeeze(v_length, axes=[2, 3])
    elif mrc_layer == "lstm":
        hidden_size = args['lstm_hidden_size']
        cell = fluid.layers.LSTMCell(hidden_size=hidden_size)
        cell_r = fluid.layers.LSTMCell(hidden_size=hidden_size)
        # Drop the first position of the sequence before the recurrent layers.
        encoded = _dropout(bert_encode[:, 1:, :])
        logits = _bilstm_classifier(cell, cell_r, encoded)
    elif mrc_layer == "highway_lstm":
        hidden_size = 128
        cell = fluid.layers.LSTMCell(hidden_size=hidden_size)
        cell_r = fluid.layers.LSTMCell(hidden_size=hidden_size)
        encoded = _dropout(bert_encode[:, 1:, :])
        encoded = highway_layer(encoded, name="highway1", num_flatten_dims=2)
        encoded = _dropout(encoded)
        logits = _bilstm_classifier(cell, cell_r, encoded)
    else:
        # Previously ``logits`` stayed None and failed later inside fluid.
        raise ValueError("Unsupported mrc_layer: %r" % (mrc_layer,))

    # Pure prediction only needs the reader and per-label probabilities.
    if is_prediction and not is_validate:
        probs = fluid.layers.softmax(logits)
        return reader, probs, qas_ids

    # Training / validation: class-weighted cross-entropy loss.
    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels, return_softmax=True)
    # The weight table has exactly three rows, so the matmul below only lines
    # up when args["num_labels"] is 3; the third class is weighted 1.3.
    class_weight = fluid.layers.assign(
        np.array([[1.], [1.], [1.3]], dtype='float32'))
    one_hot = fluid.one_hot(input=labels, depth=args["num_labels"])
    sample_weight = fluid.layers.matmul(one_hot, class_weight)
    sample_weight = fluid.layers.reduce_sum(sample_weight, dim=1)
    loss = fluid.layers.mean(
        fluid.layers.elementwise_mul(sample_weight, ce_loss))

    # ``args`` is indexed like a dict everywhere else in this function, so
    # ``loss_scaling`` is read the same way.
    if args['use_fp16'] and args['loss_scaling'] > 1.0:
        loss *= args['loss_scaling']

    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=labels,
                                     total=num_seqs)

    # Reader, loss, predictions and accuracy.
    return reader, loss, probs, accuracy, qas_ids
def create_model(pyreader_name,
                 bert_config,
                 max_wn_concept_length,
                 max_nell_concept_length,
                 wn_concept_embedding_mat,
                 nell_concept_embedding_mat,
                 is_training=False,
                 freeze=False):
    """Build the span-extraction graph: BERT, concept memory, self-matching.

    Reads the module-level ``args`` for ``max_seq_len``, ``use_fp16`` and
    ``loss_scaling``.

    :param pyreader_name: name given to the ``py_reader``.
    :param bert_config: configuration passed to ``BertModel`` and
        ``MemoryLayer``; ``hidden_size`` and ``initializer_range`` are read
        from it here.
    :param max_wn_concept_length: size of the concept axis of the WordNet
        concept-id input.
    :param max_nell_concept_length: size of the concept axis of the NELL
        concept-id input.
    :param wn_concept_embedding_mat: matrix whose ``shape`` gives the WordNet
        embedding table size (rows, columns).
    :param nell_concept_embedding_mat: same, for the NELL table.
    :param is_training: when true the reader also yields start/end positions
        and the loss is returned; otherwise it yields ``unique_id``.
    :param freeze: stop gradients at the BERT sequence output.
    :return: ``(pyreader, total_loss, num_seqs)`` when training, otherwise
        ``(pyreader, unique_id, start_logits, end_logits, num_seqs)``.
    """
    seq_len = args.max_seq_len

    # Six slots are shared by both modes; training appends two label slots
    # (start/end positions) and inference appends one (unique_id).
    slot_shapes = [
        [-1, seq_len, 1],
        [-1, seq_len, 1],
        [-1, seq_len, 1],
        [-1, seq_len, max_wn_concept_length, 1],
        [-1, seq_len, max_nell_concept_length, 1],
        [-1, seq_len, 1],
    ]
    slot_dtypes = ['int64', 'int64', 'int64', 'int64', 'int64', 'float32']
    for _ in range(2 if is_training else 1):
        slot_shapes.append([-1, 1])
        slot_dtypes.append('int64')

    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=slot_shapes,
        dtypes=slot_dtypes,
        lod_levels=[0] * len(slot_shapes),
        name=pyreader_name,
        use_double_buffer=True)
    slots = fluid.layers.read_file(pyreader)
    (src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids,
     input_mask) = slots[:6]
    if is_training:
        start_positions, end_positions = slots[6:]
    else:
        (unique_id,) = slots[6:]

    # 1st layer: BERT
    bert = BertModel(src_ids=src_ids,
                     position_ids=pos_ids,
                     sentence_ids=sent_ids,
                     input_mask=input_mask,
                     config=bert_config,
                     use_fp16=args.use_fp16)
    enc_out = bert.get_sequence_output()
    if freeze:
        enc_out.stop_gradient = True
    logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))

    # 2nd layer: memory
    wn_concept_vocab_size, wn_concept_dim = wn_concept_embedding_mat.shape[:2]
    nell_concept_vocab_size, nell_concept_dim = \
        nell_concept_embedding_mat.shape[:2]

    def frozen_concept_embedding(concept_ids, table_size, table_name):
        # Non-trainable lookup; the parameter is only named and sized here.
        return fluid.layers.embedding(
            concept_ids,
            size=table_size,
            param_attr=fluid.ParamAttr(name=table_name,
                                       do_model_average=False,
                                       trainable=False),
            dtype='float32')

    def memory_length(concept_ids):
        # (ids == 0) as float, then (x - 1) * -1 flips it to 1 for non-zero
        # ids; summing over axis 2 counts the non-zero ids.
        is_zero = fluid.layers.equal(
            concept_ids,
            fluid.layers.fill_constant(shape=[1], value=0, dtype="int64"))
        is_zero = fluid.layers.cast(is_zero, dtype="float32")
        non_zero = fluid.layers.scale(
            fluid.layers.elementwise_sub(
                is_zero, fluid.layers.fill_constant([1], "float32", 1)),
            scale=-1)
        return fluid.layers.reduce_sum(non_zero, dim=2)

    wn_memory_embs = frozen_concept_embedding(
        wn_concept_ids, (wn_concept_vocab_size, wn_concept_dim),
        "wn_concept_emb_mat")
    nell_memory_embs = frozen_concept_embedding(
        nell_concept_ids, (nell_concept_vocab_size, nell_concept_dim),
        "nell_concept_emb_mat")

    wn_mem_length = memory_length(wn_concept_ids)
    nell_mem_length = memory_length(nell_concept_ids)

    # Select and integrate the memories for each knowledge source.
    wn_memory_layer = MemoryLayer(bert_config,
                                  max_wn_concept_length,
                                  wn_concept_dim,
                                  mem_method='raw',
                                  prefix='wn')
    wn_memory_output = wn_memory_layer.forward(enc_out,
                                               wn_memory_embs,
                                               wn_mem_length,
                                               ignore_no_memory_token=True)
    nell_memory_layer = MemoryLayer(bert_config,
                                    max_nell_concept_length,
                                    nell_concept_dim,
                                    mem_method='raw',
                                    prefix='nell')
    nell_memory_output = nell_memory_layer.forward(
        enc_out,
        nell_memory_embs,
        nell_mem_length,
        ignore_no_memory_token=True)

    memory_output = fluid.layers.concat(
        [enc_out, wn_memory_output, nell_memory_output], axis=2)

    # 3rd layer: self-matching; its input width is the concatenated width.
    memory_output_size = (bert_config['hidden_size'] + wn_concept_dim +
                          nell_concept_dim)
    logger.info("memory_output_size: {}".format(memory_output_size))

    self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
        memory_output_size,
        dropout_rate=0.0,
        cat_mul=True,
        cat_sub=True,
        cat_twotime=True,
        cat_twotime_mul=False,
        cat_twotime_sub=True)
    att_output = self_att_layer.forward(memory_output, input_mask)

    # 4th layer: output; two logits per position, split into start and end.
    logits = fluid.layers.fc(
        input=att_output,
        size=2,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            name="cls_squad_out_w",
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=bert_config['initializer_range'])),
        bias_attr=fluid.ParamAttr(
            name="cls_squad_out_b",
            initializer=fluid.initializer.Constant(0.)))
    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

    # Summing a batch-sized tensor of ones yields the number of sequences.
    batch_ones = fluid.layers.fill_constant_batch_size_like(
        input=start_logits, dtype='int64', shape=[1], value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    if not is_training:
        return pyreader, unique_id, start_logits, end_logits, num_seqs

    # Mean cross-entropy for the start and end position, then their average.
    span_losses = []
    for span_logits, span_positions in ((start_logits, start_positions),
                                        (end_logits, end_positions)):
        ce = fluid.layers.softmax_with_cross_entropy(logits=span_logits,
                                                     label=span_positions)
        span_losses.append(fluid.layers.mean(x=ce))
    start_loss, end_loss = span_losses
    total_loss = (start_loss + end_loss) / 2.0
    if args.use_fp16 and args.loss_scaling > 1.0:
        total_loss = total_loss * args.loss_scaling

    return pyreader, total_loss, num_seqs
def __init__(self, args=None):
    """Set up the network, tokenizer and FewRel data loader.

    :param args: configuration object exposing ``model_no``, ``model_size``
        and ``use_pretrained_blanks``.  When ``None`` it is loaded with
        ``load_pickle("args.pkl")``.
    :raises ValueError: if ``model_no`` is not 0, 1 or 2.
    """
    if args is None:
        # NOTE(review): presumably unpickles a local file - only point this
        # at trusted data.
        args = load_pickle("args.pkl")
    # From here on ``args`` and ``self.args`` are the same object.  The
    # original kept reading the raw parameter, which is None on this path and
    # crashed on ``args.model_size`` and in ``load_dataloaders(args)``.
    self.args = args
    self.cuda = torch.cuda.is_available()

    if args.model_no == 0:
        from model.bert import BertModel as Model
        from model.bert_tokenizer import BertTokenizer as Tokenizer

        model = args.model_size  # 'bert-large-uncased' 'bert-base-uncased'
        model_name = "BERT"
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=args.model_size,
            task="fewrel",
        )
    elif args.model_no == 1:
        from model.albert.albert import AlbertModel as Model
        from model.albert.albert_tokenizer import (
            AlbertTokenizer as Tokenizer,
        )

        model = args.model_size  # 'albert-base-v2'
        # NOTE(review): ALBERT reuses the "BERT" tokenizer pickle name, as in
        # the original code - confirm this is intended.
        model_name = "BERT"
        self.net = Model.from_pretrained(
            model,
            force_download=False,
            model_size=args.model_size,
            task="fewrel",
        )
    elif args.model_no == 2:  # BioBert
        from model.bert import BertModel, BertConfig
        from model.bert_tokenizer import BertTokenizer as Tokenizer

        model = "bert-base-uncased"
        model_name = "BioBERT"
        config = BertConfig.from_pretrained(
            "./additional_models/biobert_v1.1_pubmed/bert_config.json"
        )
        self.net = BertModel.from_pretrained(
            pretrained_model_name_or_path="./additional_models/biobert_v1.1_pubmed/biobert_v1.1_pubmed.bin",
            config=config,
            force_download=False,
            model_size="bert-base-uncased",
            task="fewrel",
        )
    else:
        # Previously an unknown id fell through to unbound local names.
        raise ValueError(
            "Unsupported model_no: %r (expected 0, 1 or 2)" % (args.model_no,)
        )

    if os.path.isfile("./data/%s_tokenizer.pkl" % model_name):
        self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
        logger.info("Loaded tokenizer from saved file.")
    else:
        logger.info(
            "Saved tokenizer not found, initializing new tokenizer..."
        )
        if args.model_no == 2:
            self.tokenizer = Tokenizer(
                vocab_file="./additional_models/biobert_v1.1_pubmed/vocab.txt",
                do_lower_case=False,
            )
        else:
            self.tokenizer = Tokenizer.from_pretrained(
                model, do_lower_case=False
            )
        # Register the entity-marker and blank tokens, then save the
        # tokenizer so the branch above finds it next time.
        self.tokenizer.add_tokens(
            ["[E1]", "[/E1]", "[E2]", "[/E2]", "[BLANK]"]
        )
        save_as_pickle("%s_tokenizer.pkl" % model_name, self.tokenizer)
        logger.info(
            "Saved %s tokenizer at ./data/%s_tokenizer.pkl"
            % (model_name, model_name)
        )

    # Match the embedding table to the tokenizer's vocabulary size.
    self.net.resize_token_embeddings(len(self.tokenizer))
    self.pad_id = self.tokenizer.pad_token_id

    if self.cuda:
        self.net.cuda()

    if args.use_pretrained_blanks == 1:
        logger.info(
            "Loading model pre-trained on blanks at ./data/test_checkpoint_%d.pth.tar..."
            % args.model_no
        )
        checkpoint_path = "./data/test_checkpoint_%d.pth.tar" % args.model_no
        # Without CUDA, remap tensors to the CPU so a checkpoint saved on a
        # GPU machine still loads.
        checkpoint = torch.load(
            checkpoint_path, map_location=None if self.cuda else "cpu"
        )
        # Keep only weights whose names exist in the current network.
        known_keys = self.net.state_dict().keys()
        pretrained_dict = {
            k: v
            for k, v in checkpoint["state_dict"].items()
            if k in known_keys
        }
        self.net.load_state_dict(pretrained_dict, strict=False)
        del checkpoint, pretrained_dict

    logger.info("Loading Fewrel dataloaders...")
    self.train_loader, _, self.train_length, _ = load_dataloaders(args)
def __init__(self,
             l2renorm,
             expert_dims,
             tokenizer,
             keep_missing_modalities,
             test_caption_mode,
             freeze_weights=False,
             mimic_ce_dims=False,
             concat_experts=False,
             concat_mix_experts=False,
             use_experts='origfeat',
             txt_inp=None,
             txt_agg=None,
             txt_pro=None,
             txt_wgh=None,
             vid_inp=None,
             vid_cont=None,
             vid_wgh=None,
             pos_enc=None,
             out_tok=None,
             use_mask='nomask',
             same_dim=512,
             vid_bert_params=None,
             txt_bert_params=None,
             agg_dims=None,
             device=None,
             normalize_experts=True):
    """Create the video and text sub-modules selected by the option strings.

    Only the options that drive module construction are described; every
    other argument is stored unchanged on ``self`` (``agg_dims`` and
    ``device`` are accepted but not used here).

    Args:
        expert_dims: dict keyed by modality; ``expert_dims[mod]['dim']`` is
            the input width handed to ``ReduceDim``.
        tokenizer: its ``we_model`` is used as the word-embedding module
            when ``txt_inp`` is not one of the ``bert*`` options.
        txt_inp: ``'bertfrz'``, ``'bertftn'``, ``'bertscr'`` or anything
            else (falls back to ``tokenizer.we_model``).
        txt_agg: either a string matching ``bert<state><layers><post_agg>``
            (e.g. a three-letter state such as ``ftn``/``frz``) or one of
            ``'vlad'``, ``'mxp'``, ``'mnp'``, ``'lstm'``.  Must be a string;
            it is sliced unconditionally.
        txt_pro: ``'gbn'``, ``'gem'`` or ``'lin'`` - type of the per-modality
            text projection stored in ``self.text_gu``.
        txt_wgh / vid_wgh: ``'emb'`` adds one ``nn.Linear(..., 1)`` per
            modality plus a dropout layer.
        vid_inp: ``'agg'``, ``'both'``, ``'all'`` or ``'temp'`` adds a
            ``ReduceDim`` per modality.
        vid_cont: ``'coll'``, ``'bert'`` or ``'none'``.
        same_dim: width of the common embedding space.
        vid_bert_params: dict; ``hidden_dropout_prob`` is read for the
            ``'coll'`` and ``'emb'`` options and as the default text dropout.
        txt_bert_params: optional dict of overrides forwarded to
            ``TxtBertModel.from_pretrained``.
    """
    super().__init__()

    self.sanity_checks = False
    modalities = list(expert_dims.keys())
    self.expert_dims = expert_dims
    self.modalities = modalities
    logger.debug(self.modalities)
    self.mimic_ce_dims = mimic_ce_dims
    self.concat_experts = concat_experts
    self.concat_mix_experts = concat_mix_experts
    self.test_caption_mode = test_caption_mode
    self.freeze_weights = freeze_weights
    self.use_experts = use_experts
    self.use_mask = use_mask
    self.keep_missing_modalities = keep_missing_modalities
    self.l2renorm = l2renorm
    self.same_dim = same_dim
    self.txt_inp = txt_inp
    self.txt_agg = txt_agg
    self.txt_pro = txt_pro
    self.txt_wgh = txt_wgh
    self.vid_inp = vid_inp
    self.vid_cont = vid_cont
    self.vid_wgh = vid_wgh
    self.pos_enc = pos_enc
    self.out_tok = out_tok
    self.vid_bert_params = vid_bert_params
    self.normalize_experts = normalize_experts

    # Text side has one extra pseudo-modality, 'total'.
    self.text_modalities = self.modalities.copy()
    self.text_modalities.append('total')

    # ---- Video input projection -------------------------------------------
    self.video_dim_reduce = nn.ModuleDict()
    if self.vid_inp in ['agg', 'both', 'all', 'temp']:
        for mod in self.modalities:
            in_dim = expert_dims[mod]['dim']
            self.video_dim_reduce[mod] = ReduceDim(in_dim, same_dim)

    # ---- Video contextualisation ------------------------------------------
    if self.vid_cont == 'coll':
        self.g_reason_1 = nn.Linear(same_dim * 2, same_dim)
        dout_prob = vid_bert_params['hidden_dropout_prob']
        self.coll_g_dropout = nn.Dropout(dout_prob)
        self.g_reason_2 = nn.Linear(same_dim, same_dim)

        self.f_reason_1 = nn.Linear(same_dim, same_dim)
        self.coll_f_dropout = nn.Dropout(dout_prob)
        self.f_reason_2 = nn.Linear(same_dim, same_dim)
        self.f_reason_3 = nn.Linear(same_dim, same_dim)

        self.batch_norm_g1 = nn.BatchNorm1d(same_dim)
        self.batch_norm_g2 = nn.BatchNorm1d(same_dim)
        self.batch_norm_f1 = nn.BatchNorm1d(same_dim)
        self.batch_norm_f2 = nn.BatchNorm1d(same_dim)

        # The original created ``self.video_gu`` but filled
        # ``self.video_GU[mod]``, which raises AttributeError on an
        # nn.Module.  The declared (lower-case) name is kept, matching
        # ``self.text_gu`` below.
        # NOTE(review): confirm no other method still reads ``video_GU``.
        self.video_gu = nn.ModuleDict()
        for mod in self.modalities:
            self.video_gu[mod] = GatedEmbeddingUnitReasoning(same_dim)

    # If Bert architecture is employed for video
    elif self.vid_cont == 'bert':
        vid_bert_config = types.SimpleNamespace(**vid_bert_params)
        # One video BERT per text modality (including 'total').
        self.vid_bert = nn.ModuleDict()
        for mod in self.text_modalities:
            self.vid_bert[mod] = BertModel(vid_bert_config)

    elif self.vid_cont == 'none':
        pass

    # ---- Text encoder / aggregation ---------------------------------------
    if self.txt_agg[:4] == 'bert':
        # txt_agg encodes: 3-letter state, optional layer count, optional
        # post-aggregation suffix.
        z = re.match(r'bert([a-z]{3})(\d*)(\D*)', txt_agg)
        assert z
        state, freeze_until, post_agg = z.groups()

        # Post aggregation: Use [CLS] token ("cls") or aggregate all tokens
        # (mxp, mnp)
        if post_agg and post_agg != 'cls':
            self.post_agg = post_agg
        else:
            self.post_agg = 'cls'

        if state in ['ftn', 'frz']:
            # State is finetune or frozen, we use a pretrained bert model
            txt_bert_config = 'bert-base-uncased'

            # Default the text dropout settings to the video BERT's value.
            if txt_bert_params is None:
                dout_prob = vid_bert_params['hidden_dropout_prob']
                txt_bert_params = {
                    'hidden_dropout_prob': dout_prob,
                    'attention_probs_dropout_prob': dout_prob,
                }
            self.txt_bert = TxtBertModel.from_pretrained(
                txt_bert_config,
                cache_dir='/youtu_pedestrian_detection/wenzhewang/mmt_data/cache_dir',
                **txt_bert_params)

        if state == 'frz':
            if freeze_until:
                # Freeze only encoder layers with index < freeze_until.
                freeze_until = int(freeze_until)
                logger.debug('Freezing text bert until layer %d excluded',
                             freeze_until)
                for name, param in self.txt_bert.named_parameters():
                    parts = name.split('.')
                    # Only names whose third component is a layer index
                    # are candidates.
                    if not parts[2].isdigit():
                        continue
                    layer_nb = int(parts[2])
                    if parts[0] == 'encoder' and layer_nb < freeze_until:
                        param.requires_grad = False
                        logger.debug(name)
            else:
                # Freeze the whole encoder.
                for name, param in self.txt_bert.named_parameters():
                    if name.split('.')[0] == 'encoder':
                        param.requires_grad = False
        else:
            assert not freeze_until

        if self.txt_inp == 'bertfrz':
            # Freeze the embedding layer as well.
            for param in self.txt_bert.embeddings.parameters():
                param.requires_grad = False
        elif self.txt_inp not in ['bertftn']:
            logger.error('Wrong parameter for the text encoder')
        text_dim = self.txt_bert.config.hidden_size

    elif self.txt_agg in ['vlad', 'mxp', 'mnp', 'lstm']:
        # Need to get text embeddings
        if self.txt_inp == 'bertfrz':
            ckpt = '/youtu_pedestrian_detection/wenzhewang/mmt_data/word_embeddings/bert/ckpt_from_huggingface.pth'
            self.word_embeddings = TxtEmbeddings(ckpt=ckpt, freeze=True)
        elif self.txt_inp == 'bertftn':
            ckpt = '/youtu_pedestrian_detection/wenzhewang/mmt_data/word_embeddings/bert/ckpt_from_huggingface.pth'
            self.word_embeddings = TxtEmbeddings(ckpt=ckpt)
        elif self.txt_inp == 'bertscr':
            vocab_size = 28996
            emb_dim = 768
            self.word_embeddings = TxtEmbeddings(vocab_size, emb_dim)
        else:
            self.word_embeddings = tokenizer.we_model
        emb_dim = self.word_embeddings.text_dim

        # NOTE(review): 'mnp' is accepted above but sets no ``text_dim``, so
        # the projection / weighting code below fails for it - confirm the
        # intended width before adding a branch.
        if self.txt_agg == 'vlad':
            self.text_pooling = NetVLAD(
                feature_size=emb_dim,
                cluster_size=28,
            )
            text_dim = self.text_pooling.out_dim
        elif self.txt_agg == 'mxp':
            text_dim = emb_dim
        elif self.txt_agg == 'lstm':
            input_dim = self.word_embeddings.text_dim
            hidden_dim = 512
            layer_dim = 1
            output_dim = hidden_dim
            self.text_pooling = LSTMModel(input_dim, hidden_dim, layer_dim,
                                          output_dim)
            text_dim = output_dim

    # ---- Text projection into the common space ------------------------------
    self.text_gu = nn.ModuleDict()
    if self.txt_pro in ('gbn', 'gem'):
        # 'gbn' and 'gem' differ only in whether batch norm is used.
        use_bn = self.txt_pro == 'gbn'
        for mod in self.text_modalities:
            self.text_gu[mod] = GatedEmbeddingUnit(
                text_dim,
                same_dim,
                max_text_words=30,
                dim=2,
                use_bn=use_bn,
                normalize=self.normalize_experts)
        self.text_gu['align'] = GatedEmbeddingUnit(
            text_dim,
            same_dim,
            max_text_words=30,
            dim=3,
            use_bn=use_bn,
            normalize=self.normalize_experts)
    elif self.txt_pro == 'lin':
        for mod in self.text_modalities:
            self.text_gu[mod] = ReduceDim(text_dim, same_dim)
        self.text_gu['align'] = ReduceDim(text_dim, same_dim)

    # ---- Weighting of each modality similarity ------------------------------
    if self.txt_wgh == 'emb':
        self.moe_fc_txt = nn.ModuleDict()
        dout_prob = txt_bert_params['hidden_dropout_prob']
        self.moe_txt_dropout = nn.Dropout(dout_prob)
        for mod in self.text_modalities:
            self.moe_fc_txt[mod] = nn.Linear(text_dim, 1)
    if self.vid_wgh == 'emb':
        self.moe_fc_vid = nn.ModuleDict()
        dout_prob = vid_bert_params['hidden_dropout_prob']
        self.moe_vid_dropout = nn.Dropout(dout_prob)
        for mod in self.modalities:
            self.moe_fc_vid[mod] = nn.Linear(self.same_dim, 1)

    # The tokenizer is only kept when this debugging switch is turned on.
    self.debug_dataloader = False
    if self.debug_dataloader:
        self.tokenizer = tokenizer
def __init__(self, modalities, expert_dims, same_dim, vid_inp, vid_cont,
             vid_wgh, vid_bert_params, pos_enc, out_tok,
             keep_missing_modalities):
    """Create the video-side sub-modules selected by the configuration.

    Args:
        modalities: all modalities used to form video features.
        expert_dims: dict; ``expert_dims[mod]['dim']`` is the feature
            dimension of modality ``mod``.
        same_dim: dimension of the common embedding space.
        vid_inp: video input mode; per-modality ``ReduceDim`` layers are
            only created for 'agg', 'both', 'all' and 'temp'.
        vid_cont: model used to embed the features ('coll' for
            collaborative gating, 'bert', or 'none').
        vid_wgh: method used to compute the modality weights; 'emb' adds
            one linear scoring layer per modality.
        vid_bert_params: dict of BERT settings. 'hidden_dropout_prob' is
            read for the dropout layers; the whole dict becomes the
            ``BertModel`` config when ``vid_cont == 'bert'``.
        pos_enc: stored on the instance (used with ``vid_cont == 'bert'``).
        out_tok: stored on the instance.
        keep_missing_modalities: stored on the instance.
    """
    super().__init__()

    # Keep the raw configuration around for later use.
    self.modalities = modalities
    self.expert_dims = expert_dims
    self.same_dim = same_dim
    self.vid_inp = vid_inp
    self.vid_cont = vid_cont
    self.vid_wgh = vid_wgh
    self.vid_bert_params = vid_bert_params
    self.pos_enc = pos_enc
    self.out_tok = out_tok
    self.keep_missing_modalities = keep_missing_modalities

    # Per-modality projection into the common space.
    self.video_dim_reduce = nn.ModuleDict()
    for modality in self.modalities:
        feature_dim = expert_dims[modality]['dim']
        if self.vid_inp in ('agg', 'both', 'all', 'temp'):
            self.video_dim_reduce[modality] = ReduceDim(feature_dim, same_dim)

    contextualiser = self.vid_cont
    if contextualiser == 'coll':
        # Collaborative gating: the g/f reasoning stacks, their batch norms
        # and one gated unit per modality, created in this exact order.
        drop_p = vid_bert_params['hidden_dropout_prob']
        self.g_reason_1 = nn.Linear(same_dim * 2, same_dim)
        self.coll_g_dropout = nn.Dropout(drop_p)
        self.g_reason_2 = nn.Linear(same_dim, same_dim)

        self.f_reason_1 = nn.Linear(same_dim, same_dim)
        self.coll_f_dropout = nn.Dropout(drop_p)
        self.f_reason_2 = nn.Linear(same_dim, same_dim)
        self.f_reason_3 = nn.Linear(same_dim, same_dim)

        self.batch_norm_g1 = nn.BatchNorm1d(same_dim)
        self.batch_norm_g2 = nn.BatchNorm1d(same_dim)
        self.batch_norm_f1 = nn.BatchNorm1d(same_dim)
        self.batch_norm_f2 = nn.BatchNorm1d(same_dim)

        self.video_GU = nn.ModuleDict()
        for modality in self.modalities:
            self.video_GU[modality] = GatedEmbeddingUnitReasoning(same_dim)
    elif contextualiser == 'bert':
        # BERT architecture employed for video: params dict -> attribute-style config.
        self.vid_bert = BertModel(types.SimpleNamespace(**vid_bert_params))
    # contextualiser == 'none' (or anything else): nothing extra to build.

    if self.vid_wgh == 'emb':
        # One scalar scoring head per modality for weighting similarities.
        self.moe_fc_vid = nn.ModuleDict()
        drop_p = vid_bert_params['hidden_dropout_prob']
        self.moe_vid_dropout = nn.Dropout(drop_p)
        for modality in self.modalities:
            self.moe_fc_vid[modality] = nn.Linear(self.same_dim, 1)
def create_model(pyreader_name, bert_config, is_training=False):
    """Build the BERT span-extraction graph fed through a ``py_reader``.

    Args:
        pyreader_name: name given to the created ``py_reader``.
        bert_config: configuration object handed to ``BertModel``.
        is_training: when true the reader also yields start/end positions
            and the averaged cross-entropy loss is built; otherwise the
            reader yields ``unique_id`` and raw logits are returned.

    Returns:
        ``(pyreader, total_loss, num_seqs)`` when training, else
        ``(pyreader, unique_id, start_logits, end_logits, num_seqs)``.

    Reads ``max_seq_len``, ``use_fp16`` and ``loss_scaling`` from the
    module-level ``args``.
    """
    # Four [-1, max_seq_len, 1] sequence slots, followed by [-1, 1] slots:
    # two (start/end positions) for training, one (unique_id) otherwise.
    tail_slots = 2 if is_training else 1
    slot_shapes = [[-1, args.max_seq_len, 1] for _ in range(4)]
    slot_shapes.extend([-1, 1] for _ in range(tail_slots))
    slot_dtypes = ['int64', 'int64', 'int64', 'float32'] + ['int64'] * tail_slots

    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=slot_shapes,
        dtypes=slot_dtypes,
        lod_levels=[0] * len(slot_shapes),
        name=pyreader_name,
        use_double_buffer=True)

    if is_training:
        (src_ids, pos_ids, sent_ids, input_mask,
         start_positions, end_positions) = fluid.layers.read_file(pyreader)
    else:
        (src_ids, pos_ids, sent_ids, input_mask,
         unique_id) = fluid.layers.read_file(pyreader)

    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    # Two scores (start, end) for every token of the sequence output.
    weight_attr = fluid.ParamAttr(
        name="cls_squad_out_w",
        initializer=fluid.initializer.TruncatedNormal(scale=0.02))
    bias_attr = fluid.ParamAttr(
        name="cls_squad_out_b",
        initializer=fluid.initializer.Constant(0.))
    logits = fluid.layers.fc(
        input=bert.get_sequence_output(),
        size=2,
        num_flatten_dims=2,
        param_attr=weight_attr,
        bias_attr=bias_attr)

    # Move the start/end axis to the front and split it.
    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

    # Count the sequences in the batch by summing a per-example 1.
    batch_ones = fluid.layers.fill_constant_batch_size_like(
        input=start_logits, dtype='int64', shape=[1], value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    if not is_training:
        return pyreader, unique_id, start_logits, end_logits, num_seqs

    head_losses = []
    for head_logits, gold_positions in ((start_logits, start_positions),
                                        (end_logits, end_positions)):
        cross_entropy = fluid.layers.softmax_with_cross_entropy(
            logits=head_logits, label=gold_positions)
        head_losses.append(fluid.layers.mean(x=cross_entropy))
    start_loss, end_loss = head_losses

    total_loss = (start_loss + end_loss) / 2.0
    if args.use_fp16 and args.loss_scaling > 1.0:
        total_loss = total_loss * args.loss_scaling
    return pyreader, total_loss, num_seqs
def create_model(bert_config, is_training=False):
    """Build the BERT span-extraction graph fed through a ``DataLoader``.

    Args:
        bert_config: configuration object handed to ``BertModel``.
        is_training: when true the inputs include start/end positions and
            the averaged cross-entropy loss is built; otherwise the inputs
            include ``unique_id`` and raw logits are returned.

    Returns:
        ``(data_loader, total_loss, num_seqs)`` when training, else
        ``(data_loader, unique_id, start_logits, end_logits, num_seqs)``.

    Reads ``use_fp16`` from the module-level ``args``.
    """
    # (name, shape, dtype) of every feed slot; all slots use lod_level 0.
    slot_specs = [
        ('src_ids', [None, None], 'int64'),
        ('pos_ids', [None, None], 'int64'),
        ('sent_ids', [None, None], 'int64'),
        ('input_mask', [None, None, 1], 'float32'),
    ]
    if is_training:
        slot_specs.append(('start_positions', [None, 1], 'int64'))
        slot_specs.append(('end_positions', [None, 1], 'int64'))
    else:
        slot_specs.append(('unique_id', [None, 1], 'int64'))

    inputs = []
    for slot_name, slot_shape, slot_dtype in slot_specs:
        inputs.append(
            fluid.data(name=slot_name,
                       shape=slot_shape,
                       dtype=slot_dtype,
                       lod_level=0))

    data_loader = fluid.io.DataLoader.from_generator(
        feed_list=inputs, capacity=50, iterable=False)

    if is_training:
        (src_ids, pos_ids, sent_ids, input_mask,
         start_positions, end_positions) = inputs
    else:
        (src_ids, pos_ids, sent_ids, input_mask, unique_id) = inputs

    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    # Two scores (start, end) for every token of the sequence output.
    weight_attr = fluid.ParamAttr(
        name="cls_squad_out_w",
        initializer=fluid.initializer.TruncatedNormal(scale=0.02))
    bias_attr = fluid.ParamAttr(
        name="cls_squad_out_b",
        initializer=fluid.initializer.Constant(0.))
    logits = fluid.layers.fc(
        input=bert.get_sequence_output(),
        size=2,
        num_flatten_dims=2,
        param_attr=weight_attr,
        bias_attr=bias_attr)

    # Move the start/end axis to the front and split it.
    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

    # Count the sequences in the batch by summing a per-example 1.
    batch_ones = fluid.layers.fill_constant_batch_size_like(
        input=start_logits, dtype='int64', shape=[1], value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    if not is_training:
        return data_loader, unique_id, start_logits, end_logits, num_seqs

    head_losses = []
    for head_logits, gold_positions in ((start_logits, start_positions),
                                        (end_logits, end_positions)):
        cross_entropy = fluid.layers.softmax_with_cross_entropy(
            logits=head_logits, label=gold_positions)
        head_losses.append(fluid.layers.mean(x=cross_entropy))
    start_loss, end_loss = head_losses

    total_loss = (start_loss + end_loss) / 2.0
    return data_loader, total_loss, num_seqs
def create_model(args, bert_config, num_labels, is_prediction=False):
    """Build a BERT sequence-classification graph fed through a ``PyReader``.

    Args:
        args: options object; ``max_seq_len``, ``use_fp16`` and
            ``loss_scaling`` are read.
        bert_config: configuration object handed to ``BertModel``.
        num_labels: output size of the classification layer.
        is_prediction: when true, return class probabilities and the names
            of the feed variables needed for inference (labels excluded).

    Returns:
        ``(pyreader, probs, feed_targets_name)`` when ``is_prediction``,
        else ``(pyreader, loss, probs, accuracy, num_seqs)``.
    """
    # (name, shape, dtype) of every feed slot; all slots use lod_level 0.
    slot_specs = (
        ('src_ids', [-1, args.max_seq_len, 1], 'int64'),
        ('pos_ids', [-1, args.max_seq_len, 1], 'int64'),
        ('sent_ids', [-1, args.max_seq_len, 1], 'int64'),
        ('input_mask', [-1, args.max_seq_len, 1], 'float32'),
        ('labels', [-1, 1], 'int64'),
    )
    inputs = []
    for slot_name, slot_shape, slot_dtype in slot_specs:
        inputs.append(
            fluid.layers.data(
                name=slot_name,
                shape=slot_shape,
                dtype=slot_dtype,
                lod_level=0))
    src_ids, pos_ids, sent_ids, input_mask, labels = inputs

    pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False)

    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    # Pooled output -> dropout -> linear classifier.
    pooled = fluid.layers.dropout(
        x=bert.get_pooled_output(),
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train")
    weight_attr = fluid.ParamAttr(
        name="cls_out_w",
        initializer=fluid.initializer.TruncatedNormal(scale=0.02))
    bias_attr = fluid.ParamAttr(
        name="cls_out_b",
        initializer=fluid.initializer.Constant(0.))
    logits = fluid.layers.fc(
        input=pooled,
        size=num_labels,
        param_attr=weight_attr,
        bias_attr=bias_attr)

    if is_prediction:
        probs = fluid.layers.softmax(logits)
        feed_targets_name = [
            var.name for var in (src_ids, pos_ids, sent_ids, input_mask)
        ]
        return pyreader, probs, feed_targets_name

    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    scale_loss = args.use_fp16 and args.loss_scaling > 1.0
    if scale_loss:
        loss *= args.loss_scaling

    # `accuracy` writes the number of evaluated samples into `num_seqs`.
    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)

    return pyreader, loss, probs, accuracy, num_seqs
def create_model(pyreader_name, bert_config, is_training=False):
    """Build the BERT span-extraction graph with an extra distillation loss.

    Args:
        pyreader_name: name given to the created ``py_reader``.
        bert_config: configuration object handed to ``BertModel``.
        is_training: when true the reader yields ten slots (model inputs,
            gold start/end positions, teacher start/end logits, the mixing
            factor ``la`` and per-position ``loss_weights``) and the mixed
            loss is built; otherwise it yields the four model inputs plus
            ``unique_id`` and raw logits are returned.

    Returns:
        ``(pyreader, total_loss, num_seqs)`` when training, else
        ``(pyreader, unique_id, start_logits, end_logits, num_seqs)``.

    Reads ``max_seq_len``, ``use_fp16`` and ``loss_scaling`` from the
    module-level ``args``.
    """
    if is_training:
        # Slots 0-3: model inputs; 4-5: gold positions; 6-7: teacher logits
        # ([-1, max_seq_len]); 8: la ([-1, 1]); 9: loss_weights ([-1, max_seq_len]).
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, 1], [-1, 1], [-1, args.max_seq_len],
                    [-1, args.max_seq_len], [-1, 1], [-1, args.max_seq_len]],
            dtypes=[
                'int64', 'int64', 'int64', 'float32', 'int64', 'int64',
                'float32', 'float32', 'float32', 'float32'
            ],
            lod_levels=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            name=pyreader_name,
            use_double_buffer=True)
        (src_ids, pos_ids, sent_ids, input_mask, start_positions,
         end_positions, KD_start_logits, KD_end_logits, la,
         loss_weights) = fluid.layers.read_file(pyreader)
    else:
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, 1]],
            dtypes=['int64', 'int64', 'int64', 'float32', 'int64'],
            lod_levels=[0, 0, 0, 0, 0],
            name=pyreader_name,
            use_double_buffer=True)
        (src_ids, pos_ids, sent_ids, input_mask,
         unique_id) = fluid.layers.read_file(pyreader)

    bert = BertModel(src_ids=src_ids,
                     position_ids=pos_ids,
                     sentence_ids=sent_ids,
                     input_mask=input_mask,
                     config=bert_config,
                     use_fp16=args.use_fp16)

    enc_out = bert.get_sequence_output()

    # Two scores (start, end) for every token of the sequence output.
    logits = fluid.layers.fc(
        input=enc_out,
        size=2,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            name="cls_squad_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(name="cls_squad_out_b",
                                  initializer=fluid.initializer.Constant(0.)))

    # Move the start/end axis to the front and split it.
    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

    # Count the sequences in the batch by summing a per-example 1.
    batch_ones = fluid.layers.fill_constant_batch_size_like(input=start_logits,
                                                            dtype='int64',
                                                            shape=[1],
                                                            value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    if is_training:

        def compute_loss(logits, positions, loss_weights):
            # Softmax + cross-entropy against the gold position, scaled by
            # the first column of loss_weights, then averaged.
            # NOTE(review): cross_entropy output vs. loss_weights[:, 0] rely
            # on fluid's broadcasting rules - confirm the shapes line up.
            logits = fluid.layers.softmax(logits)
            loss = fluid.layers.cross_entropy(
                input=logits, label=positions) * loss_weights[:, 0]
            loss = fluid.layers.mean(x=loss)
            return loss

        # Disabled alternative: KL-divergence distillation loss.
        # KLloss_start = fluid.layers.kldiv_loss(x=start_logits, target=KD_start_logits, reduction='mean')
        # KLloss_end = fluid.layers.kldiv_loss(x=end_logits, target=KD_end_logits, reduction='mean')
        # KLloss = (KLloss_start + KLloss_end) / 2.0

        # 1 where the teacher start logit is below 999999999, else 0; the same
        # mask is used for both the start and the end distillation terms.
        # Presumably larger values are a sentinel for "no teacher signal" -
        # verify against the data reader.
        KD_loss_mask = fluid.layers.cast(KD_start_logits < 999999999, 'int64')

        def diff_loss(batched_a, batched_b, KD_loss_mask, loss_weights):
            # Masked, weighted squared difference, normalised by the number
            # of unmasked entries.
            # NOTE(review): KD_loss_mask is int64 while the other operands
            # are float - confirm fluid accepts the mixed-dtype multiply/divide.
            diff = batched_a - batched_b
            loss = diff * diff * KD_loss_mask * loss_weights
            loss = fluid.layers.reduce_sum(loss) / fluid.layers.reduce_sum(
                KD_loss_mask)
            return loss

        start_loss = compute_loss(start_logits, start_positions, loss_weights)
        end_loss = compute_loss(end_logits, end_positions, loss_weights)
        KDloss_start = diff_loss(start_logits, KD_start_logits, KD_loss_mask,
                                 loss_weights)
        KDloss_end = diff_loss(end_logits, KD_end_logits, KD_loss_mask,
                               loss_weights)
        KDloss = (KDloss_start + KDloss_end) / 2.0

        # `la` interpolates between the supervised loss and the distillation loss.
        total_loss = (1 - la) * (start_loss + end_loss) / 2.0 + la * KDloss
        if args.use_fp16 and args.loss_scaling > 1.0:
            total_loss = total_loss * args.loss_scaling

        return pyreader, total_loss, num_seqs
    else:
        return pyreader, unique_id, start_logits, end_logits, num_seqs
def __init__(self, bert_config, num_labels):
    """Set up the BERT encoder and the linear classification head.

    Args:
        bert_config: mapping passed to ``BertModel``; its 'hidden_size'
            entry is the input width of the classification layer.
        num_labels: output width of the classification layer.
    """
    super(Classifier, self).__init__()
    self.bert = BertModel(bert_config)
    # The head maps the encoder's hidden size onto the label space.
    encoder_width = bert_config['hidden_size']
    self.cls_out = nn.Linear(encoder_width, num_labels)
def create_model(args,
                 pyreader_name,
                 bert_config,
                 num_labels,
                 is_prediction=False):
    """Define the fine-tuning classification model.

    A BERT encoder is followed by one of several feature extractors chosen
    by ``args.sub_model_type`` ('raw', 'cnn', 'gru' or 'ffa'), then dropout
    and a linear classifier.

    Args:
        args: options object; ``binary``, ``max_seq_len``, ``use_fp16``,
            ``sub_model_type`` and ``loss_scaling`` are read.
        pyreader_name: name given to the created ``py_reader``.
        bert_config: configuration object handed to ``BertModel``.
        num_labels: output size of the classification layer.
        is_prediction: when true, return probabilities and the names of the
            four BERT feed variables instead of the training outputs.

    Returns:
        ``(pyreader, probs, feed_targets_name)`` when ``is_prediction``,
        else ``(pyreader, loss, probs, accuracy, labels, num_seqs)``.

    Raises:
        NotImplementedError: if ``args.sub_model_type`` is not one of the
            supported values.
    """
    if args.binary:
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, 1], [-1, 1]],
            dtypes=['int64', 'int64', 'int64', 'float32', 'int64', 'int64'],
            lod_levels=[0, 0, 0, 0, 0, 0],
            name=pyreader_name,
            use_double_buffer=True)

    # NOTE(review): only the `args.binary` reader layout is defined in this
    # block; with `args.binary` false, `pyreader` is unbound here.
    (src_ids, pos_ids, sent_ids, input_mask, seq_len,
     labels) = fluid.layers.read_file(pyreader)

    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    if args.sub_model_type == 'raw':
        # Plain BERT: use the pooled output directly.
        cls_feats = bert.get_pooled_output()

    elif args.sub_model_type == 'cnn':
        # Sequence convolutions of width 3/4/5, each max-pooled, then concatenated.
        bert_seq_out = bert.get_sequence_output()
        bert_seq_out = fluid.layers.sequence_unpad(bert_seq_out, seq_len)
        cnn_hidden_size = 100
        convs = []
        for h in [3, 4, 5]:
            conv_feats = fluid.layers.sequence_conv(
                input=bert_seq_out, num_filters=cnn_hidden_size, filter_size=h)
            conv_feats = fluid.layers.batch_norm(input=conv_feats, act="relu")
            conv_feats = fluid.layers.sequence_pool(
                input=conv_feats, pool_type='max')
            convs.append(conv_feats)

        cls_feats = fluid.layers.concat(input=convs, axis=1)

    elif args.sub_model_type == 'gru':
        # Forward and backward GRU over the unpadded sequence, max-pooled.
        bert_seq_out = bert.get_sequence_output()
        bert_seq_out = fluid.layers.sequence_unpad(bert_seq_out, seq_len)
        gru_hidden_size = 1024
        # dynamic_gru expects an input of width 3 * hidden size.
        gru_input = fluid.layers.fc(input=bert_seq_out,
                                    size=gru_hidden_size * 3)
        gru_forward = fluid.layers.dynamic_gru(
            input=gru_input, size=gru_hidden_size, is_reverse=False)
        gru_backward = fluid.layers.dynamic_gru(
            input=gru_input, size=gru_hidden_size, is_reverse=True)
        gru_output = fluid.layers.concat([gru_forward, gru_backward], axis=1)
        cls_feats = fluid.layers.sequence_pool(
            input=gru_output, pool_type='max')

    elif args.sub_model_type == 'ffa':
        # Feed-forward attention: one score per token, normalised over the
        # sequence, used to form a weighted sum of the token representations.
        bert_seq_out = bert.get_sequence_output()
        attn = fluid.layers.fc(input=bert_seq_out,
                               num_flatten_dims=2,
                               size=1,
                               act='tanh')
        # The fc above keeps the first two dims and emits a single score, so
        # the last axis has size 1. Softmax over that default axis would turn
        # every weight into 1.0; normalise over the sequence axis instead.
        # NOTE(review): padded positions still take part in this softmax and
        # are only dropped afterwards by sequence_unpad - consider masking
        # them with input_mask.
        attn = fluid.layers.softmax(attn, axis=1)
        weighted_input = bert_seq_out * attn
        weighted_input = fluid.layers.sequence_unpad(weighted_input, seq_len)
        cls_feats = fluid.layers.sequence_pool(weighted_input, pool_type='sum')

    else:
        raise NotImplementedError("%s is not implemented!" %
                                  args.sub_model_type)

    cls_feats = fluid.layers.dropout(
        x=cls_feats,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train")

    logits = fluid.layers.fc(
        input=cls_feats,
        size=num_labels,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
    probs = fluid.layers.softmax(logits)

    if is_prediction:
        feed_targets_name = [
            src_ids.name, pos_ids.name, sent_ids.name, input_mask.name
        ]
        return pyreader, probs, feed_targets_name

    ce_loss = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels)
    loss = fluid.layers.mean(x=ce_loss)

    if args.use_fp16 and args.loss_scaling > 1.0:
        loss *= args.loss_scaling

    # `accuracy` writes the number of evaluated samples into `num_seqs`.
    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)

    return (pyreader, loss, probs, accuracy, labels, num_seqs)
def create_model(args,
                 bert_config,
                 num_labels,
                 is_prediction=False,
                 k=0,
                 n=0,
                 q=0,
                 task_name=""):
    """Build a BERT classification graph for the 'fewshot' or 'fintune' task.

    Args:
        args: options object; ``use_fp16`` is read.
        bert_config: configuration object handed to ``BertModel``.
        num_labels: output size of the classification layer.
        is_prediction: when true, return class probabilities and the names
            of the four BERT feed variables.
        k, n, q: sizes used by the 'fewshot' branch to regroup the scores as
            ``[-1, q * k, k, n, 2]`` (presumably k-way / n-shot / q queries -
            confirm with the caller).
        task_name: 'fewshot' or 'fintune'; selects how the loss is built.

    Returns:
        ``(data_loader, probs, feed_targets_name)`` when ``is_prediction``;
        ``(data_loader, loss, probs, accuracy, num_seqs)`` for the two known
        task names; ``None`` for any other ``task_name``.
    """
    input_fields = {
        'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'labels'],
        'shapes': [[None, None], [None, None], [None, None], [None, None, 1],
                   [None, 1]],
        'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'],
        'lod_levels': [0, 0, 0, 0, 0],
    }
    inputs = [
        fluid.data(name=name, shape=shape, dtype=dtype, lod_level=lod_level)
        for name, shape, dtype, lod_level in zip(
            input_fields['names'], input_fields['shapes'],
            input_fields['dtypes'], input_fields['lod_levels'])
    ]
    (src_ids, pos_ids, sent_ids, input_mask, labels) = inputs

    data_loader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                     capacity=50,
                                                     iterable=True)

    bert = BertModel(src_ids=src_ids,
                     position_ids=pos_ids,
                     sentence_ids=sent_ids,
                     input_mask=input_mask,
                     config=bert_config,
                     use_fp16=args.use_fp16)

    cls_feats = bert.get_pooled_output()
    cls_feats = fluid.layers.dropout(x=cls_feats,
                                     dropout_prob=0.1,
                                     dropout_implementation="upscale_in_train")
    hidden = fluid.layers.fc(
        input=cls_feats,
        num_flatten_dims=2,
        size=num_labels,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(name="cls_out_b",
                                  initializer=fluid.initializer.Constant(0.)))

    if is_prediction:
        # The classifier output is `hidden`; `logits` does not exist yet at
        # this point, so referencing it here raised UnboundLocalError.
        probs = fluid.layers.softmax(hidden)
        feed_targets_name = [
            src_ids.name, pos_ids.name, sent_ids.name, input_mask.name
        ]
        return data_loader, probs, feed_targets_name

    if task_name == "fewshot":
        # Regroup the scores as [-1, q*k, k, n, 2], average over the n axis,
        # keep index 1 of the last axis and flatten to rows of k scores.
        # This hard-codes a final axis of size 2.
        logits = fluid.layers.reshape(hidden, [-1, num_labels], inplace=True)
        logits = fluid.layers.reshape(logits, [-1, q * k, k, n, 2],
                                      inplace=True)
        logits = fluid.layers.reduce_mean(logits, dim=3, keep_dim=False)
        logits = logits[:, :, :, 1]
        logits = fluid.layers.reshape(logits, [-1, q * k, k], inplace=True)
        logits = fluid.layers.reshape(logits, [-1, k], inplace=True)
    elif task_name == "fintune":
        logits = fluid.layers.reshape(hidden, [-1, num_labels], inplace=True)
    else:
        # Unknown task: no loss is built (kept for backward compatibility).
        return None

    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    # `accuracy` writes the number of evaluated samples into `num_seqs`.
    num_seqs = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(input=probs,
                                     label=labels,
                                     k=1,
                                     total=num_seqs)
    return data_loader, loss, probs, accuracy, num_seqs