def total_parameters_transformer(share_parameter_across_layers):
    """Build a transformer on an all-zero input and report its parameter count.

    The input shape and model sizes come from the module-level names
    ``batch_size``, ``sequence_length``, ``hidden_size`` and
    ``num_attention_heads``.

    Args:
        share_parameter_across_layers: forwarded unchanged to
            ``transformer_model``.

    Returns:
        Whatever ``get_total_parameters()`` returns. The original version
        only printed this value; returning it as well lets callers compare
        the shared and non-shared configurations.
    """
    input_tensor = tf.zeros((batch_size, sequence_length, hidden_size),
                            dtype=tf.float32)
    print("transformer_model. input:", input_tensor)

    transformer_result = transformer_model(
        input_tensor,
        hidden_size=hidden_size,
        num_attention_heads=num_attention_heads,
        share_parameter_across_layers=share_parameter_across_layers)
    print("transformer_result:", transformer_result)

    total_parameters = get_total_parameters()
    # The label used to read "not share" unconditionally, which mislabelled
    # the count whenever parameter sharing was switched on.
    sharing_label = 'share' if share_parameter_across_layers else 'not share'
    print('total_parameters(%s):' % sharing_label, total_parameters)
    return total_parameters
def build_encoder(self, features):
    """Embed the entity sequence and run it through the encoder transformer.

    Args:
        features: mapping that must provide 'sequence' and 'attention_mask'.

    Returns:
        A tuple ``(sequence_output, cls_vector, attention_weights)`` where
        ``sequence_output`` and ``attention_weights`` are the pair produced
        by the last encoder layer and ``cls_vector`` is the slice of
        ``sequence_output`` at position 0 along axis 1 (kept as a length-1
        axis).
    """
    hp = self.hparams

    with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
        entity_ids = features['sequence']  # [batch, seq_len=128]

        # One embedding per entity type (Point, Line, Segment, Halfplane, ...).
        entity_embedding, _ = modeling.embedding_lookup(
            input_ids=entity_ids,
            vocab_size=hp.entity_num_type,
            embedding_size=hp.hidden_size,
            initializer_range=hp.initializer_range,
            word_embedding_name='entity_type_embedding',
        )  # [batch, seq_len, hid_size]

        # Add a "type" marking which objects in the sequence belong to the
        # problem state and which one is the goal object.
        encoder_input = modeling.embedding_postprocessor(
            input_tensor=entity_embedding,
            sequence_ids=entity_ids,
            hparams=hp)  # [batch, seq_len, hid_size]

    # Feed the sequence to the encoder transformer together with its
    # attention mask.
    with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
        # [batch, seq_len, seq_len]
        binary_mask = dec_to_bin_att_mask(features['attention_mask'])
        encoder_kwargs = dict(
            input_tensor=encoder_input,  # [batch, seq_len, hid_size]
            attention_mask=binary_mask,  # [batch, seq_len, seq_len]
            hidden_size=hp.hidden_size,
            num_hidden_layers=hp.num_encode_layers,
            num_attention_heads=hp.num_attention_heads,
            intermediate_size=hp.intermediate_size,
            intermediate_act_fn=modeling.get_activation(hp.hidden_act),
            hidden_dropout_prob=hp.dropout_prob,
            attention_probs_dropout_prob=hp.dropout_prob,
            initializer_range=hp.initializer_range,
            do_return_all_layers=True,
            attention_top_k=hp.attention_top_k,
            densify_attention_mask=hp.densify_attention_mask,
        )
        encoder_layers = modeling.transformer_model(**encoder_kwargs)

    # Each layer entry is an (output, attention_weights) pair; keep the last.
    last_output, attention_weights = encoder_layers[-1]  # [batch seq_len hid_size]
    cls_vector = last_output[:, 0:1, :]  # [batch 1 hid_size]
    return last_output, cls_vector, attention_weights
def transformer(input_q, input_d, len_q, len_d):
    """Encode query and document inputs with two separate transformers.

    Uses the transformer code from Google BERT. Each input gets its own
    variable scope ("embed_q" / "embed_d"), so no weights are shared
    between the two towers.

    Args:
        input_q: query input tensor handed to ``transformer_model``.
        input_d: document input tensor handed to ``transformer_model``.
        len_q: query lengths, turned into a mask via ``tf.sequence_mask``.
        len_d: document lengths, turned into a mask via ``tf.sequence_mask``.

    Returns:
        ``(embed_q, embed_d)``: the last-layer output of each tower.
    """

    def _encode(scope_name, inputs, lengths):
        # Build the padding mask, expand it to an attention mask and run the
        # fixed-size transformer stack inside its own variable scope.
        with tf.variable_scope(scope_name):
            padding_mask = tf.cast(tf.sequence_mask(lengths), tf.float32)
            attention_mask = create_attention_mask_from_input_mask(
                from_tensor=inputs, to_mask=padding_mask)
            layer_outputs = transformer_model(
                input_tensor=inputs,
                attention_mask=attention_mask,
                hidden_size=64,
                num_hidden_layers=4,
                num_attention_heads=2,
                intermediate_size=128,
                intermediate_act_fn=gelu,
                hidden_dropout_prob=0.1,
                attention_probs_dropout_prob=0.1,
                initializer_range=0.02,
                do_return_all_layers=True)
            return layer_outputs[-1]

    # Query first, then document, matching the original graph-build order.
    embed_q = _encode("embed_q", input_q, len_q)
    embed_d = _encode("embed_d", input_d, len_d)
    return embed_q, embed_d
def __init__(self, config, input_embedding, attention_mask):
    """Run the transformer encoder over pre-computed input embeddings.

    Args:
        config: object exposing ``hidden_size``, ``num_hidden_layers``,
            ``num_attention_heads``, ``intermediate_size``, ``hidden_act``,
            ``hidden_dropout_prob``, ``attention_probs_dropout_prob`` and
            ``initializer_range``.
        input_embedding: tensor passed as ``input_tensor`` to
            ``modeling.transformer_model``.
        attention_mask: attention mask passed through unchanged.

    Sets:
        self.sequence_output: output of the last encoder layer.
    """
    # The "bert" / "encoder" scopes keep variable names the same as BERT.
    with tf.variable_scope("bert"), tf.variable_scope("encoder"):
        encoder_kwargs = dict(
            input_tensor=input_embedding,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=modeling.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True,
        )
        layer_outputs = modeling.transformer_model(**encoder_kwargs)

    self.sequence_output = layer_outputs[-1]
def feed_neural_work(self):
    """Build the encoder, pooling, output layer, loss and accuracy ops.

    All inputs and hyper-parameters are read from attributes of ``self``.
    The results are stored on ``self`` as ``all_encoder_layers``,
    ``context_bias``, ``sequence_output``, ``pooled_output``, ``scores``,
    ``predictions``, ``l2_loss``, ``loss`` and ``accuracy``.

    When ``'adding_problem'`` is not in ``self.dataset`` the head is a
    softmax classifier; otherwise it is a squared-error regressor whose
    "accuracy" is the fraction of predictions within 0.04 of the target.

    Raises:
        ValueError: if ``self.transformer_ret_pooling`` is not one of
            "mean", "last" or "max". The previous code printed a message
            and called ``exit(0)``, which reported success to the caller's
            shell for what is a configuration error.
    """
    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    self.all_encoder_layers, self.context_bias = modeling.transformer_model(
        self.embedded_chars_q,
        attention_mask=self.attention_mask,
        hidden_size=self.config.hidden_size,
        num_hidden_layers=self.config.num_hidden_layers,
        num_attention_heads=self.config.num_attention_heads,
        intermediate_size=self.config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(self.config.hidden_act),
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        initializer_range=self.config.initializer_range,
        do_return_all_layers=True,
        t5_relative_bias=self.t5_att_bias)
    self.sequence_output = self.all_encoder_layers[-1]

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, hidden_size], giving a fixed-size representation of the
    # segment for the output layer below.
    with tf.variable_scope("pooler"):
        pooling = self.transformer_ret_pooling
        if pooling == "mean":
            # Build the sum once and reuse it for both the debug print and
            # the pooled output (the original created the op twice).
            summed = tf.reduce_sum(self.sequence_output, axis=1)
            print('self.seq_lent:', self.seq_lent)
            print('tf.reduce_sum(self.sequence_output,axis=1):', summed)
            # NOTE(review): this is a mean only if self.seq_lent holds the
            # reciprocal sequence lengths -- confirm where it is defined.
            self.pooled_output = summed * self.seq_lent
        elif pooling == "last":
            self.pooled_output = self.sequence_output[:, -1, :]
        elif pooling == "max":
            self.pooled_output = tf.reduce_max(self.sequence_output, axis=1)
        else:
            raise ValueError(
                'wrong transformer_ret_pooling: %r' % (pooling,))

        # NOTE(review): the source was whitespace-collapsed, so it is not
        # certain that this step sits inside the "pooler" scope. Check the
        # layer-norm variable names against existing checkpoints.
        if 'adding_problem' not in self.dataset:
            # Dropout followed by layer norm on the pooled output.
            self.pooled_output = modeling.layer_norm(
                tf.nn.dropout(self.pooled_output,
                              keep_prob=1.0 - self.input_dropout_prob))

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[self.config.hidden_size, self.max_input_right],
            initializer=initializer())
        b = tf.Variable(tf.constant(0.1, shape=[self.max_input_right]),
                        name="b")
        # Only W is regularised; the bias is left out.
        l2_loss = tf.constant(0.0)
        l2_loss += tf.nn.l2_loss(W)
        self.scores = tf.nn.xw_plus_b(self.pooled_output, W, b,
                                      name="scores")
        print(self.scores)
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    if 'adding_problem' not in self.dataset:
        # Classification: mean cross-entropy plus the weighted L2 term.
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.l2_loss = l2_loss * self.l2_reg_lambda
            self.loss = tf.reduce_mean(losses) + self.l2_loss

        # Accuracy: predicted class equals the arg-max of the label row.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")
    else:
        # Regression: squared error against the target, with the L2 term
        # additionally scaled down by 1e-3.
        with tf.name_scope("loss"):
            losses = tf.nn.l2_loss(
                self.scores - tf.expand_dims(self.input_y, -1))
            print('losses:', losses)
            self.l2_loss = self.l2_reg_lambda * l2_loss
            self.loss = tf.reduce_mean(losses) + self.l2_loss * 1e-3

        # A prediction counts as correct when it is within 0.04 of the
        # target value.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.less_equal(
                tf.abs(self.scores[:, 0] - self.input_y),
                tf.constant([0.04]))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")
def body(self, features):
    """Encode the problem, then either decode greedily or compute losses.

    Args:
        features: mapping with the encoder inputs consumed by
            ``build_encoder``. When it also contains 'targets', the keys
            'theorem' and 'depth' are read as well and teacher-forced
            training losses are produced; otherwise
            ``greedy_decode_8steps`` is used.

    Returns:
        ``(logits, losses)``: two dicts. On the training path ``logits``
        holds theorem/premise logits and labels and ``losses`` holds
        'training', 'theorem_loss' and 'premise_loss'. On the decoding path
        they are whatever ``greedy_decode_8steps`` returns, with an
        'attention_weights' entry added to ``logits``.
    """
    hp = self.hparams
    if not self.is_training:
        # Mutates self.hparams in place; build_encoder reads it afterwards.
        hp.dropout_prob = 0.0

    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        # attention_weights: [batch, n_head, from_len, to_len]
        sequence_output, cls_vector, attention_weights = self.build_encoder(
            features)

    if 'targets' not in features:
        assert self.hparams.dropout_prob == 0.0
        logits, losses = self.greedy_decode_8steps(cls_vector,
                                                   sequence_output)
        # Expose the attention paid from position 0 to every position.
        logits.update(attention_weights=attention_weights[:, :, 0, :])
        return logits, losses

    # NOTE(review): the source was whitespace-collapsed; the nesting of the
    # 'embeddings', 'transformer' and 'prediction' scopes inside 'decoder'
    # is a reconstruction -- check variable names against checkpoints.
    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
            # [batch, premise_len=8]; stored under the 'targets' key.
            premise_ids = features['targets']
            # [batch, premise_len, hid_size]
            premise_vecs = premise_gather_nd(sequence_output, premise_ids)

            batch_size = tf.shape(premise_ids)[0]
            premise_len = premise_ids.shape.as_list()[-1]

            theorem_ids = features['theorem']  # batch, 1
            # [batch, 1, hid_size] and [num_theorems, hid_size]
            theorem_vec, _ = modeling.embedding_lookup(
                input_ids=theorem_ids,  # [batch, 1]
                vocab_size=hp.num_theorems,
                embedding_size=hp.hidden_size,
                initializer_range=hp.initializer_range,
                word_embedding_name='theorem_embedding',
            )

            # Not used below; the lookup is kept so a missing 'depth' key
            # still fails here exactly as before.
            _unused_depth = features['depth']  # batch, 1

            # Teacher forcing: CLS, the theorem, then every premise except
            # the last.
            decoder_pieces = [
                cls_vector,  # [batch, 1, hid_size]
                theorem_vec,  # [batch, 1, hid_size]
                premise_vecs[:, :-1, :],  # [batch, premise_len-1, hid_size]
            ]
            # [batch, premise_len + 1, hid_size]
            decoder_input = tf.concat(decoder_pieces, axis=1)

            decode_length = decoder_input.shape.as_list()[1]
            assert decode_length == premise_len + 1

            # [decode_length, hid_size]
            pos_embedding, _ = modeling.embedding_lookup(
                input_ids=tf.range(decode_length),  # [decode_length]
                vocab_size=hp.max_premise,  # >= premise_len
                embedding_size=hp.hidden_size,
                initializer_range=hp.initializer_range,
                word_embedding_name='positional_embedding',
            )
            # [1, decode_length, hid_size] so it broadcasts over the batch.
            pos_embedding = tf.reshape(
                pos_embedding, [1, decode_length, hp.hidden_size])

            # [batch, decode_length, hid_size]
            decoder_input = modeling.layer_norm_and_dropout(
                decoder_input + pos_embedding, hp.dropout_prob)

        with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
            # [1, decode_length, decode_length]
            causal_attention_mask = (
                t2t_model.common_layers.ones_matrix_band_part(
                    rows=decode_length,
                    cols=decode_length,
                    num_lower=-1,  # attend to everything before
                    num_upper=0,  # attend to nothing after
                    out_shape=[1, decode_length, decode_length]))
            # [batch, decode_length, decode_length]
            causal_attention_mask = tf.tile(causal_attention_mask,
                                            [batch_size, 1, 1])

            decoder_kwargs = dict(
                input_tensor=decoder_input,
                attention_mask=causal_attention_mask,
                hidden_size=hp.hidden_size,
                num_hidden_layers=hp.num_decode_layers,
                num_attention_heads=hp.num_attention_heads,
                intermediate_size=hp.intermediate_size,
                intermediate_act_fn=modeling.get_activation(hp.hidden_act),
                hidden_dropout_prob=hp.dropout_prob,
                attention_probs_dropout_prob=hp.dropout_prob,
                initializer_range=hp.initializer_range,
                do_return_all_layers=True,
                attention_top_k=hp.attention_top_k,
            )
            decoder_layers = modeling.transformer_model(**decoder_kwargs)

            # [batch, dec_len, hid_size]
            decoder_output, _ = decoder_layers[-1]
            theorem_feature = decoder_output[:, 0, :]  # [batch, hid_size]
            # [batch, tar_len, hid_size]
            premise_feature = decoder_output[:, 1:, :]

        with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
            theorem_head = tf.keras.layers.Dense(
                name='theorem',
                units=hp.num_theorems,
                use_bias=True,
                kernel_initializer=modeling.create_initializer(
                    hp.initializer_range))
            theorem_logits = theorem_head(theorem_feature)  # [batch, num_theorems]

            # Score every premise slot against every encoder position.
            premise_logits = tf.matmul(
                a=premise_feature,  # [batch, premise_len, hid_size]
                b=sequence_output,  # [batch, sequence_len, hid_size]
                transpose_b=True,
            )  # [batch, premise_len, sequence_len]

            # [batch * premise_len, sequence_len]
            seq_len = premise_logits.shape.as_list()[-1]
            premise_logits = tf.reshape(premise_logits, [-1, seq_len])

            # Premise slots whose id is not > 0 get zero loss weight.
            premise_weights = tf.cast(premise_ids > 0,
                                      tf.float32)  # [batch, prem_len]
            premise_weights = tf.reshape(premise_weights,
                                         [-1])  # [batch * prem_len]
            premise_ids = tf.reshape(premise_ids,
                                     [-1, 1])  # [batch * prem_len, 1]

    theorem_loss = tf.losses.sparse_softmax_cross_entropy(
        labels=theorem_ids,  # [batch, 1]
        logits=theorem_logits,  # [batch, num_theorems]
    )
    premise_loss = tf.losses.sparse_softmax_cross_entropy(
        labels=premise_ids,  # [batch * premise_len, 1]
        logits=premise_logits,  # [batch * premise_len, sequence_len]
        weights=premise_weights,  # [batch * premise_len]
    )

    logits = dict(theorem_logits=theorem_logits,
                  theorem_labels=theorem_ids,
                  premise_logits=premise_logits,
                  premise_labels=premise_ids)
    losses = dict(training=theorem_loss + premise_loss,
                  theorem_loss=theorem_loss,
                  premise_loss=premise_loss)
    return logits, losses
def create_model(bert_config, is_training, input_ids, input_mask,
                 segment_ids, use_one_hot_embeddings):
    """Build BERT, a small transformer on top of it and a linear output layer.

    The original linear SQuAD start/end head has been replaced: BERT's
    final hidden states are flattened, passed through a much smaller
    transformer and then projected to 30000 logits.

    Args:
        bert_config: configuration handed to ``modeling.BertModel``.
        is_training: forwarded to ``modeling.BertModel``.
        input_ids: forwarded to ``modeling.BertModel``.
        input_mask: forwarded to ``modeling.BertModel``.
        segment_ids: forwarded as ``token_type_ids``.
        use_one_hot_embeddings: forwarded to ``modeling.BertModel``.

    Returns:
        ``(ids, transformer_outputs)``. Both are returned because both are
        used to compute the loss. Despite its name, ``ids`` is the maximum
        logit value along axis 0 (``tf.reduce_max``), not an index.
    """
    # BERT is presumably instantiated according to this config.
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # BERT's final layer.
    final_hidden = bert.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        final_hidden, expected_rank=3)
    flat_hidden = tf.reshape(final_hidden,
                             [batch_size * seq_length, hidden_size])

    # Transformer layer, with far lower capacity than the one inside BERT.
    # NOTE(review): the reference BERT `transformer_model` expects a rank-3
    # input whose width equals `hidden_size`, and a hidden size divisible by
    # the head count; here the input is rank 2 and hidden_size=5 with 2
    # heads -- confirm the local `modeling` accepts this.
    transformer_outputs = modeling.transformer_model(
        input_tensor=flat_hidden,
        attention_mask=None,
        hidden_size=5,
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=20,
        intermediate_act_fn=modeling.gelu,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        do_return_all_layers=False)  # currently only False is supported

    # Linear layer.
    # "/" separates scopes, so "cls/squad/output_weights" names the variable
    # `output_weights` in scope `squad` inside scope `cls`. tf.get_variable
    # creates the variable, or fetches the existing one when the enclosing
    # variable scope has reuse enabled.
    output_weights = tf.get_variable(
        "cls/squad/output_weights", [30000, 5],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/squad/output_bias", [30000],
        initializer=tf.zeros_initializer())

    projected = tf.matmul(transformer_outputs, output_weights,
                          transpose_b=True)
    logits = tf.nn.bias_add(projected, output_bias)

    # max
    ids = tf.reduce_max(logits, axis=0)

    # Return both the transformer tensor and the ids; the loss needs both.
    return (ids, transformer_outputs)
def feed_neural_work(self):
    """Build the encoder, pooling, output layer, loss and accuracy ops.

    All inputs and hyper-parameters are read from attributes of ``self``.
    The results are stored on ``self`` as ``all_encoder_layers``,
    ``context_bias``, ``sequence_output``, ``pooled_output``, ``scores``,
    ``predictions``, ``l2_loss``, ``loss`` and ``accuracy``.

    ``self.l2_loss`` is computed and stored but is not added to
    ``self.loss`` in either branch.

    Raises:
        ValueError: if ``self.transformer_ret_pooling`` is not one of
            "mean", "last" or "max". The previous code printed a message
            and called ``exit(0)``, which reported success to the caller's
            shell for what is a configuration error.
    """
    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    self.all_encoder_layers, self.context_bias = modeling.transformer_model(
        self.embedded_chars_q,
        attention_mask=self.attention_mask,
        hidden_size=self.config.hidden_size,
        num_hidden_layers=self.config.num_hidden_layers,
        num_attention_heads=self.config.num_attention_heads,
        intermediate_size=self.config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(self.config.hidden_act),
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        initializer_range=self.config.initializer_range,
        do_return_all_layers=True,
        t5_relative_bias=self.t5_att_bias)
    self.sequence_output = self.all_encoder_layers[-1]

    # Reduce [batch_size, seq_length, hidden_size] to one vector per example.
    with tf.variable_scope("pooler"):
        pooling = self.transformer_ret_pooling
        if pooling == "mean":
            # Build the sum once and reuse it for both the debug print and
            # the pooled output (the original created the op twice).
            summed = tf.reduce_sum(self.sequence_output, axis=1)
            print('self.seq_lent:', self.seq_lent)
            print('tf.reduce_sum(self.sequence_output,axis=1):', summed)
            # NOTE(review): this is a mean only if self.seq_lent holds the
            # reciprocal sequence lengths -- confirm where it is defined.
            self.pooled_output = summed * self.seq_lent
        elif pooling == "last":
            self.pooled_output = self.sequence_output[:, -1, :]
        elif pooling == "max":
            self.pooled_output = tf.reduce_max(self.sequence_output, axis=1)
        else:
            raise ValueError(
                'wrong transformer_ret_pooling: %r' % (pooling,))

        # NOTE(review): the source was whitespace-collapsed, so it is not
        # certain that this step sits inside the "pooler" scope. Check the
        # layer-norm variable names against existing checkpoints.
        if 'adding_problem' not in self.dataset:
            # Dropout followed by layer norm on the pooled output.
            self.pooled_output = modeling.layer_norm(
                tf.nn.dropout(self.pooled_output,
                              keep_prob=1.0 - self.input_dropout_prob))

    # Final (unnormalized) scores and predictions
    with tf.name_scope("output"):
        W = tf.get_variable(
            "W",
            shape=[self.config.hidden_size, self.max_input_right],
            initializer=initializer(),
        )
        b = tf.Variable(tf.constant(0.1, shape=[self.max_input_right]),
                        name="b")
        # Only W is regularised; the bias is left out.
        l2_loss = tf.constant(0.0)
        l2_loss += tf.nn.l2_loss(W)
        self.scores = tf.nn.xw_plus_b(self.pooled_output, W, b,
                                      name="scores")
        self.predictions = tf.argmax(self.scores, 1, name="predictions")

    if 'adding_problem' not in self.dataset:
        # Classification: mean cross-entropy; the L2 term is tracked only.
        with tf.name_scope("loss"):
            self.l2_loss = self.l2_reg_lambda * l2_loss
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Accuracy: predicted class equals the arg-max of the label row.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")
    else:
        # Regression: squared error against the target; the L2 term is
        # tracked only.
        with tf.name_scope("loss"):
            self.l2_loss = self.l2_reg_lambda * l2_loss
            losses = tf.nn.l2_loss(
                self.scores - tf.expand_dims(self.input_y, -1))
            print('losses:', losses)
            self.loss = tf.reduce_mean(losses)

        # A prediction counts as correct when it is within 0.04 of the
        # target value.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.less_equal(
                tf.abs(self.scores[:, 0] - self.input_y),
                tf.constant([0.04]))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, "float"), name="accuracy")
def main(args):
    """Compare the standard BERT transformer with the effective transformer.

    Builds random inputs with random per-example lengths, runs both
    implementations with identical weights, prints the largest and average
    absolute difference over the valid (unpadded) tokens, and then times
    both.

    Args:
        args: parsed arguments providing ``config``, ``batch_size``,
            ``avg_seq_length``, ``max_seq_length`` and ``precision``.
    """
    bert_config = modeling.BertConfig.from_json_file(args.config)
    # Dropout is disabled so both implementations are deterministic.
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    avg_seq_len = args.avg_seq_length
    max_seq_len = args.max_seq_length
    hidden_size = bert_config.hidden_size
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    # fake input array length
    input_len = np.random.randint(low=2 * avg_seq_len - max_seq_len,
                                  high=max_seq_len + 1,
                                  size=(batch_size),
                                  dtype=np.int32)

    # fake input id and mask
    input_ids = np.random.randint(low=0,
                                  high=bert_config.vocab_size,
                                  size=(batch_size, max_seq_len),
                                  dtype=np.int32)
    input_mask = np.zeros((batch_size, max_seq_len), dtype=np.int32)
    for b_idx, s_len in enumerate(input_len):
        input_mask[b_idx][:s_len] = 1
    input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

    # fake embedding output
    embed_output = np.random.randn(batch_size, max_seq_len, hidden_size)
    input_tensor = tf.convert_to_tensor(embed_output, dtype=tf_dtype)

    # keep attention_mask for compatible reason: every query row of an
    # example carries that example's padding mask.
    att_mask = np.tile(input_mask, max_seq_len)
    att_mask = att_mask.reshape(batch_size, max_seq_len, max_seq_len)
    attention_mask = tf.convert_to_tensor(att_mask, dtype=tf_dtype)

    # input info
    valid_word_num = sum(input_len)
    print("Valid word num : {}/{}, avg sequence length : {:.6} ".format(
        valid_word_num, batch_size * max_seq_len,
        valid_word_num / batch_size))

    # bert with standard transformer
    std_bert = modeling.transformer_model(
        input_tensor=input_tensor,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=False)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
    with tf.Session(config=config) as sess:
        # init weights
        sess.run(tf.global_variables_initializer())

        # get transformer weights
        all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        transformer_vars = [v for v in all_vars
                            if v.name.startswith('layer')]
        weights_value = sess.run(transformer_vars)

        # bert with effective transformer
        et_bert = effective_transformer.get_sequence_output(
            max_batch_size=batch_size,
            max_seq_length=max_seq_len,
            config=bert_config,
            attention_mask=attention_mask,
            input_mask=input_mask_tensor,
            from_tensor=input_tensor,
            weights_value=weights_value,
        )

        # diff
        # Flatten to one row per token. The width comes from the config;
        # it was hard-coded to 768, which broke the row indexing below for
        # any model whose hidden size is not 768.
        val1 = sess.run(std_bert).reshape(-1, hidden_size)
        val2 = sess.run(et_bert).reshape(-1, hidden_size)
        diff = []
        for b_idx, s_len in enumerate(input_len):
            row_start = b_idx * max_seq_len
            # Only the first s_len tokens of each example are valid.
            for w_idx in range(s_len):
                idx = row_start + w_idx
                diff.append(np.fabs(val1[idx] - val2[idx]).max())
        print("max diff : {:.6}, avg diff : {:.6}.".format(
            max(diff), sum(diff) / len(diff)))

        def time_inference(output_tensor):
            """Return the mean wall-clock time of one run, in milliseconds."""
            iter_num = 128
            # warm up
            for _ in range(10):
                sess.run(output_tensor)
            beg = datetime.now()
            for _ in range(iter_num):
                sess.run(output_tensor)
            end = datetime.now()
            return (end - beg).total_seconds() * 1000 / iter_num  # ms

        print("xla cost : {:.6} ms".format(time_inference(std_bert)))
        print("et  cost : {:.6} ms".format(time_inference(et_bert)))
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 mix_number=None,
                 cluster_size=None,
                 hidden_size=None,
                 is_training=True,
                 groups=None,
                 expansion=None,
                 drop_rate=None,
                 gating_reduction=None,
                 **unused_params):
    """Transformer encoder followed by a mixture of NeXtVLAD branches.

    The frame features are first passed through a BERT-style encoder, then
    sampled, and finally fed to ``mix_number`` NeXtVLAD branches whose
    logits are combined with learned softmax mixture weights.

    Args:
        model_input: frame-level features; channels ``[0:1024]`` are used
            as video features and ``[1024:]`` as audio features.
        vocab_size: forwarded to ``self.nextvlad_model``.
        num_frames: per-example frame counts.
        mix_number, cluster_size, hidden_size, groups, expansion, drop_rate,
            gating_reduction: optional overrides; a falsy value falls back
            to the corresponding ``FLAGS`` setting.
        is_training: enables dropout, batch-norm training mode and the
            regularization loss.
        **unused_params: forwarded to ``self.nextvlad_model``.

    Returns:
        ``{"predictions": pred}`` when not training. When training the dict
        also holds "regularization_loss" and "aux_predictions".
    """
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)

    # Work on a copy of the loaded config so the overrides below stay local.
    config = copy.deepcopy(
        modeling.BertConfig.from_json_file(FLAGS.bert_config_file))
    config.num_hidden_layers = FLAGS.bert_hidden_layer
    config.num_attention_heads = FLAGS.bert_attention_heads
    dropout_prob = FLAGS.bert_dropout_prob
    if not is_training:
        dropout_prob = 0.0
    config.hidden_dropout_prob = dropout_prob
    config.attention_probs_dropout_prob = dropout_prob

    # NOTE(review): the source was whitespace-collapsed; only the
    # transformer call is assumed to sit inside the "encoder" scope --
    # check variable names against existing checkpoints.
    with tf.variable_scope("encoder"):
        self.all_encoder_layers = modeling.transformer_model(
            input_tensor=model_input,
            attention_mask=None,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=modeling.get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)
    model_input = self.all_encoder_layers[-1]

    sampler = (utils.SampleRandomFrames
               if FLAGS.sample_random_frames else utils.SampleRandomSequence)
    model_input = sampler(model_input, num_frames, FLAGS.iterations)

    # Explicit arguments win; falsy values fall back to the flags.
    cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
    hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
    gating_reduction = gating_reduction or FLAGS.gating_reduction
    groups = groups or FLAGS.groups
    drop_rate = drop_rate or FLAGS.drop_rate
    mix_number = mix_number or FLAGS.mix_number
    expansion = expansion or FLAGS.expansion

    max_frames = model_input.get_shape().as_list()[1]
    mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

    # Mixture weights are predicted from the per-frame feature mean.
    ftr_mean = tf.reduce_mean(model_input, axis=-1)
    ftr_mean = slim.batch_norm(ftr_mean,
                               center=True,
                               scale=True,
                               fused=True,
                               is_training=is_training,
                               scope="mix_weights_bn")
    mix_weights = slim.fully_connected(
        ftr_mean,
        mix_number,
        activation_fn=None,
        weights_initializer=slim.variance_scaling_initializer(),
        scope="mix_weights")
    mix_weights = tf.nn.softmax(mix_weights, axis=-1)
    tf.summary.histogram("mix_weights", mix_weights)

    # One NeXtVLAD model per mixture component, each in its own scope.
    branch_results = []
    for branch in range(mix_number):
        with tf.variable_scope("branch_%d" % branch):
            branch_results.append(
                self.nextvlad_model(video_ftr=model_input[:, :, 0:1024],
                                    audio_ftr=model_input[:, :, 1024:],
                                    vocab_size=vocab_size,
                                    max_frames=max_frames,
                                    cluster_size=cluster_size,
                                    groups=groups,
                                    expansion=expansion,
                                    drop_rate=drop_rate,
                                    hidden1_size=hidden1_size,
                                    is_training=is_training,
                                    gating_reduction=gating_reduction,
                                    mask=mask,
                                    **unused_params))

    aux_preds = [out["predictions"] for out in branch_results]
    logits = tf.stack([out["logits"] for out in branch_results], axis=1)

    weighted_logits = tf.multiply(tf.expand_dims(mix_weights, -1), logits)
    mix_logit = tf.reduce_sum(weighted_logits, axis=1)
    pred = tf.nn.sigmoid(mix_logit)

    if not is_training:
        return {"predictions": pred}

    # Regularizer: KL divergence between the temperature-softened mixture
    # distribution and each branch's softened distribution.
    rank_pred = tf.expand_dims(
        tf.nn.softmax(tf.div(mix_logit, FLAGS.cl_temperature), axis=-1),
        axis=1)
    aux_rank_preds = tf.nn.softmax(tf.div(logits, FLAGS.cl_temperature),
                                   axis=-1)
    epsilon = 1e-8  # keeps the logarithms finite
    log_mix = tf.log(rank_pred + epsilon)
    log_aux = tf.log(aux_rank_preds + epsilon)
    kl_loss = tf.reduce_sum(rank_pred * (log_mix - log_aux), axis=-1)

    regularization_loss = FLAGS.cl_lambda * tf.reduce_mean(
        tf.reduce_sum(kl_loss, axis=-1), axis=-1)

    return {
        "predictions": pred,
        "regularization_loss": regularization_loss,
        "aux_predictions": aux_preds
    }
def create_model(bert_config, is_training, input_ids, input_mask,
                 segment_ids, use_one_hot_embeddings):
    """Build BERT with an experimental SQuAD head selected by local flags.

    The head variant is chosen by the hard-coded ``use_*`` switches below;
    as written only the CNN variant is active.

    Args:
        bert_config: configuration handed to ``modeling.BertModel``.
        is_training: forwarded to ``modeling.BertModel``.
        input_ids: forwarded to ``modeling.BertModel``.
        input_mask: forwarded to ``modeling.BertModel``.
        segment_ids: forwarded as ``token_type_ids``.
        use_one_hot_embeddings: forwarded to ``modeling.BertModel``.

    Returns:
        ``(start_logits, end_logits)``: the two slices obtained by
        unstacking the ``[2, batch_size, seq_length]`` logits along axis 0.
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    final_hidden = model.get_sequence_output()
    batch_size, seq_length, hidden_size = modeling.get_shape_list(
        final_hidden, expected_rank=3)
    num_tokens = batch_size * seq_length

    # Head selection switches.
    use_cnn = True
    use_attention = False
    use_transformer = False
    use_dense = False  # never read below

    if use_cnn:
        # View each hidden vector as a square, 3-channel "image".
        nb_channels = 3
        width = int(math.sqrt(hidden_size / nb_channels))
        height = width
        plane = width * height

        output_weights = tf.get_variable(
            "cls/squad/output_weights", [2, plane],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        filters = tf.get_variable(
            "cls/squad/filters", [3, 3, 3, 1],
            initializer=tf.truncated_normal_initializer(stddev=0.02))

        final_hidden_matrix = tf.reshape(final_hidden,
                                         [num_tokens, width, width, 3])
        final_hidden_matrix = tf.nn.conv2d(input=final_hidden_matrix,
                                           filter=filters,
                                           strides=[1, 1, 1, 1],
                                           padding="SAME")
        final_hidden_matrix = tf.reshape(final_hidden_matrix,
                                         [batch_size, seq_length, plane])

        with tf.variable_scope("attention_after_conv2D"):
            # NOTE(review): the attention output is discarded, so this
            # only adds ops/variables to the graph -- confirm intent.
            modeling.attention_layer(final_hidden_matrix,
                                     final_hidden_matrix)

        final_hidden_matrix = tf.reshape(final_hidden_matrix,
                                         [num_tokens, plane])
        logits = tf.matmul(final_hidden_matrix, output_weights,
                           transpose_b=True)

    if use_attention:
        # NOTE(review): as above, these attention outputs are not used.
        for layer_idx in (1, 2, 3):
            with tf.variable_scope("layer_custom_%d" % layer_idx):
                modeling.attention_layer(final_hidden, final_hidden)

    if use_transformer:
        with tf.variable_scope("custom_transformer"):
            final_hidden = modeling.transformer_model(
                final_hidden, num_hidden_layers=3, num_attention_heads=3)

    if use_attention or use_transformer:
        # Plain linear head over the (possibly transformed) hidden states.
        output_weights = tf.get_variable(
            "cls/squad/output_weights", [2, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        final_hidden_matrix = tf.reshape(final_hidden,
                                         [num_tokens, hidden_size])
        logits = tf.matmul(final_hidden_matrix, output_weights,
                           transpose_b=True)

    output_bias = tf.get_variable(
        "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())
    logits = tf.nn.bias_add(logits, output_bias)

    # [tokens, 2] -> [batch, seq, 2] -> [2, batch, seq], then split.
    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    logits = tf.transpose(logits, [2, 0, 1])
    start_logits, end_logits = tf.unstack(logits, axis=0)[:2]
    return (start_logits, end_logits)