def scatter_multiple(input_ids, indice, update_vals):
    """Overwrite selected positions of a 2-D tensor with new values.

    The tensor is flattened, every row's positions in `indice` are shifted
    by that row's offset (row * seq_length), and `update_vals` are written
    with `tf.tensor_scatter_nd_update`.

    Args:
        input_ids: tensor whose shape is read as [batch_size, seq_length].
        indice: per-row positions to overwrite; it is added to a
            [batch_size, 1] column of row offsets.
        update_vals: replacement values, flattened in the same order as
            `indice`.

    Returns:
        Tensor reshaped to [batch_size, seq_length] with the updates applied.
    """
    shape = get_shape_list2(input_ids)
    batch_size, seq_length = shape[0], shape[1]

    # Offset of the first element of each row inside the flattened tensor.
    row_starts = tf.range(0, batch_size, dtype=tf.int32) * seq_length
    row_starts = tf.reshape(row_starts, [-1, 1])

    flat_positions = tf.reshape(indice + row_starts, [-1, 1])
    flat_ids = tf.reshape(input_ids, [batch_size * seq_length])
    flat_updates = tf.reshape(update_vals, [-1])

    scattered = tf.tensor_scatter_nd_update(flat_ids, flat_positions,
                                            flat_updates)
    return tf.reshape(scattered, [batch_size, seq_length])
def call(self, input_ids, input_mask, segment_ids):
    """Embed the inputs and run the single shared layer repeatedly.

    The same `self.layer` is applied `config.num_hidden_layers` times (at
    least once); every pass after the first re-enters the "layer" variable
    scope with `reuse=True`. The 3-D output of each pass is appended to
    `self.all_layer_outputs`.

    Returns:
        The output of the last pass in matrix form (as returned by
        `self.layer.apply`).
    """
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm that the projection belongs inside "embeddings".
    cfg = self.config
    with tf.compat.v1.variable_scope("embeddings"):
        self.embedding_layer = Embedding2()
        embedded = self.embedding_layer.apply(
            input_ids,
            segment_ids,
            cfg.initializer_range,
            cfg.vocab_size,
            cfg.embedding_size,
            cfg.type_vocab_size,
            cfg.max_position_embeddings,
            cfg.hidden_dropout_prob,
            self.use_one_hot_embeddings)
        embedded = self.embedding_projection(embedded)
        self.embedding_output = embedded
        input_shape = bc.get_shape_list2(embedded)
        batch_size, seq_length, _ = input_shape

    # The first pass creates the variables; the remaining passes reuse them.
    extra_passes = max(0, cfg.num_hidden_layers - 1)
    reuse_flags = [None] + [True] * extra_passes

    with tf.compat.v1.variable_scope("encoder"):
        self.attention_mask = bc.create_attention_mask_from_input_mask2(
            embedded, input_mask)
        hidden = bc.reshape_to_matrix(embedded)
        for reuse in reuse_flags:
            with tf.compat.v1.variable_scope("layer", reuse=reuse):
                _, hidden = self.layer.apply(
                    hidden, batch_size, seq_length, self.attention_mask)
                self.all_layer_outputs.append(
                    bc.reshape_from_matrix2(hidden, input_shape))
    return hidden
def pooling_modeling(option_name, num_classes, pooled_outputs,
                     sequence_output_3d):
    """Reduce per-segment representations to one vector and classify it.

    Args:
        option_name: one of "pooled_max", "pooled_avg", "seq_max",
            "seq_avg", "seq_max+1", "seq_avg+1". The "pooled_*" options
            reduce `pooled_outputs` over axis 1; the "seq_*" options reduce
            `sequence_output_3d` over axis 2 and then axis 1; the "+1"
            variants first pass `sequence_output_3d` through an extra Dense
            layer of the same hidden size.
        num_classes: output width of the final Dense layer.
        pooled_outputs: tensor used by the "pooled_*" options.
        sequence_output_3d: tensor used by the "seq_*" options; the "+1"
            variants unpack its shape into four dimensions.

    Returns:
        Logits produced by a Dense layer with `num_classes` units.

    Raises:
        ValueError: if `option_name` is not a supported option.
    """
    supported = ("pooled_max", "pooled_avg", "seq_max", "seq_avg",
                 "seq_max+1", "seq_avg+1")
    # Validate explicitly: an `assert` would be stripped under `python -O`
    # and the function would then fail later on an unbound variable.
    if option_name not in supported:
        raise ValueError(
            "Unknown pooling option %r; expected one of %s"
            % (option_name, ", ".join(supported)))

    def seq_max_pooling(tensor):
        return tf.reduce_max(tf.reduce_max(tensor, axis=2), axis=1)

    def seq_avg_pooling(tensor):
        return tf.reduce_mean(tf.reduce_mean(tensor, axis=2), axis=1)

    if option_name == "pooled_max":
        single_rep = tf.reduce_max(pooled_outputs, axis=1)
    elif option_name == "pooled_avg":
        single_rep = tf.reduce_mean(pooled_outputs, axis=1)
    else:
        sequence_rep = sequence_output_3d
        if option_name.endswith("+1"):
            # The four-way unpack also checks that the input has rank 4.
            _, _, _, hidden_dim = get_shape_list2(sequence_output_3d)
            sequence_rep = tf.keras.layers.Dense(
                hidden_dim, name="cls_dense")(sequence_output_3d)
        if option_name.startswith("seq_max"):
            single_rep = seq_max_pooling(sequence_rep)
        else:
            single_rep = seq_avg_pooling(sequence_rep)

    logits = tf.keras.layers.Dense(num_classes, name="cls_dense")(single_rep)
    return logits
def get_nli_ex_model_segmented(input_ids, input_mask, segment_ids):
    """Run `transformer_nli` on three windows of the input and re-join them.

    Each input is cut along axis 1 into three pieces of width 200: the first
    200 columns, the next 199 columns prefixed with the first column, and
    the remainder prefixed with the first column and right-padded with
    `PAD_ID`. The pieces are stacked along the batch axis, fed to the model,
    and `model.conf_logits` is stitched back by dropping the prefixed column
    and the padding.

    NOTE(review): the padding arithmetic hard-codes a total width of 512.
    """
    method = 5
    hp = hyperparams.HPBert()
    voca_size = 30522
    batch_size = bert_common.get_shape_list2(input_ids)[0]

    step = 200
    second_end = step * 2 - 1
    # Padding that brings the last window up to `step` columns.
    pad_len = step - 1 - (512 - second_end)

    def spread(t):
        # [batch, total] -> [3 * batch, step]
        head = t[:, :1]
        padding = tf.ones([batch_size, pad_len], tf.dtypes.int32) * PAD_ID
        first = t[:, :step]
        second = tf.concat([head, t[:, step:second_end]], axis=1)
        third = tf.concat([head, t[:, second_end:], padding], axis=1)
        return tf.concat([first, second, third], axis=0)

    def collect(t):
        # Inverse of `spread` for the model output.
        first = t[:batch_size]
        second = t[batch_size:batch_size * 2, 1:]
        third = t[batch_size * 2:, 1:-pad_len]
        return tf.concat([first, second, third], axis=1)

    model = transformer_nli(hp,
                            spread(input_ids),
                            spread(input_mask),
                            spread(segment_ids),
                            voca_size, method, False)
    return collect(model.conf_logits)
def split_and_append_sep2(input_ids, input_mask, segment_ids,
                          seq_length: int, window_length: int,
                          CLS_ID, EOW_ID):
    """Reshape flat sequences into windows and wrap each with CLS / EOW.

    Each input is reshaped to [batch_size, num_window, -1], where
    num_window = int(seq_length / (window_length - 2)). One extra column is
    then concatenated on each side of the last axis:
      * ids: `CLS_ID` in front, `EOW_ID` behind;
      * mask: ones on both sides;
      * segment ids: a copy of the first column in front and of the
        second-to-last column behind.

    NOTE(review): the trailing segment id is taken from index -2, not -1;
    confirm that is intended.

    Returns:
        (stacked_input_ids, stacked_input_mask, stacked_segment_ids)
    """
    special_tokens = 2  # CLS, SEP
    src_window_length = window_length - special_tokens
    num_window = int(seq_length / src_window_length)
    batch_size, _ = bc.get_shape_list2(input_ids)
    window_shape = [batch_size, num_window, -1]

    # [batch_size, num_window, src_window_length]
    ids_3d = tf.reshape(input_ids, window_shape)
    mask_3d = tf.reshape(input_mask, window_shape)
    segment_3d = tf.reshape(segment_ids, window_shape)

    edge_shape = [batch_size, num_window, 1]
    cls_column = tf.ones(edge_shape, tf.int32) * CLS_ID
    eow_column = tf.ones(edge_shape, tf.int32) * EOW_ID
    ids_3d = tf.concat([cls_column, ids_3d, eow_column], axis=2)

    ones_column = tf.ones(edge_shape, tf.int32)
    mask_3d = tf.concat([ones_column, mask_3d, ones_column], axis=2)

    leading_segment = segment_3d[:, :, 0:1]
    trailing_segment = segment_3d[:, :, -2:-1]
    segment_3d = tf.concat([leading_segment, segment_3d, trailing_segment],
                           axis=2)
    return ids_3d, mask_3d, segment_3d
def call(self, input_vectors, use_context):
    """Add position embeddings and run the "mid" layer stack.

    Args:
        input_vectors: tensor whose shape is unpacked as
            [batch_size, seq_length, hidden_dim].
        use_context: tensor expanded on axis 2 and multiplied into an
            all-ones [batch_size, seq_length, seq_length] int32 mask to
            form the attention mask.

    Returns:
        The matrix-form output of the last layer. Each layer's reshaped
        output is also appended to `self.all_layer_outputs`.
    """
    batch_size, seq_length, hidden_dim = bc.get_shape_list2(input_vectors)

    # Position embeddings only; token types are disabled.
    embedded = bc.embedding_postprocessor2(
        input_tensor=input_vectors,
        token_type_table=self.token_type_table,
        full_position_embeddings=self.full_position_embeddings,
        use_token_type=False,
        token_type_ids=None,
        token_type_vocab_size=1,
        use_position_embeddings=True,
        max_position_embeddings=self.config.max_num_window,
        dropout_prob=self.config.hidden_dropout_prob)

    # NOTE(review): other encoders in this file pass the full 3-D shape to
    # reshape_from_matrix2; confirm the 2-element shape is intended here.
    input_shape = [batch_size, seq_length]
    all_ones = tf.ones([batch_size, seq_length, seq_length], tf.int32)
    attention_mask = all_ones * tf.expand_dims(use_context, 2)

    with tf.compat.v1.variable_scope("mid"):
        hidden = bc.reshape_to_matrix(embedded)
        for layer_idx in range(self.n_layers):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                layer = self.layer_list[layer_idx]
                _, hidden = layer.apply(hidden, batch_size, seq_length,
                                        attention_mask)
                self.all_layer_outputs.append(
                    bc.reshape_from_matrix2(hidden, input_shape))
    return hidden
def delete_tokens(input_ids, n_trial, shift):
    """Build `n_trial` copies of the batch, each with one position masked.

    Trial i masks position `shift + i` (block size is fixed to 1) in every
    row. The copies are laid out trial-major: rows [i * batch, (i+1) * batch)
    belong to trial i.

    Returns:
        The tiled ids with `MASK_ID` written by `scatter_with_batch`.
    """
    n_block_size = 1
    locations = [
        list(range(shift + i * n_block_size, shift + (i + 1) * n_block_size))
        for i in range(n_trial)
    ]
    print(locations)

    batch_size, _ = get_shape_list2(input_ids)

    loc = tf.constant(locations, tf.int32)          # [n_trial, 1]
    loc = tf.expand_dims(loc, 0)                    # [1, n_trial, 1]
    loc = tf.tile(loc, [batch_size, 1, 1])          # [batch_size, n_trial, 1]
    loc = tf.transpose(loc, [1, 0, 2])              # [n_trial, batch, 1]
    loc = tf.reshape(loc, [batch_size * n_trial, -1])  # [n_trial * batch, 1]

    repeated_ids = tf.tile(input_ids, [n_trial, 1])
    return scatter_with_batch(repeated_ids, loc, MASK_ID)
def random_masking(input_ids, input_masks, n_sample, mask_token,
                   special_tokens=None):
    """Mask `n_sample` positions per row, chosen by top-k over random scores.

    The scores are a length-512 vector: 0 at position 0, NumPy uniform
    random values for the next 459 positions, and 0 for the rest. They are
    drawn once when this function runs, wrapped in a constant and tiled over
    the batch, so every row starts from the same scores before
    `remove_special_mask` is applied.

    Returns:
        (masked_input_ids, masked_lm_positions, masked_lm_ids,
         masked_lm_weights)
    """
    a_seg_len = 459
    scores = numpy.concatenate((
        numpy.zeros([1]),                    # first position is never drawn
        numpy.random.random(a_seg_len),
        numpy.zeros([512 - a_seg_len - 1]),
    ))

    batch_size, _ = get_shape_list2(input_ids)
    score_row = tf.expand_dims(tf.constant(scores, tf.float32), 0)
    rand = tf.tile(score_row, [batch_size, 1])
    print(rand.shape)

    if special_tokens is None:
        special_tokens = []
    rand = remove_special_mask(input_ids, input_masks, rand, special_tokens)

    _, masked_lm_positions = tf.math.top_k(
        rand, k=n_sample, sorted=False, name="masking_top_k")
    # [batch, n_samples]
    masked_lm_ids = gather_index2d(input_ids, masked_lm_positions)
    masked_lm_weights = tf.ones_like(masked_lm_positions, dtype=tf.float32)
    masked_input_ids = scatter_with_batch(input_ids, masked_lm_positions,
                                          mask_token)
    return (masked_input_ids, masked_lm_positions, masked_lm_ids,
            masked_lm_weights)
def select_value(a_size, ab_mapping, b_scores, b_items, method,
                 ab_mapping_mask):
    """Pick one item of `b_items` for each of the `a_size` groups.

    `ab_mapping` assigns each b-item to a group. Scores are scattered into
    an [a_size, b_size] table that is zero where an item does not belong to
    the group, and one column index per group is chosen from that table.

    Args:
        a_size: number of groups.
        ab_mapping: group index of each b-item (flattened, cast to int32).
        b_scores: score of each b-item (flattened).
        b_items: items to gather from; its first dimension is b_size.
        method: "max" (argmax of the score table) or "sample"
            (`categorical_sampling` after adding -10000 to unmapped
            entries).
        ab_mapping_mask: optional mask; when given, its transpose is cast to
            float32 and multiplied into the membership table.

    Returns:
        `gather(b_items, selected_idx)`.

    Raises:
        ValueError: if `method` is neither "max" nor "sample".
    """
    # Fail early; otherwise an unknown method leaves `selected_idx` unbound.
    if method not in ("max", "sample"):
        raise ValueError(
            "Unknown method %r; expected 'max' or 'sample'" % (method,))

    # [b_size]
    b_scores = tf.reshape(b_scores, [-1])
    b_size = bc.get_shape_list2(b_items)[0]
    group_of_item = tf.cast(tf.reshape(ab_mapping, [-1]), tf.int32)

    # collect_bin[i, g] == 1 iff b-item i is mapped to group g.
    indice = tf.stack([tf.range(b_size), group_of_item], 1)
    collect_bin = tf.scatter_nd(indice, tf.ones([b_size], tf.float32),
                                [b_size, a_size])
    if ab_mapping_mask is not None:
        collect_bin = collect_bin * tf.cast(tf.transpose(ab_mapping_mask),
                                            tf.float32)

    # scattered_score : [a_size, b_size]; zero where the item is not mapped.
    scattered_score = tf.transpose(tf.expand_dims(b_scores, 1) * collect_bin)

    if method == "max":
        # NOTE(review): unmapped entries score 0 here and are not penalized,
        # so they win the argmax if every mapped score is negative -
        # confirm that scores are non-negative.
        selected_idx = tf.argmax(scattered_score, axis=1)
    else:
        remover = tf.transpose(
            tf.ones([b_size, a_size]) - collect_bin) * -10000.00
        scattered_score += remover
        selected_idx = categorical_sampling(scattered_score)

    result = gather(b_items, selected_idx)  # [n_items, n_layers, hidden]
    return result
def cate():
    """Demo: draw top-k indices from a probability table mixed with uniform.

    Builds a fixed 2x5 probability table, blends it with a uniform
    distribution using weight `alpha`, multiplies by uniform random noise
    and prints the indices of the `n_sample` largest entries per row.
    Prints intermediate tensors; returns nothing.
    """
    n_sample = 3
    alpha = tf.constant(0.5)

    raw = tf.constant([[0.5, 0.5, 0.01, 0.3, 0.2],
                       [0.5, 0.5, 0.1, 0.03, 0.2]])
    prob = tf.nn.softmax(tf.math.log(raw), axis=1)

    shape = get_shape_list2(prob)
    batch_size, seq_length = shape[0], shape[1]

    rand = tf.random.uniform(prob.shape, minval=0, maxval=1,
                             dtype=tf.dtypes.float32, seed=None, name=None)

    uniform_part = tf.ones_like(prob, dtype=tf.float32) / seq_length * alpha
    model_part = prob * (1 - alpha)
    final_p = uniform_part + model_part
    print(prob)
    print(final_p)

    _, indice = tf.math.top_k(rand * final_p, k=n_sample, sorted=False,
                              name=None)
    print(indice)
def apply_3d(self, input_tensor, batch_size, seq_length, attention_mask):
    """Run `self.apply` on a non-flattened tensor and restore its shape.

    The input is flattened with `bc.reshape_to_matrix`, passed through
    `self.apply`, and the layer output is reshaped back to the input's
    original shape. The intermediate output of `self.apply` is discarded.
    """
    original_shape = bc.get_shape_list2(input_tensor)
    as_matrix = bc.reshape_to_matrix(input_tensor)
    _, layer_output = self.apply(as_matrix, batch_size, seq_length,
                                 attention_mask)
    return bc.reshape_from_matrix2(layer_output, original_shape)
def extend_input_mask(self, input_mask):
    """Append `self.topic_emb_len` columns of ones to a 2-D input mask."""
    batch_size, seq_length = bc.get_shape_list2(input_mask)
    topic_columns = tf.ones([batch_size, self.topic_emb_len], tf.int32)
    return tf.concat([input_mask, topic_columns], axis=1)
def __init__(self,
             config,
             is_training,
             input_ids,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=True,
             features=None,
             scope=None):
    """Encode a long input window-by-window and combine the windows.

    The input is split by `split_input` into windows of
    `config.max_seq_length` tokens, every window is encoded by one
    `BertModel`, and the per-window sequence outputs are merged by the
    combiner returned from `get_combiner`. The result is stored in
    `self.pooled_output`.

    `features` and `scope` are accepted but not used here.
    """
    super(MES, self).__init__()
    combiner = get_combiner(is_training, config)
    unit_length = config.max_seq_length
    d_seq_length = config.max_d_seq_length
    num_window = int(d_seq_length / unit_length)
    batch_size, _ = get_shape_list2(input_ids)

    def r3to4(arr):
        return tf.reshape(arr, [batch_size, num_window, unit_length, -1])

    def get_seq_output_3d(model_class, input_ids, input_masks, segment_ids):
        # [Batch, num_window, unit_seq_length]
        stacked_input_ids, stacked_input_mask, stacked_segment_ids = \
            split_input(input_ids, input_masks, segment_ids,
                        d_seq_length, unit_length)
        model = model_class(
            config=config,
            is_training=is_training,
            input_ids=r3to2(stacked_input_ids),
            input_mask=r3to2(stacked_input_mask),
            token_type_ids=r3to2(stacked_segment_ids),
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        # [Batch * num_window, seq_length, hidden_size]
        sequence = model.get_sequence_output()
        # [Batch, num_window, window_length, hidden_size]
        return r3to4(sequence)

    segment_ids = token_type_ids
    # [Batch, num_window, window_length, hidden_size]
    seq_output = get_seq_output_3d(BertModel, input_ids, input_mask,
                                   segment_ids)
    print(seq_output)
    self.pooled_output = combiner(seq_output)
def __init__(self,
             config,
             is_training,
             use_one_hot_embeddings=True,
             features=None,
             scope=None):
    """Zero-pad the inputs to the trained length and build two BERT models.

    Inputs are read from `features` and right-padded with zeros by
    `config.trained_seq_length - config.data_seq_length` columns. A first
    `BertModel` plus a 2-way classifier is built under
    `dual_model_prefix1` (its logits are not used afterwards); a second
    `BertModel` plus classifier provides `self.logits` and the mean sparse
    softmax cross-entropy stored in `self.loss`.
    """
    super(MES_pad, self).__init__()
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    batch_size, _ = get_shape_list2(input_ids)
    add_len = config.trained_seq_length - config.data_seq_length
    zero_pad = tf.zeros([batch_size, add_len], tf.int32)
    input_ids, input_mask, segment_ids = [
        tf.concat([t, zero_pad], axis=1)
        for t in (input_ids, input_mask, segment_ids)
    ]

    def build_bert():
        return BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )

    # [Batch, unit_seq_length]
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model = build_bert()
        pooled = model.get_pooled_output()
        logits_2d = tf.keras.layers.Dense(2, name="cls_dense")(pooled)

    # NOTE(review): the whitespace-collapsed source does not show whether
    # this second scope was live code or commented out; it is treated as
    # live here - confirm against version control before merging.
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        model = build_bert()
        logits = tf.keras.layers.Dense(2, name="cls_dense")(
            model.get_pooled_output())

    self.logits = logits
    flat_labels = tf.reshape(label_ids, [-1])
    loss_arr = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=flat_labels)
    self.loss = tf.reduce_mean(loss_arr)
def network_stacked(self, stacked_input_ids, stacked_input_mask,
                    stacked_segment_ids, use_context):
    """Encode windows with the lower module, then a chain of upper modules.

    The stacked inputs (shape unpacked as [batch_size, num_window,
    seq_length]) are flattened with `r3to2` and encoded by
    `self.lower_module`. `exchange_contexts` builds the input of the first
    upper module; after each upper module `exchange_return_context`
    prepares the input for the next one. The attention mask of the lower
    module is padded with ones for the `num_window` added positions.

    Sets `batch_size`, `num_window`, `embedding_table`, `sequence_output`,
    `all_encoder_layers` (empty), `embedding_output` and, depending on
    `self.pooling`, `pooled_output`.

    Returns:
        `self.sequence_output`: the last upper module's final layer output
        truncated to the first `self.window_size` positions.
    """
    batch_size, num_window, seq_length = bc.get_shape_list2(
        stacked_input_ids)
    self.batch_size = batch_size
    self.num_window = num_window

    self.lower_module.call(
        r3to2(stacked_input_ids),
        r3to2(stacked_input_mask),
        r3to2(stacked_segment_ids),
    )
    # [batch_size * num_window, seq_length, hidden_size]
    lower_module_last_layer = self.lower_module.all_layer_outputs[-1]
    # [batch_size * num_window, window_length + num_window, hidden_size]
    input_to_upper = exchange_contexts(batch_size, lower_module_last_layer,
                                       num_window, use_context)

    added_tokens = num_window
    attention_mask = tf.pad(self.lower_module.attention_mask,
                            [[0, 0], [0, added_tokens], [0, added_tokens]],
                            'CONSTANT',
                            constant_values=1)

    with tf.compat.v1.variable_scope("upper"):
        for upper_module in self.upper_module_list:
            self.upper_module_inputs.append(input_to_upper)
            upper_module.call(input_to_upper, attention_mask)
            middle_output = upper_module.get_last_layer_output()
            input_to_upper = exchange_return_context(
                batch_size, middle_output, self.window_size, num_window,
                use_context)

    self.embedding_table = self.lower_module.embedding_layer.embedding_table
    raw_sequence_output = self.upper_module_list[-1].all_layer_outputs[-1]
    self.sequence_output = raw_sequence_output[:, :self.window_size, :]

    # The attribute is deliberately left empty. It must not be built by
    # extending `lower_module.all_layer_outputs`: that list would be
    # aliased and the lower module's own record of its layers corrupted.
    self.all_encoder_layers = []
    self.embedding_output = self.lower_module.embedding_output

    if self.pooling == "head":
        self.pooled_output = self.head_pooling()
    elif self.pooling == "all":
        self.pooled_output = self.all_pooling()
    elif self.pooling == "none":
        pass
    return self.sequence_output
def __init__(
        self,
        config,
        use_one_hot_embeddings,
        is_training,
        masked_input_ids,
        input_mask,
        segment_ids,
        tt_input_ids,
        tt_input_mask,
        tt_segment_ids,
):
    """Run one BertModel over the LM batch and the "tt" batch together.

    The two batches are concatenated along axis 0 and encoded by a single
    `BertModel`. The rows after the first `lm_batch_size` rows (the "tt"
    part) are passed through an extra `ForwardLayer` and `mimic_pooling`;
    the result is stored in `self.tt_feature`.
    """
    self.config = config
    self.lm_batch_size, _ = get_shape_list2(masked_input_ids)
    self.tt_input_mask = tt_input_mask

    # LM rows first, tt rows second.
    joint_ids = tf.concat([masked_input_ids, tt_input_ids], axis=0)
    joint_mask = tf.concat([input_mask, tt_input_mask], axis=0)
    joint_segments = tf.concat([segment_ids, tt_segment_ids], axis=0)

    self.model = BertModel(config, is_training, joint_ids, joint_mask,
                           joint_segments, use_one_hot_embeddings)
    self.tt_layer = ForwardLayer(
        config, base.create_initializer(config.initializer_range))

    tt_seq_output = self.model.get_sequence_output()[self.lm_batch_size:]
    tt_batch_size, seq_length = get_shape_list2(tt_input_ids)
    tt_attention_mask = create_attention_mask_from_input_mask2(
        tt_seq_output, self.tt_input_mask)
    print('tt_attention_mask', tt_attention_mask.shape)
    print("seq_output", tt_seq_output.shape)

    tt_seq_output = self.tt_layer.apply_3d(
        tt_seq_output, tt_batch_size, seq_length, tt_attention_mask)
    self.tt_feature = mimic_pooling(tt_seq_output,
                                    self.config.hidden_size,
                                    self.config.initializer_range)
def call(self, input_vectors, attention_mask):
    """Run `self.n_layers` layers over already-embedded vectors.

    Layer i runs inside variable scope "layer_<i + self.layer_idx_base>".
    Each layer's output, reshaped to the input's shape, is appended to
    `self.all_layer_outputs`.

    Returns:
        The last layer's output in matrix form.
    """
    input_shape = bc.get_shape_list2(input_vectors)
    batch_size, seq_length, _ = input_shape

    hidden = bc.reshape_to_matrix(input_vectors)
    for layer_idx in range(self.n_layers):
        scope_name = "layer_%d" % (layer_idx + self.layer_idx_base)
        with tf.compat.v1.variable_scope(scope_name):
            _, hidden = self.layer_list[layer_idx].apply(
                hidden, batch_size, seq_length, attention_mask)
            self.all_layer_outputs.append(
                bc.reshape_from_matrix2(hidden, input_shape))
    return hidden
def iterate_over(query, doc, doc_mask, total_doc_len, segment_len,
                 step_size):
    """Pair the query with sliding windows of the document.

    Windows start at 0, step_size, 2 * step_size, ... while the start is
    below `total_doc_len`, and cover `segment_len` document columns. Each
    example is laid out as [CLS] query [SEP] doc_window [SEP] padding, where
    zero padding fills a window that runs past `total_doc_len`. Segment ids
    are 0 for the query part and equal to the mask for the document part.

    Returns:
        (all_input_ids, all_input_mask, all_segment_ids, n_segment), where
        the tensors concatenate all windows along axis 0.
    """
    query_mask = tf.ones_like(query, tf.int32)
    query_segments = tf.zeros_like(query, tf.int32)
    batch_size, _ = get_shape_list2(query)

    edge_shape = [batch_size, 1]
    cls_arr = tf.ones(edge_shape, tf.int32) * CLS_ID
    sep_arr = tf.ones(edge_shape, tf.int32) * SEP_ID
    edge_one = tf.ones(edge_shape, tf.int32)
    edge_zero = tf.zeros(edge_shape, tf.int32)

    ids_per_window = []
    masks_per_window = []
    segments_per_window = []
    n_segment = 0
    start = 0
    while start < total_doc_len:
        end = start + segment_len
        # Only the window that overruns the document needs padding.
        pad_len = max(end - total_doc_len, 0)
        padding = tf.zeros([batch_size, pad_len], tf.int32)

        doc_ids = tf.concat([doc[:, start:end], sep_arr, padding], axis=1)
        doc_input_mask = tf.concat(
            [doc_mask[:, start:end], edge_one, padding], axis=1)
        doc_segments = tf.ones_like(doc_ids, tf.int32) * doc_input_mask

        ids_per_window.append(
            tf.concat([cls_arr, query, sep_arr, doc_ids], axis=1))
        masks_per_window.append(
            tf.concat([edge_one, query_mask, edge_one, doc_input_mask],
                      axis=1))
        segments_per_window.append(
            tf.concat([edge_zero, query_segments, edge_zero, doc_segments],
                      axis=1))

        start += step_size
        n_segment += 1

    all_input_ids = tf.concat(ids_per_window, axis=0)
    all_input_mask = tf.concat(masks_per_window, axis=0)
    all_segment_ids = tf.concat(segments_per_window, axis=0)
    print(all_input_ids)
    return all_input_ids, all_input_mask, all_segment_ids, n_segment
def split_and_append_sep(input_ids, input_mask, segment_ids,
                         seq_length: int, window_length: int,
                         CLS_ID, EOW_ID):
    """Slice flat sequences into windows and wrap each with CLS / EOW.

    The inputs are cut along axis 1 into
    num_window = int(seq_length / (window_length - 2)) consecutive slices
    and stacked on a new axis 1. One extra column is then concatenated on
    each side of the last axis:
      * ids: `CLS_ID` in front, `EOW_ID` behind;
      * mask: ones on both sides;
      * segment ids: a copy of the first column in front and of the
        second-to-last column behind.

    NOTE(review): the trailing segment id is taken from index -2, not -1;
    confirm that is intended.

    Args:
        input_ids, input_mask, segment_ids: 2-D tensors sliced on axis 1.
        seq_length: length of the flat sequence to be windowed.
        window_length: window size including the two added tokens.
        CLS_ID: id written in front of every window.
        EOW_ID: id written behind every window.

    Returns:
        (stacked_input_ids, stacked_input_mask, stacked_segment_ids)
    """
    special_tokens = 2  # CLS, SEP
    src_window_length = window_length - special_tokens
    num_window = int(seq_length / src_window_length)

    window_input_ids_list = []
    window_input_mask_list = []
    window_segment_ids_list = []
    for window_idx in range(num_window):
        st = window_idx * src_window_length
        ed = st + src_window_length
        window_input_ids_list.append(input_ids[:, st:ed])
        window_input_mask_list.append(input_mask[:, st:ed])
        window_segment_ids_list.append(segment_ids[:, st:ed])

    # [batch_size, num_window, src_window_length]
    stacked_input_ids = tf.stack(window_input_ids_list, 1)
    stacked_input_mask = tf.stack(window_input_mask_list, 1)
    stacked_segment_ids = tf.stack(window_segment_ids_list, 1)

    batch_size, num_window, _ = bc.get_shape_list2(stacked_input_ids)
    edge_shape = [batch_size, num_window, 1]
    # Use the caller-supplied CLS id rather than a hard-coded token id.
    cls_arr = tf.ones(edge_shape, tf.int32) * CLS_ID
    eow_arr = tf.ones(edge_shape, tf.int32) * EOW_ID
    stacked_input_ids = tf.concat([cls_arr, stacked_input_ids, eow_arr],
                                  axis=2)

    mask_edge = tf.ones(edge_shape, tf.int32)
    stacked_input_mask = tf.concat(
        [mask_edge, stacked_input_mask, mask_edge], axis=2)

    edge1 = stacked_segment_ids[:, :, 0:1]
    edge2 = stacked_segment_ids[:, :, -2:-1]
    stacked_segment_ids = tf.concat([edge1, stacked_segment_ids, edge2],
                                    axis=2)
    return stacked_input_ids, stacked_input_mask, stacked_segment_ids
def candidate_gen(input_ids, input_mask, segment_ids, n_trial):
    """Create `n_trial` variants per example with a random span dropped.

    Start positions come from `draw_starting_point` (seed fixed to 0). Span
    lengths are 1 plus a sample from Geometric(0.5). `drop_middle` is
    applied with the same positions and lengths to the ids, segment ids and
    mask.

    Returns:
        (new_input_ids, new_segment_ids, new_input_mask, indice, length_arr)
    """
    seed = 0
    # draw random interval
    batch_size, input_len = get_shape_list2(input_ids)
    indice = draw_starting_point(batch_size, input_len, input_mask, n_trial,
                                 seed)
    flat_indice = tf.reshape(indice, [batch_size * n_trial])

    # The distribution has batch shape [1], hence the squeeze on axis 2.
    geo = tfp.distributions.Geometric([0.5])
    sampled_lengths = tf.cast(geo.sample(indice.shape) + 1, tf.int32)
    length_arr = tf.squeeze(sampled_lengths, 2)
    length_arr_flat = tf.reshape(length_arr, [-1])

    def drop_from(tensor):
        return drop_middle(batch_size, flat_indice, tensor, input_len,
                           length_arr_flat, n_trial)

    new_input_ids = drop_from(input_ids)
    new_segment_ids = drop_from(segment_ids)
    new_input_mask = drop_from(input_mask)
    return new_input_ids, new_segment_ids, new_input_mask, indice, length_arr
def __init__(
        self,
        config,
        use_one_hot_embeddings,
        is_training,
        masked_input_ids,
        input_mask,
        segment_ids,
        nli_input_ids,
        nli_input_mask,
        nli_segment_ids,
):
    """Encode the masked-LM batch and the NLI batch with one BertModel.

    The two batches are concatenated along axis 0 (LM rows first).
    `self.batch_size` records the number of LM rows.
    """
    self.batch_size, _ = get_shape_list2(masked_input_ids)

    joint_ids = tf.concat([masked_input_ids, nli_input_ids], axis=0)
    joint_mask = tf.concat([input_mask, nli_input_mask], axis=0)
    joint_segments = tf.concat([segment_ids, nli_segment_ids], axis=0)

    self.model = BertModel(config, is_training, joint_ids, joint_mask,
                           joint_segments, use_one_hot_embeddings)
def tlm2(bert_config, use_one_hot_embeddings, features):
    """Return `prob2 - prob1` of an IndependentLossModel over BERT output.

    A non-training `BertModel` encodes `features["input_ids"]`,
    `["input_mask"]` and `["segment_ids"]`; `IndependentLossModel` builds
    its predictions on the sequence output, and the result is
    `-(prob1 - prob2)`.
    """
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]

    # The results of these three statements are not used below.
    hp = hyperparams.HPBert()
    voca_size = 30522
    sequence_shape = bert_common.get_shape_list2(input_ids)

    encoder = BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
    )
    loss_model = IndependentLossModel(bert_config)
    loss_model.build_predictions(encoder.get_sequence_output())
    return -(loss_model.prob1 - loss_model.prob2)
def call(self, stacked_input_ids, stacked_input_mask, stacked_segment_ids,
         use_context):
    """Encode windows, append one context vector each, run the upper module.

    The last position of every window's lower-module output is treated as
    that window's vector. `self.mid_layers` turns the window vectors into
    context vectors, and one context vector is appended to each window's
    sequence before the upper module runs. The lower attention mask is
    padded with ones for the added position.

    Returns:
        `self.sequence_output`: the upper module's last layer output.
    """
    self.lower_module.call(
        r3to2(stacked_input_ids),
        r3to2(stacked_input_mask),
        r3to2(stacked_segment_ids),
    )
    # [batch_size * num_window, seq_length, hidden_size]
    lower_last = self.lower_module.all_layer_outputs[-1]

    batch_size, num_window, seq_length = bc.get_shape_list2(
        stacked_input_ids)
    window_vectors = tf.reshape(lower_last[:, -1, :],
                                [batch_size, num_window, -1])

    # [batch_size, num_window, hidden_size]
    context_vectors = self.mid_layers.call(window_vectors, use_context)
    context_vectors = tf.reshape(context_vectors,
                                 [batch_size * num_window, 1, -1])
    upper_input = tf.concat([lower_last, context_vectors], axis=1)

    added_tokens = 1
    paddings = [[0, 0], [0, added_tokens], [0, added_tokens]]
    attention_mask = tf.pad(self.lower_module.attention_mask, paddings,
                            'CONSTANT', constant_values=1)

    with tf.compat.v1.variable_scope("upper"):
        self.upper_module.call(upper_input, attention_mask)

    self.embedding_table = self.lower_module.embedding_layer.embedding_table
    self.sequence_output = self.upper_module.all_layer_outputs[-1]
    # The combined list is computed and then immediately replaced by an
    # empty list, so the attribute ends up empty.
    self.all_encoder_layers = (self.lower_module.all_layer_outputs
                               + self.upper_module.all_layer_outputs)
    self.all_encoder_layers = []
    self.embedding_output = self.lower_module.embedding_output
    return self.sequence_output
def call(self, input_ids, input_mask, segment_ids, topic_ids):
    """Encode tokens followed by `self.topic_emb_len` topic vectors.

    Token embeddings are concatenated on axis 1 with topic embeddings
    looked up from `self.topic_embedding`, and the input mask is extended
    to match. After the layer stack the topic positions are cut off again.

    Sets `embedding_output`, `topic_tensor`, `attention_mask`,
    `embedding_table`, `sequence_output` and `pooled_output`.

    Returns:
        `self.sequence_output`: the last layer output without the trailing
        topic positions.
    """
    # NOTE(review): nesting was reconstructed from a whitespace-collapsed
    # source; confirm which statements belong to each variable scope.
    with tf.compat.v1.variable_scope("embeddings"):
        self.embedding_layer = Embedding(self.config,
                                         self.use_one_hot_embeddings)
        token_tensor = self.embedding_layer.apply(input_ids, segment_ids)
        self.embedding_output = token_tensor

    extended_mask = self.extend_input_mask(input_mask)
    topic_lookup, _ = bc.embedding_lookup2(topic_ids,
                                           self.n_topics,
                                           self.topic_embedding,
                                           self.topic_embedding_size,
                                           self.use_one_hot_embeddings)
    self.topic_tensor = tf.reshape(
        topic_lookup, [-1, self.topic_emb_len, self.hidden_size])
    joint_tensor = tf.concat([token_tensor, self.topic_tensor], axis=1)

    input_shape = bc.get_shape_list2(joint_tensor)
    batch_size, seq_length, _ = input_shape

    with tf.compat.v1.variable_scope("encoder"):
        self.attention_mask = bc.create_attention_mask_from_input_mask2(
            joint_tensor, extended_mask)
        hidden = bc.reshape_to_matrix(joint_tensor)
        for layer_idx in range(self.n_layers):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                _, hidden = self.layer_list[layer_idx].apply(
                    hidden, batch_size, seq_length, self.attention_mask)
                final_output = bc.reshape_from_matrix2(hidden, input_shape)
                self.all_layer_outputs.append(final_output)

    self.embedding_table = self.embedding_layer.embedding_table
    # Drop the topic positions appended after the tokens.
    self.sequence_output = final_output[:, :-self.topic_emb_len]
    self.pooled_output = mimic_pooling(self.sequence_output,
                                       self.config.hidden_size,
                                       self.config.initializer_range)
    return self.sequence_output
def call(self, stacked_input_ids, stacked_input_mask, stacked_segment_ids,
         use_context):
    """Encode windows with lower, mid and upper modules, sharing context.

    `exchange_contexts` builds the input of the mid layers from the lower
    module's last layer, and `exchange_return_context` builds the input of
    the upper module from the mid layers' last output. The lower attention
    mask is padded with ones for the `num_window` added positions and used
    by both the mid and the upper module.

    Returns:
        `self.sequence_output`: the upper module's last layer output
        truncated to the first `self.window_size` positions.
    """
    batch_size, num_window, seq_length = bc.get_shape_list2(
        stacked_input_ids)

    self.lower_module.call(
        r3to2(stacked_input_ids),
        r3to2(stacked_input_mask),
        r3to2(stacked_segment_ids),
    )
    # [batch_size * num_window, seq_length, hidden_size]
    lower_last = self.lower_module.all_layer_outputs[-1]
    # [batch_size * num_window, window_length + num_window, hidden_size]
    mid_input = exchange_contexts(batch_size, lower_last, num_window,
                                  use_context)

    added_tokens = num_window
    paddings = [[0, 0], [0, added_tokens], [0, added_tokens]]
    attention_mask = tf.pad(self.lower_module.attention_mask, paddings,
                            'CONSTANT', constant_values=1)

    with tf.compat.v1.variable_scope("mid"):
        self.mid_layers.call(mid_input, attention_mask)
        middle_output = self.mid_layers.get_last_layer_output()

    upper_input = exchange_return_context(batch_size, middle_output,
                                          self.window_size, num_window,
                                          use_context)
    with tf.compat.v1.variable_scope("upper"):
        self.upper_module.call(upper_input, attention_mask)

    self.embedding_table = self.lower_module.embedding_layer.embedding_table
    raw_sequence_output = self.upper_module.all_layer_outputs[-1]
    self.sequence_output = raw_sequence_output[:, :self.window_size, :]
    # The combined list is computed and then immediately replaced by an
    # empty list, so the attribute ends up empty.
    self.all_encoder_layers = (self.lower_module.all_layer_outputs
                               + self.upper_module.all_layer_outputs)
    self.all_encoder_layers = []
    self.embedding_output = self.lower_module.embedding_output
    return self.sequence_output
def call(self, input_ids, input_mask, segment_ids):
    """Encode the input alongside column embeddings with a shared layer.

    The last `num_column_tokens * num_columns` positions of every input are
    cut off before embedding. The embedded sequence and the column
    embeddings are wrapped as `Tensor2D` objects and passed through
    `self.forward` `self.num_layers` times under one "layer" scope (reused
    after the first pass). Finally the main tensor is zero-padded back to
    the original length.

    Returns:
        `self.sequence_output`, zero-padded on axis 1 by the number of
        positions removed at the start.
    """
    n_added_tokens = self.num_column_tokens * self.num_columns
    input_ids, input_mask, segment_ids = [
        t[:, :-n_added_tokens]
        for t in (input_ids, input_mask, segment_ids)
    ]

    cfg = self.config
    embedded = self.embedding_layer.apply(
        input_ids,
        segment_ids,
        cfg.initializer_range,
        cfg.vocab_size,
        cfg.embedding_size,
        cfg.type_vocab_size,
        cfg.max_position_embeddings,
        cfg.hidden_dropout_prob,
        self.use_one_hot_embeddings)
    self.embedding_output = embedded
    # [batch_size, seq_len, hidden_dim]
    projected = self.embedding_projector(embedded)
    batch_size, _, _ = get_shape_list2(projected)

    raw_tensors = [projected] + self.get_column_embeddings(batch_size)
    tensor_list = [Tensor2D(t) for t in raw_tensors]
    to_tensor_mask = self.get_to_tensor_mask(batch_size, input_mask)

    for layer_no in range(self.num_layers):
        with tf.compat.v1.variable_scope("layer", reuse=layer_no > 0):
            tensor_list = self.forward(tensor_list, to_tensor_mask)
            self.all_raw_layers.append(tensor_list)
            self.all_main_layers.append(tensor_list[0])

    self.embedding_table = self.embedding_layer.embedding_table
    main_output = self.all_main_layers[-1].get_3d()
    self.sequence_output = main_output
    # Restore the original sequence length with zero vectors.
    tail = tf.zeros([batch_size, n_added_tokens, self.config.hidden_size])
    self.sequence_output = tf.concat([self.sequence_output, tail], axis=1)
    self.pooled_output = mimic_pooling(self.sequence_output,
                                       self.config.hidden_size,
                                       self.config.initializer_range)
    return self.sequence_output
def call(self, input_ids, input_mask, segment_ids):
    """Embed the inputs and run the "lower" layer stack.

    Sets `embedding_layer`, `embedding_output` and `attention_mask`, and
    appends every layer's reshaped output to `self.all_layer_outputs`.

    Returns:
        The last layer's output in matrix form.
    """
    with tf.compat.v1.variable_scope("embeddings"):
        self.embedding_layer = Embedding(self.config,
                                         self.use_one_hot_embeddings)
        embedded = self.embedding_layer.apply(input_ids, segment_ids)
        self.embedding_output = embedded
        input_shape = bc.get_shape_list2(embedded)
        batch_size, seq_length, _ = input_shape

    with tf.compat.v1.variable_scope("lower"):
        self.attention_mask = bc.create_attention_mask_from_input_mask2(
            embedded, input_mask)
        hidden = bc.reshape_to_matrix(embedded)
        for layer_idx in range(self.n_layers):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                _, hidden = self.layer_list[layer_idx].apply(
                    hidden, batch_size, seq_length, self.attention_mask)
                self.all_layer_outputs.append(
                    bc.reshape_from_matrix2(hidden, input_shape))
    return hidden
def network_stacked(self, stacked_input_ids, stacked_input_mask,
                    stacked_segment_ids, use_context):
    """Encode each window with the lower module and pool with the combiner.

    The stacked inputs are flattened with `r3to2` and encoded. The last
    layer output is reshaped to [batch_size, num_window, seq_length, -1]
    and passed to `self.combine_model` to produce `self.pooled_output`.
    `use_context` is accepted but not used here.

    Returns:
        `self.sequence_output`: the reshaped last layer output.
    """
    batch_size, num_window, seq_length = bc.get_shape_list2(
        stacked_input_ids)

    flat_inputs = [r3to2(t) for t in (stacked_input_ids,
                                      stacked_input_mask,
                                      stacked_segment_ids)]
    self.lower_module.call(*flat_inputs)

    # [batch_size * num_window, seq_length, hidden_size]
    last_layer = self.lower_module.all_layer_outputs[-1]
    last_layer = tf.reshape(last_layer,
                            [batch_size, num_window, seq_length, -1])

    self.pooled_output = self.combine_model.call(last_layer)
    print(self.pooled_output)

    self.embedding_table = self.lower_module.embedding_layer.embedding_table
    self.sequence_output = last_layer
    self.all_encoder_layers = self.lower_module.all_layer_outputs
    self.embedding_output = self.lower_module.embedding_output
    return self.sequence_output
def get_dummy_next_sentence_labels(input_ids):
    """Return an all-zero int64 label tensor of shape [batch_size, 1].

    `batch_size` is the first dimension of `input_ids`.
    """
    batch_size = bert_common.get_shape_list2(input_ids)[0]
    return tf.zeros([batch_size, 1], tf.int64)
def attention_layer(from_tensor: Tensor2D,
                    to_tensor_list: List[Tensor2D],
                    query_ff,
                    key_ff,
                    value_ff,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    attention_probs_dropout_prob=0.0):
    """Multi-head attention from one tensor to several concatenated tensors.

    Queries come from `from_tensor`; keys and values are computed for each
    tensor in `to_tensor_list` and concatenated along the "to" sequence
    axis.

    Args:
        from_tensor: source of the queries; provides `.matrix`,
            `.batch_size` and `.seq_length`.
        to_tensor_list: tensors attended to, with the same attributes.
        query_ff, key_ff, value_ff: callables applied to `.matrix` to
            produce the query / key / value projections.
        attention_mask: optional mask; expanded on axis 1 and turned into
            an additive -10000 penalty where it is 0.
        num_attention_heads: number of heads N.
        size_per_head: width H of each head.
        attention_probs_dropout_prob: currently unused (dropout on the
            attention probabilities is disabled, see TODO below).

    Returns:
        Context tensor reshaped to [B * F, N * H].

    Raises:
        ValueError: if a tensor in `to_tensor_list` has a different rank
            than `from_tensor`.
    """

    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        reshaped = tf.reshape(
            input_tensor,
            [batch_size, seq_length, num_attention_heads, width],
            name="reshape_transpose_for_scores")
        return tf.transpose(a=reshaped, perm=[0, 2, 1, 3])

    from_rank = len(get_shape_list2(from_tensor.matrix))
    for to_tensor in to_tensor_list:
        to_rank = len(get_shape_list2(to_tensor.matrix))
        if from_rank != to_rank:
            raise ValueError(
                "The rank of `from_tensor` must match the rank of `to_tensor`."
            )

    # [B*F, N*H] -> [B, N, F, H]
    queries = transpose_for_scores(query_ff(from_tensor.matrix),
                                   from_tensor.batch_size,
                                   num_attention_heads,
                                   from_tensor.seq_length,
                                   size_per_head)

    keys_per_tensor = []
    values_per_tensor = []
    for to_tensor in to_tensor_list:
        # [B*T, N*H] -> [B, N, T, H]
        keys = transpose_for_scores(key_ff(to_tensor.matrix),
                                    to_tensor.batch_size,
                                    num_attention_heads,
                                    to_tensor.seq_length,
                                    size_per_head)
        keys_per_tensor.append(keys)

        # [B*T, N*H] -> [B, T, N, H] -> [B, N, T, H]
        values = tf.reshape(value_ff(to_tensor.matrix), [
            to_tensor.batch_size, to_tensor.seq_length,
            num_attention_heads, size_per_head
        ], name="value_reshape")
        values = tf.transpose(a=values, perm=[0, 2, 1, 3])
        values_per_tensor.append(values)

    all_keys = tf.concat(keys_per_tensor, axis=2)
    all_values = tf.concat(values_per_tensor, axis=2)

    # Scaled dot product between queries and keys: [B, N, F, T]
    attention_scores = tf.matmul(queries, all_keys, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        # 0.0 where attention is allowed, -10000.0 where it is masked.
        # Adding this before the softmax effectively removes the masked
        # positions.
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
        attention_scores += adder

    # Normalize the scores to probabilities: [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores)

    # TODO restore this
    # attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # [B, N, F, H] -> [B, F, N, H] -> [B*F, N*H]
    context = tf.matmul(attention_probs, all_values)
    context = tf.transpose(a=context, perm=[0, 2, 1, 3])
    context = tf.reshape(context, [
        from_tensor.batch_size * from_tensor.seq_length,
        num_attention_heads * size_per_head
    ])
    return context