def create_mask(inputs, state, equal_window):
    """Build the additive attention mask that hides future sequence positions.

    Args:
        inputs: inputs tensor of shape [B, N, D].
        state: optional tensor of shape [B, M, D], CompressedMemoryState, or a
            list/tuple whose ith entry is the ith layer's state. For a
            list/tuple, the entry with the largest `_memory_size` is used.
        equal_window: if True, each activation gets an equally-sized attention
            window of length 'M'. This only makes sense if a state is given.

    Returns:
        Float tensor of shape [1, 1, N, N + M], to be summed with logits.
    """
    dtype = inputs.dtype
    chunk_size = inputs.get_shape().as_list()[1]
    mask = future_mask(chunk_size, dtype)

    if state is not None:
        if isinstance(state, (tuple, list)):
            # Per-layer states: size the mask for the layer with most memory.
            memory_sizes = [_memory_size(layer_state) for layer_state in state]
            state = state[np.argmax(memory_sizes)]
        mem_size = _memory_size(state)
        # Memory columns are always visible, so they contribute zeros.
        memory_columns = tf.zeros([1, 1, chunk_size, mem_size], dtype=dtype)
        mask = tf.concat([memory_columns, mask], 3)

    if not equal_window:
        return mask

    # Strictly-lower-triangular penalty added onto the first `chunk_size`
    # columns, so later rows lose their oldest columns.
    ones = tf.ones([chunk_size, chunk_size], dtype=dtype)
    diagonal = tf.cast(tf.matrix_band_part(ones, 0, 0), dtype=dtype)
    lower = tf.cast(tf.matrix_band_part(ones, -1, 0), dtype=dtype)
    strictly_lower = tf.reshape(lower - diagonal,
                                [1, 1, chunk_size, chunk_size])
    start_mask = strictly_lower * -1e6
    head = mask[:, :, :, :chunk_size] + start_mask
    tail = mask[:, :, :, chunk_size:]
    return tf.concat([head, tail], 3)
def get_mask_init():
    """Return stacked strictly-lower and strictly-upper triangular masks.

    Reads `n_1x1_heads`, `n_channels` and `dtype` from the enclosing scope.

    Returns:
        Tensor of shape [2, n_1x1_heads, n_channels, n_channels]; index 0 is
        the strictly-lower mask and index 1 the strictly-upper mask.
    """
    ones = tf.ones([n_1x1_heads, n_channels, n_channels], dtype=dtype)

    def strict_band(num_lower, num_upper):
        # Keep the requested triangle, then remove the main diagonal from it.
        triangle = tf.matrix_band_part(ones, num_lower, num_upper)
        return triangle - tf.matrix_band_part(ones, 0, 0)

    l_mask = strict_band(-1, 0)
    u_mask = strict_band(0, -1)
    return tf.stack([l_mask, u_mask], axis=0)
def future_mask(chunk_size, dtype):
    """Create an additive mask so that element i cannot attend to j > i.

    Args:
        chunk_size: side length N of the square mask.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape [1, 1, N, N] holding -1e6 strictly above the diagonal
        and 0 elsewhere (self-attention stays allowed).
    """
    ones = tf.ones([chunk_size, chunk_size], dtype=dtype)
    upper_with_diagonal = tf.matrix_band_part(ones, 0, -1)
    diagonal = tf.matrix_band_part(ones, 0, 0)
    strictly_upper = upper_with_diagonal - diagonal
    # Leading singleton axes let the mask broadcast over [B, H, N, N] logits.
    return -1e6 * tf.reshape(strictly_upper, [1, 1, chunk_size, chunk_size])
def _create_mask(qlen, mlen, dtype=tf.float32, same_length=False):
    """Create a causal attention mask over memory and query positions.

    Args:
        qlen: number of query positions.
        mlen: number of memory positions placed before the query columns.
        dtype: dtype of the returned mask.
        same_length: if True, the strictly-lower triangle is additionally
            added onto the first `qlen` columns of the mask.

    Returns:
        Tensor of shape [qlen, mlen + qlen]. Memory columns are 0 and entries
        strictly above the query diagonal are 1 (presumably 1 means
        "blocked"; confirm against the caller).
    """
    ones = tf.ones([qlen, qlen], dtype=dtype)
    upper = tf.matrix_band_part(ones, 0, -1)
    diagonal = tf.matrix_band_part(ones, 0, 0)
    memory_columns = tf.zeros([qlen, mlen], dtype=dtype)
    mask = tf.concat([memory_columns, upper - diagonal], 1)
    if not same_length:
        return mask

    lower = tf.matrix_band_part(ones, -1, 0)
    head = mask[:, :qlen] + lower - diagonal
    return tf.concat([head, mask[:, qlen:]], 1)
def __call__(self, inputs_BxIxH, inputs_padding_BxI, targets_BxTxH, training, targets_start=None):
    """Run the decoder stack over right-shifted targets.

    Args:
        inputs_BxIxH: tensor handed to every decoder layer as the attended
            inputs; its dtype is also used for the causal bias.
        inputs_padding_BxI: padding indicator for the inputs; it is scaled by
            the most negative value of its own dtype to form an attention
            bias (presumably 1 at padded positions; verify against caller).
        targets_BxTxH: target tensor; it is shifted right by one step with a
            zero frame before being fed to the decoder.
        training: forwarded to dropout and to every decoder layer.
        targets_start: forwarded to `TimingSignal` as its second argument.

    Returns:
        `self._preprocess_layer` applied to the last decoder layer's output.
    """
    # Bias that suppresses attention to padded input positions.
    padding_bias_BxI = inputs_padding_BxI * inputs_padding_BxI.dtype.min
    inputs_attention_bias_Bx1x1xI = tf.expand_dims(
        tf.expand_dims(padding_bias_BxI, 1), 1)

    # Bias that suppresses attention to "future" targets, so they cannot
    # leak into predictions when the loss covers the whole target matrix.
    targets_len = tf.shape(targets_BxTxH)[1]
    ones_TxT = tf.ones((targets_len, targets_len), dtype=inputs_BxIxH.dtype)
    upper_triangular_TxT = 1 - tf.matrix_band_part(ones_TxT, -1, 0)
    decoder_self_attention_bias_1x1xTxT = tf.expand_dims(
        tf.expand_dims(upper_triangular_TxT, 0), 0) * inputs_BxIxH.dtype.min

    # Shift right: prepend a zero step on the time axis and drop the last one.
    padded_targets = tf.pad(targets_BxTxH, [[0, 0], [1, 0], [0, 0]])
    x_BxTxH = padded_targets[:, :-1, :]
    x_BxTxH = TimingSignal()(x_BxTxH, targets_start)
    x_BxTxH = tf.layers.dropout(
        x_BxTxH, self._postprocess_dropout, training=training)

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        for i, decoder_layer in enumerate(self._decoder_layers):
            with tf.variable_scope("layer_%d" % i):
                x_BxTxH = decoder_layer(
                    x_BxTxH,
                    decoder_self_attention_bias_1x1xTxT,
                    inputs_BxIxH,
                    inputs_attention_bias_Bx1x1xI,
                    training)

    return self._preprocess_layer(x_BxTxH)
def _create_mask(qlen, mlen, same_length=False):
    """Create a causal attention mask over memory and query positions.

    Args:
        qlen: number of query positions.
        mlen: number of memory positions placed before the query columns.
        same_length: if True, the strictly-lower triangle is additionally
            added onto the first `qlen` columns of the mask.

    Returns:
        Tensor of shape [qlen, mlen + qlen] in `tf.ones`' default dtype.
        Memory columns are 0 and entries strictly above the query diagonal
        are 1 (presumably 1 means "blocked"; confirm against the caller).
    """
    attn_mask = tf.ones([qlen, qlen])
    mask_u = tf.linalg.band_part(attn_mask, 0, -1)
    mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
    attn_mask_pad = tf.zeros([qlen, mlen])
    ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
    if same_length:
        # Use tf.linalg.band_part like the calls above; the old
        # tf.matrix_band_part alias is not available in TF2.
        mask_l = tf.linalg.band_part(attn_mask, -1, 0)
        ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
    return ret
def cindex_score(y_true, y_pred):
    """Compute a concordance-index style score between labels and predictions.

    Pairs (i, j) in the lower triangle with y_true[i] > y_true[j] are counted.
    Each such pair scores 1 when y_pred[i] > y_pred[j], 0.5 when the two
    predictions are equal, and 0 otherwise.

    Args:
        y_true: ground-truth values (assumed to be a 1-D tensor so the
            pairwise difference is a square matrix; TODO confirm with caller).
        y_pred: predicted values, with the same layout as `y_true`.

    Returns:
        Scalar tensor: the summed pair scores divided by the number of
        counted pairs, or 0.0 when the summed pair scores are zero.
    """
    pred_diff = tf.subtract(tf.expand_dims(y_pred, -1), y_pred)
    # Explicit element-wise ops: the Python `==` operator on a graph-mode
    # TF1 tensor is an identity check and would silently drop the tie credit.
    tie_credit = tf.cast(tf.equal(pred_diff, 0.0), tf.float32) * 0.5
    win_credit = tf.cast(tf.greater(pred_diff, 0.0), tf.float32)
    pair_score = tie_credit + win_credit

    true_diff = tf.subtract(tf.expand_dims(y_true, -1), y_true)
    comparable = tf.cast(tf.greater(true_diff, 0.0), tf.float32)
    # Restrict to the lower triangle of the pair matrix.
    comparable = tf.matrix_band_part(comparable, -1, 0)

    numerator = tf.reduce_sum(tf.multiply(pair_score, comparable))
    denominator = tf.reduce_sum(comparable)
    # A zero numerator (including the no-comparable-pairs case) yields 0.0
    # instead of a possible 0/0.
    return tf.where(tf.equal(numerator, 0), 0.0, numerator / denominator)
def compute_attention_mask(token_mask, input_mask):
    """Compute a causal token mask, optionally prefixed by the input mask.

    Args:
        token_mask: tensor whose first two dimensions give `batch_size` and
            `num_tokens`; only its shape is read here.
        input_mask: optional tensor that is expanded on axis 1 and tiled
            `num_tokens` times, or None.

    Returns:
        int32 lower-triangular tensor [batch_size, num_tokens, num_tokens]
        when `input_mask` is None; otherwise the tiled input mask
        concatenated in front of it along the last axis.
    """
    batch_size = tensor_utils.shape(token_mask, 0)
    num_tokens = tensor_utils.shape(token_mask, 1)

    all_pairs = tf.ones([batch_size, num_tokens, num_tokens], dtype=tf.int32)
    # Each token may attend to itself and to earlier tokens only.
    token_to_token = tf.matrix_band_part(all_pairs, -1, 0)
    if input_mask is None:
        return token_to_token

    # Every token row receives the same copy of the input mask.
    token_to_input = tf.tile(
        tf.expand_dims(input_mask, 1), [1, num_tokens, 1])
    return tf.concat([token_to_input, token_to_token], axis=-1)
def max_scoring_span(start_scores, end_scores, max_length, no_answer_bias=0):
    """Compute max scoring span, using the sum of start and end scores.

    Args:
        start_scores: <float32> [batch_size, seq_len]
        end_scores: <float32> [batch_size, seq_len]
        max_length: <int32> Max answer length.
        no_answer_bias: <float32> Log-odds threshold for "no-answer"
            selection. I.e. if log p(span=i,j)/p(span=NULL) > no_answer_bias,
            then select i, j as the span, and NULL otherwise.

    Returns:
        start: <int32> [batch_size]
        end: <int32> [batch_size]
        span_scores: <float32> [batch_size] score of the selected span.
    """
    # Penalty for spans with end < start or longer than `max_length`. It must
    # dwarf any real score difference so those spans are never selected.
    invalid_span_penalty = 1e4

    # Dense [seq_len] vector that is zero except for the bias at position 0.
    seq_len = tensor_utils.shape(start_scores, -1)
    no_answer_bias = tf.scatter_nd([[0]], [no_answer_bias], [seq_len])
    no_answer_bias = tf.cast(no_answer_bias, tf.float32)

    # Half the bias goes on each of the start and end scores at position 0,
    # so the (0, 0) span receives the full bias.
    no_answer_bias = tf.div(no_answer_bias, 2)
    start_scores += tf.expand_dims(no_answer_bias, 0)
    end_scores += tf.expand_dims(no_answer_bias, 0)

    # Outer sum: scores[b, i, j] = start[b, i] + end[b, j].
    scores = tf.expand_dims(start_scores, 2) + tf.expand_dims(end_scores, 1)
    # mask is 1 outside the band i <= j <= i + max_length - 1.
    mask = 1 - tf.matrix_band_part(tf.ones_like(scores), 0, max_length - 1)
    scores -= mask * invalid_span_penalty

    def _best_span(span_matrix):
        # Arg-max over the flattened matrix, mapped back to (i, j).
        flattened = tf.reshape(span_matrix, [-1])
        argmax = tf.argmax(flattened, output_type=tf.int32)
        indices = tensor_utils.unravel_index_2d(argmax, span_matrix.shape)
        return indices, flattened[argmax]

    # Return i, j indices of the max-scoring entry for every batch element.
    with tf.device("/cpu"):
        endpoints, span_scores = tf.map_fn(
            fn=_best_span, elems=scores, dtype=(tf.int32, tf.float32))
    start = endpoints[:, 0]
    end = endpoints[:, 1]
    return start, end, span_scores
def __call__(self, inputs_BxTxH, training, targets_start=None):
    """TransformerDecoderOnly call operator.

    Args:
        inputs_BxTxH: a 3d-tensor representing decoder inputs; during
            training essentially the right-shifted decoder targets.
        training: bool indicating whether we are in training mode.
        targets_start: starting index for adding position information.

    Returns:
        3d-tensor of shape [B, T, H] representing decoder outputs.
    """
    # Causal bias: keeps "future" targets from creeping into predictions
    # when the loss is computed over the entire targets matrix.
    targets_len = tf.shape(inputs_BxTxH)[1]
    ones_TxT = tf.ones((targets_len, targets_len), dtype=tf.float32)
    # For targets_len == 3 this is:
    #   [[0., 1., 1.],
    #    [0., 0., 1.],
    #    [0., 0., 0.]]
    upper_triangular_TxT = 1 - tf.matrix_band_part(ones_TxT, -1, 0)
    # Ones become tf.float32.min (the most negative finite float32, acting
    # as -inf in the softmax); zeros stay zero.
    masked_attention_bias_1x1xTxT = tf.expand_dims(
        tf.expand_dims(upper_triangular_TxT, 0), 0) * tf.float32.min

    # No left-padding here: inputs_BxTxH already starts with a special token.
    x_BxTxH = TimingSignal()(inputs_BxTxH, targets_start)
    x_BxTxH = tf.layers.dropout(
        x_BxTxH, self._postprocess_dropout, training=training)

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        for i, decoder_layer in enumerate(self._decoder_layers):
            with tf.variable_scope("layer_%d" % i):
                x_BxTxH = decoder_layer(
                    x_BxTxH, masked_attention_bias_1x1xTxT, training)

    return self._preprocess_layer(x_BxTxH)
def __init__(self, posts, **kwargs):
    """Build a full-covariance posterior on top of the factorised one.

    `FactorisedPosterior.__init__` runs first; this constructor then adds
    the off-diagonal terms. One PxP variable is created per vertex, but only
    its strictly-lower-triangular entries are used. The diagonal comes from
    `self.std`.

    Args:
        posts: forwarded to `FactorisedPosterior.__init__`.
        **kwargs: forwarded to `FactorisedPosterior.__init__`. Read here:
            init: optional `(mean, cov)` pair; when truthy, the off-diagonal
                variable starts from the Cholesky decomposition of `cov`,
                otherwise from zeros.
            suppress_nan: when true (the default), NaN entries of the
                off-diagonal variable are replaced by zeros.
    """
    FactorisedPosterior.__init__(self, posts, **kwargs)

    init_posterior = kwargs.get("init", None)
    if init_posterior:
        # Starting from an existing posterior: its covariance's Cholesky
        # decomposition supplies the initial off-diagonal terms.
        self.log.info(" - Initializing posterior covariance from input posterior")
        _mean, cov = init_posterior
        covar_init = tf.cholesky(cov)
    else:
        covar_init = tf.zeros(
            [self.nvertices, self.nparams, self.nparams], dtype=tf.float32)

    base_vars = tf.Variable(
        covar_init, validate_shape=False,
        name='%s_off_diag_vars' % self.name)
    self.off_diag_vars_base = self.log_tf(base_vars)

    if kwargs.get("suppress_nan", True):
        base = self.off_diag_vars_base
        self.off_diag_vars = tf.where(
            tf.is_nan(base), tf.zeros_like(base), base)
    else:
        self.off_diag_vars = self.off_diag_vars_base

    # Keep the lower triangle and zero its diagonal: strictly-lower part only.
    lower_triangle = tf.matrix_band_part(self.off_diag_vars, -1, 0)
    zero_diagonal = tf.zeros([self.nvertices, self.nparams])
    self.off_diag_cov_chol = tf.matrix_set_diag(
        lower_triangle, zero_diagonal,
        name='%s_off_diag_cov_chol' % self.name)

    # Combine diagonal and off-diagonal elements into the full factor.
    self.cov_chol = tf.add(
        tf.matrix_diag(self.std), self.off_diag_cov_chol,
        name='%s_cov_chol' % self.name)

    # The covariance is formed as transpose(cov_chol) @ cov_chol.
    chol_transposed = tf.transpose(self.cov_chol, perm=(0, 2, 1))
    self.cov = tf.matmul(
        chol_transposed, self.cov_chol, name='%s_cov' % self.name)

    self.cov_chol = self.log_tf(self.cov_chol)
    self.cov = self.log_tf(self.cov)
def group_v2_deconv_decoder(latent_tensor, output_shape, hy_ncut=1, group_feats_size=gin.REQUIRED, lie_alg_init_scale=gin.REQUIRED, lie_alg_init_type=gin.REQUIRED, n_act_points=gin.REQUIRED, is_training=True):
    """Deconvolutional decoder driven by a product of matrix exponentials.

    One learnable [1, mat_dim, mat_dim] basis matrix is created per latent
    dimension. Each basis is scaled by its latent value and exponentiated,
    and the results are multiplied together into a group element. The
    flattened group element feeds two dense layers and four transposed
    convolutions.

    Args:
        latent_tensor: input tensor whose last dimension is the number of
            latents; indexed as `latent_tensor[:, i]`.
        output_shape: list giving the shape of the data; `output_shape[2]`
            is the number of output channels.
        hy_ncut: forwarded to `split_latents`.
        group_feats_size: group feature size; `mat_dim` is the integer part
            of its square root.
        lie_alg_init_scale: second argument of the random-normal initializer
            for the basis variables.
        lie_alg_init_type: when equal to 'oth', each basis is replaced by
            its upper triangle minus the transpose of that upper triangle.
        n_act_points: not read by this function.
        is_training: not read by this function.

    Returns:
        A tuple `(output, lie_group, lie_alg_basis)`: the last deconvolution
        output reshaped to `[-1] + output_shape`, the group element of shape
        [b, mat_dim, mat_dim], and the stacked basis of shape
        [1, lat_dim, mat_dim, mat_dim].
    """
    latent_dim = latent_tensor.get_shape().as_list()[-1]
    # NOTE(review): the split result is never read below; the call is kept
    # so the graph is built exactly as before.
    latents_in_cut_ls = split_latents(latent_tensor, hy_ncut=hy_ncut)
    mat_dim = int(math.sqrt(group_feats_size))

    lie_alg_basis_ls = []
    for i in range(latent_dim):
        init = tf.initializers.random_normal(0, lie_alg_init_scale)
        basis_i = tf.get_variable('lie_alg_' + str(i),
                                  shape=[1, mat_dim, mat_dim],
                                  initializer=init)
        if lie_alg_init_type == 'oth':
            # Upper triangle minus its transpose.
            upper = tf.matrix_band_part(basis_i, 0, -1)
            basis_i = upper - tf.transpose(upper, perm=[0, 2, 1])
        lie_alg_basis_ls.append(basis_i)
    # [1, lat_dim, mat_dim, mat_dim]
    lie_alg_basis = tf.concat(lie_alg_basis_ls, axis=0)[tf.newaxis, ...]

    # NOTE(review): `lie_alg` is accumulated but never returned or used.
    lie_alg = 0
    lie_group = tf.eye(
        mat_dim, dtype=lie_alg_basis_ls[0].dtype)[tf.newaxis, ...]
    for i, basis_i in enumerate(lie_alg_basis_ls):
        scaled_basis = basis_i * latent_tensor[:, i][..., tf.newaxis,
                                                     tf.newaxis]
        lie_alg = lie_alg + scaled_basis
        factor = tf.linalg.expm(scaled_basis)  # [b, mat_dim, mat_dim]
        # Later latents are multiplied on the left of the running product.
        lie_group = tf.matmul(factor, lie_group)

    transed_act_points_tensor = tf.reshape(lie_group,
                                           [-1, mat_dim * mat_dim])

    hidden = tf.layers.dense(
        transed_act_points_tensor, 256, activation=tf.nn.relu)
    hidden = tf.layers.dense(hidden, 1024, activation=tf.nn.relu)
    feature_map = tf.reshape(hidden, shape=[-1, 4, 4, 64])

    # Three ReLU upsampling stages, then a linear stage producing channels.
    for n_filters in (64, 32, 32):
        feature_map = tf.layers.conv2d_transpose(
            inputs=feature_map,
            filters=n_filters,
            kernel_size=4,
            strides=2,
            activation=tf.nn.relu,
            padding="same",
        )
    logits = tf.layers.conv2d_transpose(
        inputs=feature_map,
        filters=output_shape[2],
        kernel_size=4,
        strides=2,
        padding="same",
    )
    return tf.reshape(logits, [-1] + output_shape), lie_group, lie_alg_basis
def __init__(self, lr, batch_size, dimension, util_train, util_test, campaign, reg_lambda, sigma):
    """Build the model graph, its losses and train step, and open a session.

    Args:
        lr: learning rate of the gradient-descent optimizer.
        batch_size: batch size; the ranking-loss reshapes hard-code it, so
            fed batches must contain exactly this many rows.
        dimension: not read by this constructor.
        util_train: training data helper; `get_data_amt()` and `feat_sizes`
            are read from it.
        util_test: test data helper; `get_data_amt()` is read from it.
        campaign: name used as a component of the output directory.
        reg_lambda: weight of the L2 regularisation term.
        sigma: scale in `exp(-delta / sigma)` of the ranking loss.
    """
    # Hyperparameters.
    self.lr = lr
    self.batch_size = batch_size
    self.util_train = util_train
    self.util_test = util_test
    self.reg_lambda = reg_lambda
    self.sigma = sigma
    self.emb_size = 20
    self.train_data_amt = util_train.get_data_amt()
    self.test_data_amt = util_test.get_data_amt()

    # Output directory, keyed by campaign and hyperparameters.
    run_tag = "{}_{}_{}_{}".format(self.lr, self.reg_lambda, self.batch_size, self.sigma)
    self.output_dir = "output/deephit/{}/{}/".format(campaign, run_tag)
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    # Start from an empty default graph.
    tf.reset_default_graph()

    # Field parameters.
    self.field_sizes = self.util_train.feat_sizes
    self.field_num = len(self.field_sizes)

    # Placeholders: one sparse input per field, plus z, b and y.
    self.X = [tf.sparse_placeholder(tf.float64) for _ in range(self.field_num)]
    self.z = tf.placeholder(tf.float64)
    self.b = tf.placeholder(tf.float64)
    self.y = tf.placeholder(tf.float64)

    # Embedding tables: field 0 maps to width 1, the rest to emb_size.
    self.var_map = {}
    self.var_map['embed_0'] = tf.Variable(
        tf.truncated_normal([self.field_sizes[0], 1], dtype=tf.float64))
    for i in range(1, self.field_num):
        table_shape = [self.field_sizes[i], self.emb_size]
        self.var_map['embed_%d' % i] = tf.Variable(
            tf.truncated_normal(table_shape, dtype=tf.float64))

    # Embed every field and concatenate along the feature axis.
    tables = [self.var_map['embed_%d' % i] for i in range(self.field_num)]
    embedded = [tf.sparse_tensor_dense_matmul(field, table)
                for field, table in zip(self.X, tables)]
    self.dense_input = tf.concat(embedded, 1)

    def _weight(shape, name):
        # float64 weight matrix drawn from a truncated normal.
        return tf.Variable(
            initial_value=tf.truncated_normal(shape=shape, dtype=tf.float64),
            name=name)

    # Weight matrices of the four-layer network.
    input_width = (self.field_num - 1) * self.emb_size + 1
    self.hidden1 = _weight([input_width, HIDDEN_SIZE1], 'h1')
    self.out1 = _weight([HIDDEN_SIZE1, OUT_SIZE1], 'o1')
    self.hidden2 = _weight([OUT_SIZE1, HIDDEN_SIZE2], 'h2')
    self.out2 = _weight([HIDDEN_SIZE2, OUT_SIZE2], 'o2')

    # Forward pass: relu -> sigmoid -> relu -> sigmoid.
    self.hidden1_val = tf.nn.relu(tf.matmul(self.dense_input, self.hidden1))
    self.out1_val = tf.sigmoid(tf.matmul(self.hidden1_val, self.out1))
    self.hidden2_val = tf.nn.relu(tf.matmul(self.out1_val, self.hidden2))
    self.out2_val = tf.sigmoid(tf.matmul(self.hidden2_val, self.out2))

    # p: softmax over the last layer; w: its exclusive cumulative sum.
    self.p = tf.nn.softmax(self.out2_val)
    self.w = tf.cumsum(self.p, exclusive=True, axis=1)

    def _row_col_index(values):
        # Pair each row number with the 1-based `values` entry shifted to
        # 0-based, giving (row, column) indices for gather_nd.
        # assumes `values` is fed as a [batch, 1] column - TODO confirm.
        rows = tf.reshape(tf.range(tf.shape(values)[0]), (-1, 1))
        cols = tf.cast(values - 1, tf.int32)
        return tf.stack([rows, cols], axis=-1)

    idx_z = _row_col_index(self.z)
    idx_b = _row_col_index(self.b)
    self.pz = tf.gather_nd(self.p, idx_z)
    self.wb = tf.gather_nd(self.w, idx_b)
    self.wz = tf.gather_nd(self.w, idx_z)

    # Log-likelihood style losses, clipped away from log(0).
    self.loss1 = -tf.reduce_sum(
        tf.log(tf.clip_by_value(self.pz, 1e-8, 1.0)) * self.y)
    self.loss2 = -tf.reduce_sum(
        tf.log(tf.clip_by_value(1 - self.wb, 1e-8, 1.0)) * (1 - self.y))
    # L2 penalty on every weight matrix, skipping each matrix's first row.
    self.reg_loss = (tf.nn.l2_loss(self.hidden1[1:])
                     + tf.nn.l2_loss(self.hidden2[1:])
                     + tf.nn.l2_loss(self.out1[1:])
                     + tf.nn.l2_loss(self.out2[1:]))

    def _tile_to_square(column):
        # Repeat a length-batch_size vector into a
        # (batch_size, batch_size) matrix whose rows are all that vector.
        flat = tf.reshape(column, (self.batch_size, ))
        tiled = tf.tile(flat, [self.batch_size])
        return tf.reshape(tiled, (self.batch_size, self.batch_size))

    # Ranking loss over pairs of examples in the batch.
    z_columns = tf.cast(self.z[:, 0] - 1, tf.int32)
    self.w_of_pair = tf.transpose(
        tf.nn.embedding_lookup(tf.transpose(self.w), z_columns))
    self.w_of_self = _tile_to_square(self.wz)
    self.win_label = _tile_to_square(self.y)
    self.delta = self.w_of_self - self.w_of_pair
    self.candidate = tf.exp(-self.delta / self.sigma)
    # Only the lower triangle of the pair matrix contributes.
    self.rank_loss = tf.reduce_sum(
        tf.matrix_band_part(self.candidate, -1, 0) * self.win_label)

    self.loss = self.loss1 + self.loss2 + self.reg_lambda * self.reg_loss + self.rank_loss
    self.optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self.train_step = self.optimizer.minimize(self.loss)

    # Session with on-demand GPU memory growth, then variable initialisation.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=session_config)
    tf.global_variables_initializer().run(session=self.sess)
def __init__(
    self,
    bert_config,
    is_training,
    input_ids,
    input_mask=None,
    token_type_ids=None,
    use_one_hot_embeddings=True,
    scope=None,
    embedding_size=None,
    input_embeddings=None,
    input_reprs=None,
    update_embeddings=True,
    untied_embeddings=False,
    ltr=False,
    rtl=False,
):
    """Constructor for BertModel.

    Args:
        bert_config: `BertConfig` instance. A deep copy is modified, never
            the caller's object.
        is_training: bool. true for training model, false for eval model.
            Controls whether dropout will be applied.
        input_ids: int32 Tensor of shape [batch_size, seq_length].
        input_mask: (optional) int32 Tensor of shape [batch_size,
            seq_length]. Defaults to all ones.
        token_type_ids: int32 Tensor of shape [batch_size, seq_length].
            Required despite the `None` default: batch size and sequence
            length are read from it.
        use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On
            the TPU, it is much faster if this is True, on the CPU or GPU,
            it is faster if this is False.
        scope: (optional) variable scope. Defaults to "electra".
        embedding_size: (optional) width of the word embeddings. Defaults to
            `bert_config.hidden_size`.
        input_embeddings: (optional) token embeddings used instead of the
            embedding lookup. `self.embedding_table` is not set in that
            case.
        input_reprs: (optional) tensor used directly as the embedding
            output, skipping the embedding lookup and post-processing.
        update_embeddings: if False, gradients are stopped at the embedding
            output.
        untied_embeddings: if True, embedding variables live under
            `scope + '/embeddings'` instead of 'electra/embeddings', so
            `scope` must then be a string.
        ltr: if True, apply a left-to-right causal attention mask.
        rtl: if True (and `ltr` is False), apply a right-to-left causal
            attention mask.

    Raises:
        ValueError: `token_type_ids` is None, the config is invalid, or one
            of the input tensor shapes is invalid.
    """
    # Validate before first use. The original asserted only after
    # token_type_ids had already been passed to get_shape_list.
    if token_type_ids is None:
        raise ValueError(
            'token_type_ids must be provided; the batch size and sequence '
            'length are derived from it.')

    bert_config = copy.deepcopy(bert_config)
    if not is_training:
        bert_config.hidden_dropout_prob = 0.0
        bert_config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(token_type_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if input_reprs is None:
        if input_embeddings is None:
            with tf.variable_scope(
                (scope if untied_embeddings else 'electra') + '/embeddings',
                reuse=tf.AUTO_REUSE,
            ):
                # Perform embedding lookup on the word ids
                if embedding_size is None:
                    embedding_size = bert_config.hidden_size
                (
                    self.token_embeddings,
                    self.embedding_table,
                ) = embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=bert_config.vocab_size,
                    embedding_size=embedding_size,
                    initializer_range=bert_config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=use_one_hot_embeddings,
                )
        else:
            self.token_embeddings = input_embeddings

        with tf.variable_scope(
            (scope if untied_embeddings else 'electra') + '/embeddings',
            reuse=tf.AUTO_REUSE,
        ):
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.token_embeddings,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=bert_config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=bert_config.initializer_range,
                max_position_embeddings=bert_config.max_position_embeddings,
                dropout_prob=bert_config.hidden_dropout_prob,
            )
    else:
        self.embedding_output = input_reprs
    if not update_embeddings:
        self.embedding_output = tf.stop_gradient(self.embedding_output)

    with tf.variable_scope(scope, default_name='electra'):
        # Project the embeddings when their width differs from hidden size.
        if self.embedding_output.shape[-1] != bert_config.hidden_size:
            self.embedding_output = tf.layers.dense(
                self.embedding_output,
                bert_config.hidden_size,
                name='embeddings_project',
            )

        with tf.variable_scope('encoder'):
            # This converts a 2D mask of shape [batch_size, seq_length] to a
            # 3D mask of shape [batch_size, seq_length, seq_length] which is
            # used for the attention scores.
            attention_mask = create_attention_mask_from_input_mask(
                token_type_ids, input_mask)

            # Add causal masking to the attention for running the transformer
            # left-to-right or right-to-left
            if ltr or rtl:
                causal_mask = tf.ones((seq_length, seq_length))
                if ltr:
                    causal_mask = tf.matrix_band_part(causal_mask, -1, 0)
                else:
                    causal_mask = tf.matrix_band_part(causal_mask, 0, -1)
                attention_mask *= tf.expand_dims(causal_mask, 0)

            # Run the stacked transformer. Output shapes
            # sequence_output: [batch_size, seq_length, hidden_size]
            # pooled_output: [batch_size, hidden_size]
            # all_encoder_layers: [n_layers, batch_size, seq_length,
            #   hidden_size].
            # attn_maps: [n_layers, batch_size, n_heads, seq_length,
            #   seq_length]
            (self.all_layer_outputs, self.attn_maps) = transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=bert_config.hidden_size,
                num_hidden_layers=bert_config.num_hidden_layers,
                num_attention_heads=bert_config.num_attention_heads,
                intermediate_size=bert_config.intermediate_size,
                intermediate_act_fn=get_activation(bert_config.hidden_act),
                hidden_dropout_prob=bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
                initializer_range=bert_config.initializer_range,
                do_return_all_layers=True,
            )
            self.sequence_output = self.all_layer_outputs[-1]
            # The pooled output is the final-layer vector at position 0.
            self.pooled_output = self.sequence_output[:, 0]
def mask_attn_weights(w):
    """Apply a causal mask to attention weights.

    Args:
        w: attention weights whose last dimension `n` is read via
            `shape_list`; the mask is reshaped to [1, 1, n, n] before being
            combined with `w`.

    Returns:
        `w` with entries on and below the diagonal unchanged and entries
        strictly above the diagonal replaced by -1e9.
    """
    n = shape_list(w)[-1]
    lower_triangle = tf.matrix_band_part(tf.ones([n, n]), -1, 0)
    causal = tf.reshape(lower_triangle, [1, 1, n, n])
    kept = w * causal
    blocked = -1e9 * (1 - causal)
    return kept + blocked
def _constrain_prob_mat(prob_mat, max_answer_size):
    """Constrain prob mat such that start <= end < start + max_answer_size.

    Args:
        prob_mat: tensor of shape [batch, doc_len, doc_len].
        max_answer_size: maximum answer length; capped at doc_len.

    Returns:
        `prob_mat` with every entry outside the band
        0 <= end - start <= min(doc_len, max_answer_size) - 1 set to zero.
    """
    doc_len = tf.shape(prob_mat)[1]
    capped_answer_size = tf.minimum(doc_len, max_answer_size)
    # The diagonal is a length-1 answer, so the band reaches
    # (answer length - 1) super-diagonals.
    num_upper = tf.to_int64(capped_answer_size) - 1
    return tf.matrix_band_part(prob_mat, 0, num_upper)