def create_mask(inputs, state, equal_window):
    """Builds the additive attention mask for a chunk, optionally with memory.

    The causal mask from `future_mask` is used as the base. When a state is
    supplied, zero-valued columns (one per memory slot) are prepended so the
    memory is always attendable.

    Args:
      inputs: inputs tensor of shape [B, N, D]; only its static length N and
        its dtype are read.
      state: optional tensor of shape [B, M, D], CompressedMemoryState or a
        list where the ith entry corresponds to the ith layer's state. For a
        list/tuple, the entry with the largest `_memory_size` is used.
      equal_window: if True, -1e6 is additionally added at the strictly
        lower-triangular positions of the first N columns of the mask, so
        each activation gets an equally-sized attention window. This only
        makes sense if a state is given.

    Returns:
      Float tensor of shape [1, 1, N, N + M], to be summed with logits.
    """
    seq_len = inputs.get_shape().as_list()[1]
    mask_dtype = inputs.dtype
    mask = future_mask(seq_len, mask_dtype)

    if state is not None:
        if isinstance(state, (tuple, list)):
            # Per-layer states: size the mask for the layer with most memory.
            memory_sizes = [_memory_size(layer_state) for layer_state in state]
            state = state[np.argmax(memory_sizes)]
        memory_len = _memory_size(state)
        memory_block = tf.zeros([1, 1, seq_len, memory_len], dtype=mask_dtype)
        mask = tf.concat([memory_block, mask], 3)

    if not equal_window:
        return mask

    ones = tf.ones([seq_len, seq_len], dtype=mask_dtype)
    diagonal = tf.cast(tf.matrix_band_part(ones, 0, 0), dtype=mask_dtype)
    lower = tf.cast(tf.matrix_band_part(ones, -1, 0), dtype=mask_dtype)
    strictly_lower = tf.reshape(lower - diagonal, [1, 1, seq_len, seq_len])
    window_start_mask = strictly_lower * -1e6
    # Only the first `seq_len` columns are penalised; the rest pass through.
    leading = mask[:, :, :, :seq_len] + window_start_mask
    trailing = mask[:, :, :, seq_len:]
    return tf.concat([leading, trailing], 3)
 def get_mask_init():
     """Returns the stacked strictly-lower and strictly-upper triangular masks.

     Reads `n_1x1_heads`, `n_channels` and `dtype` from the enclosing scope.
     The result has shape [2, n_1x1_heads, n_channels, n_channels]; index 0 on
     the first axis is the strictly-lower mask, index 1 the strictly-upper.
     """
     base = tf.ones([n_1x1_heads, n_channels, n_channels], dtype=dtype)
     lower_with_diag = tf.matrix_band_part(base, -1, 0)
     strictly_lower = lower_with_diag - tf.matrix_band_part(base, 0, 0)
     upper_with_diag = tf.matrix_band_part(base, 0, -1)
     strictly_upper = upper_with_diag - tf.matrix_band_part(base, 0, 0)
     return tf.stack([strictly_lower, strictly_upper], axis=0)
def future_mask(chunk_size, dtype):
    """Returns an additive mask that stops element i attending to any j > i.

    The result has shape [1, 1, chunk_size, chunk_size] so that it broadcasts
    against [B, H, N, N] logits. Entries strictly above the diagonal are -1e6;
    the diagonal and everything below it are 0 (self-attention is allowed).
    """
    ones = tf.ones([chunk_size, chunk_size], dtype=dtype)
    upper_with_diag = tf.matrix_band_part(ones, 0, -1)
    diagonal = tf.matrix_band_part(ones, 0, 0)
    strictly_upper = upper_with_diag - diagonal
    return -1e6 * tf.reshape(strictly_upper, [1, 1, chunk_size, chunk_size])
Esempio n. 4
0
def _create_mask(qlen, mlen, dtype=tf.float32, same_length=False):
    """Creates a causal attention mask of shape [qlen, mlen + qlen].

    The first `mlen` columns are all zero. In the remaining `qlen` columns
    the entries strictly above the diagonal are 1 and the rest are 0. With
    `same_length`, the strictly-lower triangle of a [qlen, qlen] matrix is
    additionally added onto the first `qlen` columns of the result.

    NOTE(review): presumably 1 marks positions that must not be attended to;
    confirm against the caller.
    """
    ones = tf.ones([qlen, qlen], dtype=dtype)
    upper = tf.matrix_band_part(ones, 0, -1)
    diagonal = tf.matrix_band_part(ones, 0, 0)
    memory_cols = tf.zeros([qlen, mlen], dtype=dtype)
    mask = tf.concat([memory_cols, upper - diagonal], 1)
    if not same_length:
        return mask

    lower = tf.matrix_band_part(ones, -1, 0)
    head = mask[:, :qlen] + lower - diagonal
    tail = mask[:, qlen:]
    return tf.concat([head, tail], 1)
  def __call__(self,
               inputs_BxIxH,
               inputs_padding_BxI,
               targets_BxTxH,
               training,
               targets_start=None):
    """Runs the decoder stack over the targets, attending to the inputs.

    Args:
      inputs_BxIxH: tensor handed to every decoder layer as the attended
        inputs; its dtype also sets the dtype of the causal bias.
      inputs_padding_BxI: padding indicator for the inputs. It is multiplied
        by the minimum value of its own dtype to form an additive bias.
      targets_BxTxH: target tensor; shifted right by one position (zero
        prepended, last step dropped) before being fed to the decoder.
      training: forwarded to dropout and to each decoder layer.
      targets_start: forwarded to `TimingSignal` together with the shifted
        targets.

    Returns:
      The output of `self._preprocess_layer` applied to the last decoder
      layer's output.
    """
    # Additive bias that masks padded input positions in attention.
    padding_bias_BxI = inputs_padding_BxI * inputs_padding_BxI.dtype.min
    inputs_attention_bias_Bx1x1xI = tf.expand_dims(
        tf.expand_dims(padding_bias_BxI, 1), 1)

    # Additive bias that masks "future" target positions, so they cannot
    # leak into predictions when the loss covers the whole targets matrix.
    num_targets = tf.shape(targets_BxTxH)[1]
    ones_TxT = tf.ones((num_targets, num_targets), dtype=inputs_BxIxH.dtype)
    future_TxT = 1 - tf.matrix_band_part(ones_TxT, -1, 0)
    future_1x1xTxT = tf.expand_dims(tf.expand_dims(future_TxT, 0), 0)
    decoder_self_attention_bias_1x1xTxT = (
        future_1x1xTxT * inputs_BxIxH.dtype.min)

    # Shift targets right: pad one zero step on the left, drop the last step.
    shifted_BxTxH = tf.pad(targets_BxTxH, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    shifted_BxTxH = TimingSignal()(shifted_BxTxH, targets_start)
    x_BxTxH = tf.layers.dropout(
        shifted_BxTxH, self._postprocess_dropout, training=training)

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
      for layer_index, layer in enumerate(self._decoder_layers):
        with tf.variable_scope("layer_%d" % layer_index):
          x_BxTxH = layer(x_BxTxH, decoder_self_attention_bias_1x1xTxT,
                          inputs_BxIxH, inputs_attention_bias_Bx1x1xI,
                          training)
      decoder_output_BxTxH = self._preprocess_layer(x_BxTxH)

    return decoder_output_BxTxH
Esempio n. 6
0
def _create_mask(qlen, mlen, same_length=False):
    """Creates a float32 causal attention mask of shape [qlen, mlen + qlen].

    Args:
      qlen: number of query positions (rows).
      mlen: number of memory positions; these columns are all zero.
      same_length: if True, the strictly-lower triangle of a [qlen, qlen]
        matrix is additionally added onto the first `qlen` columns.

    Returns:
      Tensor whose first `mlen` columns are 0 and whose remaining `qlen`
      columns are 1 strictly above the diagonal and 0 elsewhere (plus the
      `same_length` adjustment, if requested).
    """
    attn_mask = tf.ones([qlen, qlen])
    mask_u = tf.linalg.band_part(attn_mask, 0, -1)
    mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
    attn_mask_pad = tf.zeros([qlen, mlen])
    ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
    if same_length:
        # Use the same `tf.linalg.band_part` entry point as above: the legacy
        # `tf.matrix_band_part` alias does not exist in TF2, so this branch
        # used to fail there while the default path worked.
        mask_l = tf.linalg.band_part(attn_mask, -1, 0)
        ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
    return ret
Esempio n. 7
0
def cindex_score(y_true, y_pred):
    """Computes a concordance-index style score between targets and predictions.

    For every pair (i, j) kept by the lower-triangular band (j <= i) with
    y_true[i] > y_true[j], the pair scores 1 if y_pred[i] > y_pred[j], 0.5 if
    the two predictions are equal and 0 otherwise. The result is the sum of
    these scores divided by the number of such pairs.

    Args:
      y_true: ground-truth values (assumes a 1-D tensor — TODO confirm).
      y_pred: predicted values, same shape as `y_true`.

    Returns:
      Scalar float tensor; 0.0 when the summed pair score is 0.
    """
    pred_diff = tf.subtract(tf.expand_dims(y_pred, -1), y_pred)
    # Use tf.equal rather than `==`: under TF 1.x, `Tensor == 0.0` is an
    # object-identity comparison that yields the Python bool False, so tied
    # predictions silently got 0 credit instead of 0.5.
    tie_credit = tf.cast(tf.equal(pred_diff, 0.0), tf.float32) * 0.5
    g = tie_credit + tf.cast(pred_diff > 0.0, tf.float32)

    # Comparable pairs: strictly larger true value, lower triangle only.
    f = tf.subtract(tf.expand_dims(y_true, -1), y_true) > 0.0
    f = tf.matrix_band_part(tf.cast(f, tf.float32), -1, 0)

    g = tf.reduce_sum(tf.multiply(g, f))
    f = tf.reduce_sum(f)

    # If there are no comparable pairs, f == 0 implies g == 0, so the 0.0
    # branch is selected instead of the NaN from 0 / 0.
    return tf.where(tf.equal(g, 0), 0.0, g / f)
def compute_attention_mask(token_mask, input_mask):
    """Computes the attention mask for tokens, optionally preceded by inputs.

    Args:
      token_mask: tensor whose first two dimensions give the batch size and
        the number of tokens; only its shape is read.
      input_mask: optional [batch_size, num_inputs] mask. If given, it is
        repeated for every token and prepended along the last axis.
        NOTE(review): must be concat-compatible with int32 — confirm.

    Returns:
      int32 tensor of shape [batch_size, num_tokens, num_tokens] holding a
      lower-triangular (causal) pattern of ones, or
      [batch_size, num_tokens, num_inputs + num_tokens] when `input_mask` is
      provided.
    """
    batch_size = tensor_utils.shape(token_mask, 0)
    num_tokens = tensor_utils.shape(token_mask, 1)
    all_pairs = tf.ones([batch_size, num_tokens, num_tokens], dtype=tf.int32)
    causal = tf.matrix_band_part(all_pairs, -1, 0)
    if input_mask is None:
        return causal

    # Broadcast the per-example input mask to every token row.
    per_token_input = tf.tile(
        tf.expand_dims(input_mask, 1), [1, num_tokens, 1])
    return tf.concat([per_token_input, causal], axis=-1)
Esempio n. 9
0
def max_scoring_span(start_scores, end_scores, max_length, no_answer_bias=0):
    """Compute max scoring span, using the sum of start and end scores.

    Args:
      start_scores: <float32> [batch_size, seq_len]
      end_scores: <float32> [batch_size, seq_len]
      max_length: <int32> Max answer length.
      no_answer_bias: <float32> Log-odds threshold for "no-answer" selection.
        I.e. if log p(span=i,j)/p(span=NULL) > no_answer_bias, then select
        i, j as the span, and NULL otherwise. Half of the bias is added to
        the start score and half to the end score of position 0.

    Returns:
      start: <int32> [batch_size]
      end: <int32> [batch_size]
      span_scores: <float32> [batch_size] score of the selected span.
    """
    # Create sparse tensor of size [seq_len].
    seq_len = tensor_utils.shape(start_scores, -1)
    no_answer_bias = tf.scatter_nd([[0]], [no_answer_bias], [seq_len])
    no_answer_bias = tf.cast(no_answer_bias, tf.float32)

    # Apply bias to CLS token logits.
    no_answer_bias = tf.div(no_answer_bias, 2)
    start_scores += tf.expand_dims(no_answer_bias, 0)
    end_scores += tf.expand_dims(no_answer_bias, 0)

    # Compute outer sum, and mask to be upper triangular.
    # This gives a matrix of start[i] + end[j] scores, where j >= i and
    # j - i < max_length.
    scores = tf.expand_dims(start_scores, 2) + tf.expand_dims(end_scores, 1)
    mask = (1 - tf.matrix_band_part(tf.ones_like(scores), 0, max_length - 1))
    # The penalty must dwarf any real score so that masked (i, j) pairs can
    # never win the argmax. The previous 1e-4 only nudged them, so spans with
    # end < start or longer than max_length could still be selected.
    scores -= mask * 1e9

    def map_fn(inputs):
        flattened = tf.reshape(inputs, [-1])
        argmax = tf.argmax(flattened, output_type=tf.int32)
        indices = tensor_utils.unravel_index_2d(argmax, inputs.shape)
        score = flattened[argmax]
        return indices, score

    # Return i, j indices of max-scoring entry.
    with tf.device("/cpu"):
        endpoints, span_scores = tf.map_fn(fn=map_fn,
                                           elems=scores,
                                           dtype=(tf.int32, tf.float32))
    start = endpoints[:, 0]
    end = endpoints[:, 1]

    return start, end, span_scores
Esempio n. 10
0
    def __call__(self, inputs_BxTxH, training, targets_start=None):
        """TransformerDecoderOnly call operator.

        Args:
          inputs_BxTxH: a 3d-tensor representing decoder inputs; during
            training essentially the right-shifted decoder targets.
          training: bool indicating whether we are in training mode.
          targets_start: starting index for adding position information.

        Returns:
          3d-tensor of shape [B, T, H] representing decoder outputs.
        """
        # Build an additive bias that hides "future" positions so they cannot
        # creep into predictions when the loss covers the whole sequence.
        num_positions = tf.shape(inputs_BxTxH)[1]
        ones_TxT = tf.ones((num_positions, num_positions), dtype=tf.float32)
        # 1 strictly above the diagonal, 0 elsewhere. For T == 3:
        # [[0., 1., 1.],
        #  [0., 0., 1.],
        #  [0., 0., 0.]]
        future_TxT = 1 - tf.matrix_band_part(ones_TxT, -1, 0)
        future_1x1xTxT = tf.expand_dims(tf.expand_dims(future_TxT, 0), 0)
        # Future entries become tf.float32.min (the most negative finite
        # float32, not a true -inf); the remaining entries stay 0.
        masked_attention_bias_1x1xTxT = future_1x1xTxT * tf.float32.min

        # No shifting/padding here: inputs_BxTxH already starts with the
        # special token.
        positioned_BxTxH = TimingSignal()(inputs_BxTxH, targets_start)
        x_BxTxH = tf.layers.dropout(positioned_BxTxH,
                                    self._postprocess_dropout,
                                    training=training)

        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            for layer_index, layer in enumerate(self._decoder_layers):
                with tf.variable_scope("layer_%d" % layer_index):
                    x_BxTxH = layer(x_BxTxH,
                                    masked_attention_bias_1x1xTxT,
                                    training)
            decoder_output_BxTxH = self._preprocess_layer(x_BxTxH)

        return decoder_output_BxTxH
Esempio n. 11
0
    def __init__(self, posts, **kwargs):
        """Set up a posterior with a full per-vertex covariance matrix.

        The base-class constructor is run first; this constructor then adds
        trainable off-diagonal terms and builds `self.cov_chol` and `self.cov`.

        :param posts: Passed through to `FactorisedPosterior.__init__`.

        Keyword arguments read here (all are also forwarded to the base class):

        :param init: Optional `(mean, cov)` pair. When truthy, the off-diagonal
                     variables start from `tf.cholesky(cov)`; otherwise zeros.
        :param suppress_nan: If true (the default), NaN entries of the
                             off-diagonal variables are replaced by zero.
        """
        FactorisedPosterior.__init__(self, posts, **kwargs)

        # The covariance is built from a Cholesky-style factor so that it
        # stays positive definite. A PxP variable is created per vertex, but
        # only its strictly lower-triangular part is used; the diagonal comes
        # from the FactorisedPosterior (self.std).
        initial_posterior = kwargs.get("init", None)
        if initial_posterior:
            # Starting from an existing posterior: the base class has already
            # taken the mean and the diagonal, so only the Cholesky factor of
            # the covariance is needed to seed the off-diagonal terms.
            self.log.info(" - Initializing posterior covariance from input posterior")
            _unused_mean, init_cov = initial_posterior
            chol_init = tf.cholesky(init_cov)
        else:
            chol_init = tf.zeros([self.nvertices, self.nparams, self.nparams], dtype=tf.float32)

        raw_vars = self.log_tf(tf.Variable(chol_init, validate_shape=False,
                                           name='%s_off_diag_vars' % self.name))
        self.off_diag_vars_base = raw_vars
        if kwargs.get("suppress_nan", True):
            self.off_diag_vars = tf.where(tf.is_nan(raw_vars), tf.zeros_like(raw_vars), raw_vars)
        else:
            self.off_diag_vars = raw_vars

        # Keep the lower triangle and zero its diagonal.
        lower_triangle = tf.matrix_band_part(self.off_diag_vars, -1, 0)
        self.off_diag_cov_chol = tf.matrix_set_diag(lower_triangle,
                                                    tf.zeros([self.nvertices, self.nparams]),
                                                    name='%s_off_diag_cov_chol' % self.name)

        # Full factor = diagonal (from self.std) + strictly-lower part.
        chol = tf.add(tf.matrix_diag(self.std), self.off_diag_cov_chol,
                      name='%s_cov_chol' % self.name)

        # Covariance = transpose(factor) x factor, batched over vertices.
        cov = tf.matmul(tf.transpose(chol, perm=(0, 2, 1)), chol,
                        name='%s_cov' % self.name)

        self.cov_chol = self.log_tf(chol)
        self.cov = self.log_tf(cov)
Esempio n. 12
0
def group_v2_deconv_decoder(latent_tensor,
                            output_shape,
                            hy_ncut=1,
                            group_feats_size=gin.REQUIRED,
                            lie_alg_init_scale=gin.REQUIRED,
                            lie_alg_init_type=gin.REQUIRED,
                            n_act_points=gin.REQUIRED,
                            is_training=True):
    """Convolutional decoder used in beta-VAE paper for the chairs data.

    Based on row 3 of Table 1 on page 13 of "beta-VAE: Learning Basic Visual
    Concepts with a Constrained Variational Framework"
    (https://openreview.net/forum?id=Sy2fzU9gl)
    Here the latents are first mapped to a matrix group element: one
    learnable [mat_dim, mat_dim] basis matrix is created per latent dimension,
    each is scaled by its latent value and exponentiated, and the resulting
    matrices are multiplied together. The flattened product feeds the
    dense + transposed-convolution stack.

    Args:
        latent_tensor: Input tensor to connect decoder to. Indexed as
            `latent_tensor[:, i]`, with the size of its last axis used as the
            number of latent dimensions.
        output_shape: Shape of the data as a list (it is concatenated with
            `[-1]`); `output_shape[2]` is the number of output channels.
        hy_ncut: Forwarded to `split_latents`.
        group_feats_size: The dimension of group features; the matrix side is
            `int(sqrt(group_feats_size))`.
        lie_alg_init_scale: Second argument of the random-normal initializer
            used for every basis matrix.
        lie_alg_init_type: If 'oth', each basis matrix is replaced by its
            upper-triangular part minus the transpose of that part.
        n_act_points: Not used by the active code.
        is_training: Whether or not the graph is built for training (UNUSED).

    Returns:
        A 3-tuple `(output, lie_group, lie_alg_basis)`:
        output: The final transposed-convolution output reshaped to
            `[-1] + output_shape`. No output activation is applied here.
        lie_group: Product of the per-latent matrix exponentials,
            [b, mat_dim, mat_dim].
        lie_alg_basis: Stacked basis matrices,
            [1, lat_dim, mat_dim, mat_dim].
    """
    # is_training is intentionally left in the signature but never read.

    lie_alg_basis_ls = []
    latent_dim = latent_tensor.get_shape().as_list()[-1]
    # NOTE(review): the result is never read by the active code below (only
    # by a removed, commented-out variant) — confirm whether the call is
    # still needed.
    latents_in_cut_ls = split_latents(latent_tensor,
                                      hy_ncut=hy_ncut)  # [x0, x1]

    mat_dim = int(math.sqrt(group_feats_size))
    # One learnable basis matrix per latent dimension.
    for i in range(latent_dim):
        init = tf.initializers.random_normal(0, lie_alg_init_scale)
        lie_alg_tmp = tf.get_variable('lie_alg_' + str(i),
                                      shape=[1, mat_dim, mat_dim],
                                      initializer=init)
        if lie_alg_init_type == 'oth':
            # Upper-triangular part minus its transpose: skew-symmetric.
            lie_alg_tmp = tf.matrix_band_part(lie_alg_tmp, 0, -1)
            lie_alg_tmp = lie_alg_tmp - tf.transpose(lie_alg_tmp,
                                                     perm=[0, 2, 1])
        lie_alg_basis_ls.append(lie_alg_tmp)
    lie_alg_basis = tf.concat(lie_alg_basis_ls,
                              axis=0)[tf.newaxis,
                                      ...]  # [1, lat_dim, mat_dim, mat_dim]

    # Accumulate the group element one latent at a time: each new factor
    # expm(latent_i * basis_i) is multiplied on the left of the running
    # product.
    # NOTE(review): `lie_alg` (the running sum) is accumulated but never
    # read or returned.
    lie_alg = 0
    lie_group = tf.eye(mat_dim, dtype=lie_alg_basis_ls[0].dtype)[tf.newaxis,
                                                                 ...]
    for i, lie_alg_basis_i in enumerate(lie_alg_basis_ls):
        lie_alg_tmp = lie_alg_basis_i * latent_tensor[:, i][..., tf.newaxis,
                                                            tf.newaxis]
        lie_alg = lie_alg + lie_alg_tmp
        lie_group_tmp = tf.linalg.expm(lie_alg_tmp)  # [b, mat_dim, mat_dim]
        lie_group = tf.matmul(lie_group_tmp, lie_group)
    # (Commented-out alternatives used to live here: a single expm of the
    # summed algebra element, and a per-cut product over `latents_in_cut_ls`.)

    transed_act_points_tensor = tf.reshape(lie_group, [-1, mat_dim * mat_dim])

    # (A commented-out variant applied `lie_group` to learnable `act_points`
    # of shape [1, mat_dim, n_act_points] before flattening.)

    d1 = tf.layers.dense(transed_act_points_tensor, 256, activation=tf.nn.relu)
    d2 = tf.layers.dense(d1, 1024, activation=tf.nn.relu)
    d2_reshaped = tf.reshape(d2, shape=[-1, 4, 4, 64])

    # Four stride-2 "same" transposed convolutions: 4 -> 8 -> 16 -> 32 -> 64.
    d3 = tf.layers.conv2d_transpose(
        inputs=d2_reshaped,
        filters=64,
        kernel_size=4,
        strides=2,
        activation=tf.nn.relu,
        padding="same",
    )

    d4 = tf.layers.conv2d_transpose(
        inputs=d3,
        filters=32,
        kernel_size=4,
        strides=2,
        activation=tf.nn.relu,
        padding="same",
    )

    d5 = tf.layers.conv2d_transpose(
        inputs=d4,
        filters=32,
        kernel_size=4,
        strides=2,
        activation=tf.nn.relu,
        padding="same",
    )
    # Final layer has no activation and emits `output_shape[2]` channels.
    d6 = tf.layers.conv2d_transpose(
        inputs=d5,
        filters=output_shape[2],
        kernel_size=4,
        strides=2,
        padding="same",
    )
    return tf.reshape(d6, [-1] + output_shape), lie_group, lie_alg_basis
Esempio n. 13
0
    def __init__(self, lr, batch_size, dimension, util_train, util_test, campaign, reg_lambda, sigma):
        """Build the TF1 graph, loss, optimizer and session for the model.

        Side effects: creates `output/deephit/<campaign>/<model_name>/` if it
        does not exist, resets the default TF graph, and opens a `tf.Session`
        with all variables initialized.

        Args:
            lr: learning rate for plain gradient descent.
            batch_size: batch size. The ranking loss reshapes to
                (batch_size, batch_size), so fed batches must have exactly
                this many rows.
            dimension: not used by this constructor.
            util_train: training data helper; `get_data_amt()` and
                `feat_sizes` are read from it.
            util_test: test data helper; `get_data_amt()` is read from it.
            campaign: name used in the output directory path.
            reg_lambda: weight of the L2 regularization term.
            sigma: temperature dividing the pairwise difference in the
                ranking loss.
        """
        # hyperparameters
        self.lr = lr
        self.batch_size = batch_size
        self.util_train = util_train
        self.util_test = util_test
        self.reg_lambda = reg_lambda
        self.sigma = sigma
        self.emb_size = 20

        self.train_data_amt = util_train.get_data_amt()
        self.test_data_amt = util_test.get_data_amt()

        # output dir, keyed by the hyperparameter combination
        model_name = "{}_{}_{}_{}".format(self.lr, self.reg_lambda, self.batch_size, self.sigma)
        self.output_dir = "output/deephit/{}/{}/".format(campaign, model_name)
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # reset graph (global side effect: discards any previously built graph)
        tf.reset_default_graph()

        # field params: one entry per input field
        self.field_sizes = self.util_train.feat_sizes
        self.field_num = len(self.field_sizes)

        # placeholders: one sparse input per field, plus z, b and y.
        # z and b are used below as 1-based column indices into p / w.
        # NOTE(review): presumably z is the observed event index, b the
        # censoring index and y the event indicator — confirm against the
        # feeding code.
        self.X = [tf.sparse_placeholder(tf.float64) for i in range(0, self.field_num)]
        self.z = tf.placeholder(tf.float64)
        self.b = tf.placeholder(tf.float64)
        self.y = tf.placeholder(tf.float64)

        # embedding layer
        self.var_map = {}
        # field 0 gets a width-1 embedding; every other field gets emb_size
        self.var_map['embed_0'] = tf.Variable(
                tf.truncated_normal([self.field_sizes[0], 1], dtype=tf.float64))
        for i in range(1, self.field_num):
            self.var_map['embed_%d' % i] = tf.Variable(
                tf.truncated_normal([self.field_sizes[i], self.emb_size], dtype=tf.float64))

        # after embedding: concatenate per-field embeddings along axis 1,
        # giving width (field_num - 1) * emb_size + 1
        w0 = [self.var_map['embed_%d' % i] for i in range(self.field_num)]
        self.dense_input = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i]) for i in range(self.field_num)], 1)

        # weight matrices of the four stacked layers (no bias terms)
        self.hidden1 = tf.Variable(initial_value=tf.truncated_normal(shape=[(self.field_num - 1) * self.emb_size + 1, HIDDEN_SIZE1], dtype=tf.float64), name='h1')
        self.out1 = tf.Variable(initial_value=tf.truncated_normal(shape=[HIDDEN_SIZE1, OUT_SIZE1], dtype=tf.float64), name='o1')
        self.hidden2 = tf.Variable(initial_value=tf.truncated_normal(shape=[OUT_SIZE1, HIDDEN_SIZE2], dtype=tf.float64), name='h2')
        self.out2 = tf.Variable(initial_value=tf.truncated_normal(shape=[HIDDEN_SIZE2, OUT_SIZE2], dtype=tf.float64), name='o2')

        # forward pass: relu -> sigmoid -> relu -> sigmoid
        self.hidden1_val = tf.nn.relu(tf.matmul(self.dense_input, self.hidden1))
        self.out1_val = tf.sigmoid(tf.matmul(self.hidden1_val, self.out1))
        self.hidden2_val = tf.nn.relu(tf.matmul(self.out1_val, self.hidden2))
        self.out2_val = tf.sigmoid(tf.matmul(self.hidden2_val, self.out2))

        # p: softmax over the output columns; w: exclusive cumulative sum of
        # p along axis 1 (column k holds the sum of p over columns < k)
        self.p = tf.nn.softmax(self.out2_val)
        self.w = tf.cumsum(self.p, exclusive=True, axis = 1)

        # (row, column) index pairs selecting column z-1 / b-1 of each row
        idx_z = tf.stack([tf.reshape(tf.range(tf.shape(self.z)[0]), (-1,1)), tf.cast(self.z - 1, tf.int32)], axis=-1)
        idx_b = tf.stack([tf.reshape(tf.range(tf.shape(self.b)[0]), (-1,1)), tf.cast(self.b - 1, tf.int32)], axis=-1)

        self.pz = tf.gather_nd(self.p, idx_z)
        self.wb = tf.gather_nd(self.w, idx_b)
        self.wz = tf.gather_nd(self.w, idx_z)

        # loss and train step
        # loss1: negative log of p at z, weighted by y;
        # loss2: negative log of (1 - w at b), weighted by (1 - y).
        # Probabilities are clipped to [1e-8, 1] before the log.
        self.loss1 = -tf.reduce_sum(tf.log(tf.clip_by_value(self.pz, 1e-8, 1.0)) * self.y)
        self.loss2 = -tf.reduce_sum(tf.log(tf.clip_by_value(1 - self.wb, 1e-8, 1.0)) * (1 - self.y))
        # L2 penalty on every weight matrix, skipping each matrix's first row
        self.reg_loss = tf.nn.l2_loss(self.hidden1[1:,]) + tf.nn.l2_loss(self.hidden2[1:,]) + \
                        tf.nn.l2_loss(self.out1[1:,]) + tf.nn.l2_loss(self.out2[1:,])

        # get ranking loss
        # w_of_pair[i, j] = w[i, z_j - 1]; w_of_self[i, j] = wz[j];
        # win_label[i, j] = y[j]. Only the lower triangle (j <= i) of
        # exp(-(w_of_self - w_of_pair) / sigma) contributes.
        self.w_of_pair = tf.transpose(tf.nn.embedding_lookup(tf.transpose(self.w), tf.cast(self.z[:,0] - 1, tf.int32)))
        self.w_of_self = tf.reshape(tf.tile(tf.reshape(self.wz, (self.batch_size, )), [self.batch_size]), (self.batch_size, self.batch_size))
        self.win_label = tf.reshape(tf.tile(tf.reshape(self.y, (self.batch_size, )), [self.batch_size]), (self.batch_size, self.batch_size))
        self.delta = self.w_of_self - self.w_of_pair
        self.candidate = tf.exp(-self.delta / self.sigma)
        self.rank_loss = tf.reduce_sum(tf.matrix_band_part(self.candidate, -1, 0) * self.win_label)

        self.loss = self.loss1 + self.loss2 + self.reg_lambda * self.reg_loss + self.rank_loss

        self.optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_step = self.optimizer.minimize(self.loss)

        # session initialization; GPU memory is allocated on demand
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
Esempio n. 14
0
    def __init__(
        self,
        bert_config,
        is_training,
        input_ids,
        input_mask=None,
        token_type_ids=None,
        use_one_hot_embeddings=True,
        scope=None,
        embedding_size=None,
        input_embeddings=None,
        input_reprs=None,
        update_embeddings=True,
        untied_embeddings=False,
        ltr=False,
        rtl=False,
    ):
        """Constructor for BertModel.

        Args:
          bert_config: `BertConfig` instance. It is deep-copied, so the
            caller's object is never modified.
          is_training: bool. true for training model, false for eval model.
            Controls whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length]. Only read
            when neither `input_embeddings` nor `input_reprs` is given.
          input_mask: (optional) int32 Tensor of shape [batch_size,
            seq_length]. Defaults to all ones.
          token_type_ids: int32 Tensor of shape [batch_size, seq_length].
            Required despite the `None` default: the batch size and sequence
            length are taken from its shape.
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On
            the TPU, it is much faster if this is True, on the CPU or GPU, it
            is faster if this is False.
          scope: (optional) variable scope. Defaults to "electra". Must be a
            string when `untied_embeddings` is True, because it is
            concatenated with '/embeddings'.
          embedding_size: (optional) width of the word embeddings. Defaults
            to `bert_config.hidden_size`.
          input_embeddings: (optional) precomputed token embeddings used
            instead of looking up `input_ids`. In that case
            `self.embedding_table` is not set.
          input_reprs: (optional) tensor used directly as the embedding
            output, skipping the lookup and the embedding post-processing.
          update_embeddings: if False, gradients are stopped at the embedding
            output.
          untied_embeddings: if True, embedding variables live under
            `scope + '/embeddings'` instead of 'electra/embeddings'.
          ltr: if True, attention is restricted to a lower-triangular
            (left-to-right) pattern. Takes precedence over `rtl`.
          rtl: if True (and `ltr` is False), attention is restricted to an
            upper-triangular (right-to-left) pattern.

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid, or `token_type_ids` is None.
        """
        # Validate before first use. This check used to sit after
        # `token_type_ids` had already been passed to get_shape_list, so it
        # could never guard that call.
        if token_type_ids is None:
            raise ValueError('token_type_ids must be provided')

        bert_config = copy.deepcopy(bert_config)
        if not is_training:
            bert_config.hidden_dropout_prob = 0.0
            bert_config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(token_type_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if input_reprs is None:
            if input_embeddings is None:
                with tf.variable_scope(
                    (scope if untied_embeddings else 'electra') +
                        '/embeddings',
                        reuse=tf.AUTO_REUSE,
                ):
                    # Perform embedding lookup on the word ids
                    if embedding_size is None:
                        embedding_size = bert_config.hidden_size
                    (
                        self.token_embeddings,
                        self.embedding_table,
                    ) = embedding_lookup(
                        input_ids=input_ids,
                        vocab_size=bert_config.vocab_size,
                        embedding_size=embedding_size,
                        initializer_range=bert_config.initializer_range,
                        word_embedding_name='word_embeddings',
                        use_one_hot_embeddings=use_one_hot_embeddings,
                    )
            else:
                self.token_embeddings = input_embeddings

            with tf.variable_scope(
                (scope if untied_embeddings else 'electra') + '/embeddings',
                    reuse=tf.AUTO_REUSE,
            ):
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.token_embeddings,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=bert_config.type_vocab_size,
                    token_type_embedding_name='token_type_embeddings',
                    use_position_embeddings=True,
                    position_embedding_name='position_embeddings',
                    initializer_range=bert_config.initializer_range,
                    max_position_embeddings=bert_config.
                    max_position_embeddings,
                    dropout_prob=bert_config.hidden_dropout_prob,
                )
        else:
            self.embedding_output = input_reprs
        if not update_embeddings:
            self.embedding_output = tf.stop_gradient(self.embedding_output)

        with tf.variable_scope(scope, default_name='electra'):
            # Project the embeddings up/down when their width differs from
            # the transformer's hidden size.
            if self.embedding_output.shape[-1] != bert_config.hidden_size:
                self.embedding_output = tf.layers.dense(
                    self.embedding_output,
                    bert_config.hidden_size,
                    name='embeddings_project',
                )

            with tf.variable_scope('encoder'):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = create_attention_mask_from_input_mask(
                    token_type_ids, input_mask)

                # Add causal masking to the attention for running the transformer
                # left-to-right or right-to-left
                if ltr or rtl:
                    causal_mask = tf.ones((seq_length, seq_length))
                    if ltr:
                        causal_mask = tf.matrix_band_part(causal_mask, -1, 0)
                    else:
                        causal_mask = tf.matrix_band_part(causal_mask, 0, -1)
                    attention_mask *= tf.expand_dims(causal_mask, 0)

                # Run the stacked transformer. Output shapes
                # sequence_output: [batch_size, seq_length, hidden_size]
                # pooled_output: [batch_size, hidden_size]
                # all_encoder_layers: [n_layers, batch_size, seq_length, hidden_size].
                # attn_maps: [n_layers, batch_size, n_heads, seq_length, seq_length]
                (self.all_layer_outputs, self.attn_maps) = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=bert_config.hidden_size,
                    num_hidden_layers=bert_config.num_hidden_layers,
                    num_attention_heads=bert_config.num_attention_heads,
                    intermediate_size=bert_config.intermediate_size,
                    intermediate_act_fn=get_activation(bert_config.hidden_act),
                    hidden_dropout_prob=bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=bert_config.
                    attention_probs_dropout_prob,
                    initializer_range=bert_config.initializer_range,
                    do_return_all_layers=True,
                )
                self.sequence_output = self.all_layer_outputs[-1]
                # "Pooled" output is simply the first position's vector.
                self.pooled_output = self.sequence_output[:, 0]
Esempio n. 15
0
def mask_attn_weights(w):
    """Applies a causal mask to attention weights.

    Entries on or below the diagonal of the last two axes are kept; entries
    above it are replaced by -1e9. The mask is built as [1, 1, n, n] with n
    taken from the last axis of `w` (assumes the last two axes of `w` are
    both of size n — TODO confirm).
    """
    seq_len = shape_list(w)[-1]
    lower_triangle = tf.matrix_band_part(tf.ones([seq_len, seq_len]), -1, 0)
    keep = tf.reshape(lower_triangle, [1, 1, seq_len, seq_len])
    return w*keep + -1e9*(1-keep)
Esempio n. 16
0
def _constrain_prob_mat(prob_mat, max_answer_size):
  """Zeroes entries of prob_mat outside start <= end < start + max_answer_size.

  Args:
    prob_mat: tensor of shape [batch, doc_len, doc_len].
    max_answer_size: maximum answer length; capped at doc_len.

  Returns:
    `prob_mat` restricted to the band made of the main diagonal and the
    `min(doc_len, max_answer_size) - 1` diagonals above it.
  """
  doc_len = tf.shape(prob_mat)[1]
  capped_answer_size = tf.minimum(doc_len, max_answer_size)
  num_upper = tf.cast(capped_answer_size, tf.int64) - 1
  return tf.matrix_band_part(prob_mat, 0, num_upper)