Example #1
import tensorflow as tf


def mask(inputs, key_masks=None, type=None):
    '''Masks paddings on keys or queries to inputs
    inputs: 3d tensor. (h*N, T_q, T_k)
    key_masks: 2d tensor. (N, T_k)
    type: string. 'key' | 'future'

    e.g.,
    >> inputs = tf.zeros([4, 2, 3], dtype=tf.float32)
    >> key_masks = tf.constant([[0., 0., 1.],
                                [0., 1., 1.]])
    >> mask(inputs, key_masks=key_masks, type='key')
    array([[[ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09],
        [ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09]],

       [[ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09],
        [ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09]],

       [[ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09],
        [ 0.0000000e+00,  0.0000000e+00, -4.2949673e+09]],

       [[ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09],
        [ 0.0000000e+00, -4.2949673e+09, -4.2949673e+09]]], dtype=float32)
    '''
    padding_num = -2 ** 32 + 1  # ~ -4.29e9 in float32; acts as -inf for softmax
    if type in ('k', 'key', 'keys'):
        key_masks = tf.to_float(key_masks)
        key_masks = tf.tile(
            key_masks,
            [tf.shape(inputs)[0] // tf.shape(key_masks)[0], 1]) # (h*N, seqlen)
        key_masks = tf.expand_dims(key_masks, 1)  # (h*N, 1, seqlen)
        outputs = inputs + key_masks * padding_num
    # elif type in ('q', 'query', 'queries'):
    #     # Generate masks
    #     masks = tf.sign(tf.reduce_sum(tf.abs(queries), axis=-1))  # (N, T_q)
    #     masks = tf.expand_dims(masks, -1)  # (N, T_q, 1)
    #     masks = tf.tile(masks, [1, 1, tf.shape(keys)[1]])  # (N, T_q, T_k)
    #
    #     # Apply masks to inputs
    #     outputs = inputs*masks
    elif type in ('f', 'future', 'right'):
        diag_vals = tf.ones_like(inputs[0, :, :])  # (T_q, T_k)
        tril = tf.linalg.LinearOperatorLowerTriangular(
            diag_vals).to_dense()  # (T_q, T_k)
        future_masks = tf.tile(
            tf.expand_dims(tril, 0),
            [tf.shape(inputs)[0], 1, 1])  # (N, T_q, T_k)

        paddings = tf.ones_like(future_masks) * padding_num
        outputs = tf.where(tf.equal(future_masks, 0), paddings, inputs)
    else:
        raise ValueError('Check if you entered type correctly!')

    return outputs
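
A minimal sketch exercising the 'future' (causal) branch of mask above, assuming a TF1 graph-mode runtime (the function uses TF1 APIs such as tf.to_float); shapes follow the docstring:

# Hypothetical driver, not part of the original snippet.
inputs = tf.ones([1, 3, 3], dtype=tf.float32)  # (h*N, T_q, T_k)
causal = mask(inputs, type='future')

with tf.Session() as sess:
    print(sess.run(causal))
# Row i keeps columns j <= i; columns j > i become ~ -4.29e9, so a
# subsequent softmax over the last axis assigns them ~zero weight.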
Example #2
    def __init__(self,
                 is_training,
                 input_tensor,
                 is_supervised,
                 is_expanded,
                 label_ids,
                 label_size=2,
                 sample_weight=None,
                 scope='cls/seq_relationship',
                 hidden_dropout_prob=0.1,
                 initializer_range=0.02,
                 trainable=True,
                 global_step=None,
                 num_train_steps=None,
                 uda_softmax_temp=-1,
                 uda_confidence_thresh=-1,
                 tsa_schedule='linear',
                 **kwargs):
        super().__init__(**kwargs)

        is_supervised = tf.cast(is_supervised, tf.float32)
        is_expanded = tf.cast(is_expanded, tf.float32)

        # Float casts are kept for arithmetic such as `1.0 - is_supervised`;
        # `tf.boolean_mask` needs boolean masks, so build them once here.
        sup_mask = tf.cast(is_supervised, tf.bool)
        unsup_mask = tf.cast(1.0 - is_supervised, tf.bool)
        ori_mask = tf.cast(1.0 - is_expanded, tf.bool)
        aug_mask = tf.cast(is_expanded, tf.bool)

        hidden_size = input_tensor.shape.as_list()[-1]
        with tf.variable_scope(scope):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable('output_bias',
                                          shape=[label_size],
                                          initializer=tf.zeros_initializer(),
                                          trainable=trainable)

            output_layer = util.dropout(
                input_tensor, hidden_dropout_prob if is_training else 0.0)
            logits = tf.matmul(output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            with tf.variable_scope('sup_loss'):

                # keep original (non-augmented) rows, then the supervised subset
                sup_ori_log_probs = tf.boolean_mask(log_probs,
                                                    mask=ori_mask,
                                                    axis=0)
                sup_log_probs = tf.boolean_mask(sup_ori_log_probs,
                                                mask=sup_mask,
                                                axis=0)
                sup_label_ids = tf.boolean_mask(label_ids,
                                                mask=sup_mask,
                                                axis=0)

                self.preds['preds'] = tf.argmax(sup_ori_log_probs, axis=-1)

                one_hot_labels = tf.one_hot(sup_label_ids,
                                            depth=label_size,
                                            dtype=tf.float32)
                per_example_loss = -tf.reduce_sum(
                    one_hot_labels * sup_log_probs, axis=-1)

                loss_mask = tf.ones_like(per_example_loss, dtype=tf.float32)
                correct_label_probs = tf.reduce_sum(one_hot_labels *
                                                    tf.exp(sup_log_probs),
                                                    axis=-1)

                if is_training and tsa_schedule:
                    tsa_start = 1.0 / label_size
                    tsa_threshold = get_tsa_threshold(tsa_schedule,
                                                      global_step,
                                                      num_train_steps,
                                                      tsa_start,
                                                      end=1)

                    larger_than_threshold = tf.greater(correct_label_probs,
                                                       tsa_threshold)
                    loss_mask = loss_mask * (
                        1 - tf.cast(larger_than_threshold, tf.float32))

                loss_mask = tf.stop_gradient(loss_mask)
                per_example_loss = per_example_loss * loss_mask
                if sample_weight is not None:
                    sup_sample_weight = tf.boolean_mask(sample_weight,
                                                        mask=sup_mask,
                                                        axis=0)
                    per_example_loss *= tf.cast(sup_sample_weight,
                                                dtype=tf.float32)
                sup_loss = (tf.reduce_sum(per_example_loss) /
                            tf.maximum(tf.reduce_sum(loss_mask), 1))

                self.losses['supervised'] = per_example_loss

            with tf.variable_scope('unsup_loss'):

                # split log-probs and logits into original vs. augmented views
                ori_log_probs = tf.boolean_mask(sup_ori_log_probs,
                                                mask=unsup_mask,
                                                axis=0)
                aug_log_probs = tf.boolean_mask(log_probs,
                                                mask=aug_mask,
                                                axis=0)
                sup_ori_logits = tf.boolean_mask(logits,
                                                 mask=ori_mask,
                                                 axis=0)
                ori_logits = tf.boolean_mask(sup_ori_logits,
                                             mask=unsup_mask,
                                             axis=0)

                unsup_loss_mask = 1
                if uda_softmax_temp != -1:
                    tgt_ori_log_probs = tf.nn.log_softmax(ori_logits /
                                                          uda_softmax_temp,
                                                          axis=-1)
                    tgt_ori_log_probs = tf.stop_gradient(tgt_ori_log_probs)
                else:
                    tgt_ori_log_probs = tf.stop_gradient(ori_log_probs)

                if uda_confidence_thresh != -1:
                    largest_prob = tf.reduce_max(tf.exp(ori_log_probs),
                                                 axis=-1)
                    unsup_loss_mask = tf.cast(
                        tf.greater(largest_prob, uda_confidence_thresh),
                        tf.float32)
                    unsup_loss_mask = tf.stop_gradient(unsup_loss_mask)

                per_example_loss = kl_for_log_probs(
                    tgt_ori_log_probs, aug_log_probs) * unsup_loss_mask
                if sample_weight is not None:
                    unsup_sample_weight = tf.boolean_mask(sample_weight,
                                                          mask=unsup_mask,
                                                          axis=0)
                    per_example_loss *= tf.cast(unsup_sample_weight,
                                                dtype=tf.float32)
                unsup_loss = tf.reduce_mean(per_example_loss)

                self.losses['unsupervised'] = per_example_loss

            self.total_loss = sup_loss + unsup_loss
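
Neither get_tsa_threshold nor kl_for_log_probs is defined in this snippet. Below is a sketch of the KL term consistent with how it is called above (both arguments are log-probabilities over the last axis); it mirrors the published UDA reference code, but treat it as an assumption rather than the library's actual implementation:

def kl_for_log_probs(log_p, log_q):
    # KL(p || q) = sum_i p_i * (log p_i - log q_i), computed directly
    # from log-probabilities. Sketch of the helper referenced above.
    p = tf.exp(log_p)
    neg_ent = tf.reduce_sum(p * log_p, axis=-1)
    neg_cross_ent = tf.reduce_sum(p * log_q, axis=-1)
    return neg_ent - neg_cross_ent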
Example #3
    def dynamic_transformer_model(self,
                                  is_training,
                                  input_tensor,
                                  input_mask,
                                  batch_size,
                                  max_seq_length,
                                  label_size,
                                  attention_mask=None,
                                  hidden_size=768,
                                  num_hidden_layers=12,
                                  num_attention_heads=12,
                                  intermediate_size=3072,
                                  intermediate_act_fn=util.gelu,
                                  hidden_dropout_prob=0.1,
                                  attention_probs_dropout_prob=0.1,
                                  initializer_range=0.02,
                                  dtype=tf.float32,
                                  cls_model='self-attention',
                                  cls_hidden_size=128,
                                  cls_num_attention_heads=2,
                                  speed=0.1,
                                  ignore_cls=None):
        if ignore_cls is None:
            ignore_cls = []
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                'The hidden size (%d) is not a multiple of the number of '
                'attention heads (%d)' % (hidden_size, num_attention_heads))
        attention_head_size = int(hidden_size / num_attention_heads)

        keep_cls = list(range(num_hidden_layers + 1))
        keep_cls = [
            cls_idx for cls_idx in keep_cls if cls_idx not in ignore_cls
        ]

        all_layer_outputs = []
        all_layer_cls_outputs = collections.OrderedDict()
        prev_output = input_tensor
        prev_mask = input_mask
        for layer_idx in range(num_hidden_layers):
            with tf.variable_scope('layer_%d' % layer_idx):

                # build child classifier
                if is_training or layer_idx not in ignore_cls:
                    with tf.variable_scope('distill'):

                        # FCN + Self_Attention + FCN + FCN
                        if cls_model == 'self-attention-paper':
                            cls_output = self._cls_self_attention_paper(
                                prev_output,
                                batch_size,
                                max_seq_length,
                                label_size,
                                attention_mask=attention_mask,
                                cls_hidden_size=cls_hidden_size,
                                cls_num_attention_heads=\
                                    cls_num_attention_heads,
                                attention_probs_dropout_prob=\
                                    attention_probs_dropout_prob,
                                initializer_range=initializer_range,
                                dtype=tf.float32,
                                trainable=True)

                        # Self_Attention + FCN
                        elif cls_model == 'self-attention':
                            cls_output = self._cls_self_attention(
                                prev_output,
                                batch_size,
                                max_seq_length,
                                label_size,
                                attention_mask=attention_mask,
                                cls_hidden_size=cls_hidden_size,
                                cls_num_attention_heads=\
                                    cls_num_attention_heads,
                                attention_probs_dropout_prob=\
                                    attention_probs_dropout_prob,
                                initializer_range=initializer_range,
                                dtype=tf.float32,
                                trainable=True)

                        # FCN
                        elif cls_model == 'fcn':
                            cls_output = self._cls_fcn(
                                prev_output,
                                label_size,
                                hidden_size=hidden_size,
                                initializer_range=initializer_range,
                                dtype=tf.float32,
                                trainable=True)

                        else:
                            raise ValueError(
                                'Invalid `cls_model = %s`. Pick one from '
                                '`self-attention-paper`, `self-attention` '
                                'and `fcn`' % cls_model)

                        # distill core
                        layer_cls_output = tf.nn.softmax(cls_output,
                                                         axis=-1,
                                                         name='cls_%d' %
                                                         layer_idx)
                        # normalized entropy: sum(p * log p) and log(1 / K) are
                        # both negative, so this lands in [0, 1] (0 = confident)
                        uncertainty = tf.reduce_sum(layer_cls_output *
                                                    tf.log(layer_cls_output),
                                                    axis=-1)
                        uncertainty /= tf.log(1 / label_size)

                    # branching only in inference
                    if not is_training:

                        # last output
                        if layer_idx == keep_cls[-1]:
                            all_layer_outputs.append(prev_output)
                            all_layer_cls_outputs[layer_idx] = layer_cls_output
                            return (all_layer_outputs, all_layer_cls_outputs)

                        mask = tf.less(uncertainty, speed)
                        # rows whose uncertainty is still above the threshold
                        # continue; `tf.boolean_mask` expects a boolean mask
                        unfinished_mask = tf.logical_not(mask)
                        prev_output = tf.boolean_mask(prev_output,
                                                      mask=unfinished_mask,
                                                      axis=0)
                        prev_mask = tf.boolean_mask(prev_mask,
                                                    mask=unfinished_mask,
                                                    axis=0)
                    all_layer_cls_outputs[layer_idx] = layer_cls_output

                    # new attention mask
                    input_shape = util.get_shape_list(prev_output)
                    batch_size = input_shape[0]
                    max_seq_length = input_shape[1]
                    attention_mask = \
                        self.create_attention_mask_from_input_mask(
                            prev_mask, batch_size, max_seq_length, dtype=dtype)

                # original stream
                with tf.variable_scope('attention'):
                    attention_heads = []
                    with tf.variable_scope('self'):
                        (attention_head, _) = self.attention_layer(
                            from_tensor=prev_output,
                            to_tensor=prev_output,
                            attention_mask=attention_mask,
                            num_attention_heads=num_attention_heads,
                            size_per_head=attention_head_size,
                            attention_probs_dropout_prob=\
                                attention_probs_dropout_prob,
                            initializer_range=initializer_range,
                            do_return_2d_tensor=False,
                            batch_size=batch_size,
                            from_max_seq_length=max_seq_length,
                            to_max_seq_length=max_seq_length,
                            dtype=dtype,
                            trainable=False)
                        attention_heads.append(attention_head)

                    attention_output = None
                    if len(attention_heads) == 1:
                        attention_output = attention_heads[0]
                    else:
                        attention_output = tf.concat(attention_heads, axis=-1)

                    with tf.variable_scope('output'):
                        attention_output = tf.layers.dense(
                            attention_output,
                            hidden_size,
                            kernel_initializer=util.create_initializer(
                                initializer_range),
                            trainable=False)
                        attention_output = util.dropout(
                            attention_output, hidden_dropout_prob)
                        attention_output = util.layer_norm(attention_output +
                                                           prev_output,
                                                           trainable=False)

                # The activation is only applied to the `intermediate`
                # hidden layer.
                with tf.variable_scope('intermediate'):
                    intermediate_output = tf.layers.dense(
                        attention_output,
                        intermediate_size,
                        activation=intermediate_act_fn,
                        kernel_initializer=util.create_initializer(
                            initializer_range),
                        trainable=False)

                # Down-project back to hidden_size then add the residual.
                with tf.variable_scope('output'):
                    layer_output = tf.layers.dense(
                        intermediate_output,
                        hidden_size,
                        kernel_initializer=util.create_initializer(
                            initializer_range),
                        trainable=False)
                    layer_output = util.dropout(layer_output,
                                                hidden_dropout_prob)
                    layer_output = util.layer_norm(layer_output +
                                                   attention_output,
                                                   trainable=False)

                prev_output = layer_output
                all_layer_outputs.append(layer_output)

        return (all_layer_outputs, all_layer_cls_outputs)
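
The early-exit test above compares a normalized entropy against speed. A standalone sketch of that criterion with made-up probabilities (NumPy, illustration only):

import numpy as np

def exit_uncertainty(probs):
    # Mirrors `uncertainty` above: sum(p * log p) / log(1 / K) maps
    # confident distributions near 0 and near-uniform ones near 1.
    probs = np.asarray(probs, dtype=np.float64)
    return np.sum(probs * np.log(probs)) / np.log(1.0 / probs.shape[-1])

print(exit_uncertainty([0.98, 0.01, 0.01]))  # ~0.10 -> exits once speed > 0.10
print(exit_uncertainty([0.40, 0.30, 0.30]))  # ~0.99 -> continues to next layer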