def transformer(self, num_blocks=2, num_heads=5, mid_layer='feed_forward'):
    word_embs = glove(self.words, self.params['words'], self.params['glove'])
    char_embs = get_char_representations(
        self.chars, self.nchars, self.params['chars'],
        mode='lstm', training=self.training
    )
    html_embs = get_soft_html_representations(
        self.html, self.params['html_tags'],
        self.css_chars, self.css_lengths, self.params['chars'],
        training=self.training
    )
    embs = tf.concat([word_embs, char_embs, html_embs], axis=-1)
    # embs = word_embs
    # embs += pos_embeddings(embs, 1000)
    x = self.dropout(embs)

    for i in range(num_blocks):
        output = multihead_attention(
            queries=x, keys=x, values=x,
            num_heads=num_heads, dropout_rate=0.5,
            training=self.training, causality=False
        )
        if mid_layer == 'feed_forward':
            output = tf.layers.dense(output, 450, activation=tf.nn.relu)
            output = tf.layers.dense(output, 450)
            # Residual connection
            output += x
            # Normalize
            x = normalize(output)
        elif mid_layer == 'lstm':
            # A bidirectional LSTM outputs a tensor whose last dimension is twice
            # the hidden size, so halve it (integer division) to keep the residual
            # shapes compatible.
            x = self.lstm(x, x.shape[2].value // 2,
                          var_scope='transformer_' + str(i)) + x
    return x
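# The residual block above calls a `normalize` helper that is not shown here.
# A minimal sketch, assuming it performs standard layer normalization over the
# last dimension (the real helper may differ):
import tensorflow as tf

def normalize(inputs, epsilon=1e-8, scope="ln", reuse=None):
    """Layer-normalize the last dimension of `inputs` (assumed behaviour)."""
    with tf.variable_scope(scope, reuse=reuse):
        params_shape = inputs.get_shape()[-1:]
        # Per-position mean/variance over the feature dimension
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / tf.sqrt(variance + epsilon)
        return gamma * normalized + beta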
def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
             pos_vocab_size, pos_embedding_size, hidden_size, num_heads,
             attention_size, use_elmo=False, l2_reg_lambda=0.0):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
    self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
    self.input_text = tf.placeholder(tf.string, shape=[None], name='input_text')
    self.input_e1 = tf.placeholder(tf.int32, shape=[None], name='input_e1')
    self.input_e2 = tf.placeholder(tf.int32, shape=[None], name='input_e2')
    self.input_p1 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p1')
    self.input_p2 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p2')
    self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
    self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    if use_elmo:
        # Contextual Embedding Layer
        with tf.variable_scope("elmo-embeddings"):
            elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
            self.embedded_chars = elmo_model(self.input_text, signature="default",
                                             as_dict=True)["elmo"]
    else:
        # Word Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
            self.W_text = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25),
                name="W_text")
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_x)

    # Position Embedding Layer
    with tf.device('/cpu:0'), tf.variable_scope("position-embeddings"):
        self.W_pos = tf.get_variable("W_pos", [pos_vocab_size, pos_embedding_size],
                                     initializer=initializer())
        self.p1 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p1)[:, :tf.shape(self.embedded_chars)[1]]
        self.p2 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p2)[:, :tf.shape(self.embedded_chars)[1]]

    # Dropout for Word Embedding
    with tf.variable_scope('dropout-embeddings'):
        self.embedded_chars = tf.nn.dropout(self.embedded_chars, self.emb_dropout_keep_prob)

    # Self-Attention
    with tf.variable_scope("self-attention"):
        self.self_attn, self.self_alphas = multihead_attention(
            self.embedded_chars, self.embedded_chars,
            num_units=embedding_size, num_heads=num_heads)

    # Bidirectional LSTM
    with tf.variable_scope("bi-lstm"):
        _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell, self.rnn_dropout_keep_prob)
        _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell, self.rnn_dropout_keep_prob)
        self.rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=fw_cell, cell_bw=bw_cell,
            inputs=self.self_attn,
            sequence_length=self._length(self.input_x),
            dtype=tf.float32)
        self.rnn_outputs = tf.concat(self.rnn_outputs, axis=-1)

    # Attention
    with tf.variable_scope('attention'):
        self.attn, self.alphas, self.e1_alphas, self.e2_alphas = attention(
            self.rnn_outputs, self.input_e1, self.input_e2,
            self.p1, self.p2, attention_size=attention_size)

    # Dropout
    with tf.variable_scope('dropout'):
        self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

    # Fully connected layer
    with tf.variable_scope('output'):
        self.logits = tf.layers.dense(self.h_drop, num_classes,
                                      kernel_initializer=initializer())
        self.predictions = tf.argmax(self.logits, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.variable_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits, labels=self.input_y)
        self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2

    # Accuracy
    with tf.variable_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32),
                                       name="accuracy")
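# Both models pass `self._length(self.input_x)` as `sequence_length` to
# `tf.nn.bidirectional_dynamic_rnn`, but the helper itself is not shown.
# A minimal sketch, assuming token id 0 is the padding id (that convention is an
# assumption, not confirmed by the source):
@staticmethod
def _length(seq):
    # 1 for every non-padding token, 0 for padding, summed along the time axis
    relevant = tf.sign(tf.abs(seq))
    length = tf.reduce_sum(relevant, axis=1)
    return tf.cast(length, tf.int32)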
def __init__(self, sequence_length, rw_length, num_classes, vocab_size,
             rw_vocab_size, rw_pos_vocab_size, embedding_size, pos_vocab_size,
             pos_embedding_size, hidden_size, num_heads, attention_size,
             use_elmo=False, l2_reg_lambda=0.0):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
    self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
    self.input_text = tf.placeholder(tf.string, shape=[None], name='input_text')
    self.input_e1 = tf.placeholder(tf.int32, shape=[None], name='input_e1')
    self.input_e2 = tf.placeholder(tf.int32, shape=[None], name='input_e2')
    self.input_p1 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p1')
    self.input_p2 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p2')

    # Related-word (rw) placeholders
    self.input_rw_x = tf.placeholder(tf.int32, shape=[None, rw_length], name='input_rw_x')
    self.input_rw_text = tf.placeholder(tf.string, shape=[None], name='input_rw_text')
    self.input_rw_pos_x = tf.placeholder(tf.int32, shape=[None, rw_length], name='input_rw_pos_x')
    self.input_rw_pos_text = tf.placeholder(tf.string, shape=[None], name='input_rw_pos_text')
    self.input_rw_cate = tf.placeholder(tf.float32, shape=[None, 11], name='input_rw_cate')

    self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
    self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    if use_elmo:
        # Contextual Embedding Layer
        with tf.variable_scope("elmo-embeddings"):
            elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
            self.embedded_chars = elmo_model(self.input_text, signature="default",
                                             as_dict=True)["elmo"]
            self.rw_embedding = elmo_model(self.input_rw_text, signature="default",
                                           as_dict=True)["elmo"]
            self.rw_pos_embedding = elmo_model(self.input_rw_pos_text, signature="default",
                                               as_dict=True)["elmo"]
    else:
        # Word Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
            self.W_text = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25),
                name="W_text")
            self.W_rw_text = tf.Variable(
                tf.random_uniform([rw_vocab_size, embedding_size], -0.25, 0.25),
                name="W_rw_text")
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_x)
            self.rw_embedding = tf.nn.embedding_lookup(self.W_rw_text, self.input_rw_x)

    # Position Embedding Layer
    with tf.device('/cpu:0'), tf.variable_scope("position-embeddings"):
        self.W_pos = tf.get_variable("W_pos", [pos_vocab_size, pos_embedding_size],
                                     initializer=initializer())
        self.p1 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p1)[:, :tf.shape(self.embedded_chars)[1]]
        self.p2 = tf.nn.embedding_lookup(
            self.W_pos, self.input_p2)[:, :tf.shape(self.embedded_chars)[1]]
        self.W_rw_pos_text = tf.get_variable(
            "W_rw_pos_text", [rw_pos_vocab_size, embedding_size],
            initializer=initializer())
        self.rw_pos_embedding = tf.nn.embedding_lookup(
            self.W_rw_pos_text, self.input_rw_pos_x)

    # Dropout for Word Embedding
    with tf.variable_scope('dropout-embeddings'):
        self.embedded_chars = tf.nn.dropout(self.embedded_chars, self.emb_dropout_keep_prob)
        self.rw_embedding = tf.nn.dropout(self.rw_embedding, self.emb_dropout_keep_prob)
        self.rw_pos_embedding = tf.nn.dropout(self.rw_pos_embedding, self.emb_dropout_keep_prob)

    # Self-Attention
    with tf.variable_scope("self-attention"):
        self.self_attn, self.self_alphas = multihead_attention(
            self.embedded_chars, self.embedded_chars,
            num_units=embedding_size, num_heads=num_heads)
        self.rw_pos_self_attn, self.rw_pos_self_alpha = multihead_attention2(
            self.rw_embedding, self.embedded_chars,
            num_units=embedding_size, num_heads=num_heads)

    # Bidirectional LSTM
    with tf.variable_scope("bi-lstm"):
        _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell, self.rnn_dropout_keep_prob)
        _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell, self.rnn_dropout_keep_prob)
        self.rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=fw_cell, cell_bw=bw_cell,
            inputs=self.self_attn,
            sequence_length=self._length(self.input_x),
            dtype=tf.float32)
        self.rnn_outputs = tf.concat(self.rnn_outputs, axis=-1)

    # Multi-scale CNN over the related-word attention output
    with tf.variable_scope("rw_multi-scale-cnn"):
        self.self_attn2 = tf.reshape(
            self.rw_pos_self_attn,
            [-1, self.rw_pos_self_attn.shape[1], self.rw_pos_self_attn.shape[2], 1])
        conv1 = tf.layers.conv2d(inputs=self.self_attn2, filters=50,
                                 kernel_size=[1, self.self_attn2.shape[2]],
                                 padding="valid", activation=tf.nn.relu)
        pool1 = tf.keras.layers.GlobalMaxPooling2D()(conv1)
        conv2 = tf.layers.conv2d(inputs=self.self_attn2, filters=50,
                                 kernel_size=[2, self.self_attn2.shape[2]],
                                 padding="valid", activation=tf.nn.relu)
        pool2 = tf.keras.layers.GlobalMaxPooling2D()(conv2)
        conv3 = tf.layers.conv2d(inputs=self.self_attn2, filters=50,
                                 kernel_size=[3, self.self_attn2.shape[2]],
                                 padding="valid", activation=tf.nn.relu)
        pool3 = tf.keras.layers.GlobalMaxPooling2D()(conv3)
        conv4 = tf.layers.conv2d(inputs=self.self_attn2, filters=50,
                                 kernel_size=[4, self.self_attn2.shape[2]],
                                 padding="valid", activation=tf.nn.relu)
        pool4 = tf.keras.layers.GlobalMaxPooling2D()(conv4)
        self.rw_conv = tf.concat([pool1, pool2, pool3, pool4], axis=-1)

    # Attention
    with tf.variable_scope('attention1'):
        self.attn1, self.alphas, self.trans = attention1(
            self.rnn_outputs, self.input_e1, self.input_e2,
            self.p1, self.p2, attention_size=attention_size)

    # Dropout
    with tf.variable_scope('dropout'):
        # c = tf.concat([self.conv, self.rw_conv], axis=-1)
        self.h_drop1 = tf.nn.dropout(self.attn1, self.dropout_keep_prob)
        self.h_drop2 = tf.nn.dropout(self.rw_conv, self.dropout_keep_prob)

    # Fully connected layer
    with tf.variable_scope('output'):
        self.logits = tf.layers.dense(self.h_drop1, num_classes,
                                      kernel_initializer=initializer())
        self.logits2 = tf.layers.dense(self.h_drop2, num_classes,
                                       kernel_initializer=initializer())
        self.l = tf.add(self.logits, self.logits2)
        self.dir = tf.layers.dense(self.trans, 3, kernel_initializer=initializer())
        self.predictions = tf.argmax(self.l, 1, name="predictions")

    # Calculate mean cross-entropy loss
    with tf.variable_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits, labels=self.input_y)
        losses2 = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits2, labels=self.input_y)
        self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2 + tf.reduce_mean(losses2)

    # Accuracy
    with tf.variable_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32),
                                       name="accuracy")
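# Because the class above builds a static TF 1.x graph, training runs through a
# session and a feed dict. A hypothetical single training step is sketched below;
# the class name `RelationModel`, the optimizer, the hyperparameter values and the
# random dummy batches are all illustrative assumptions, not part of the source.
import numpy as np
import tensorflow as tf

batch, seq_len, rw_len, n_cls = 8, 90, 30, 19
model = RelationModel(sequence_length=seq_len, rw_length=rw_len, num_classes=n_cls,
                      vocab_size=20000, rw_vocab_size=5000, rw_pos_vocab_size=50,
                      embedding_size=300, pos_vocab_size=200, pos_embedding_size=50,
                      hidden_size=300, num_heads=6, attention_size=50)
train_op = tf.train.AdamOptimizer(1e-3).minimize(model.loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {
        model.input_x: np.random.randint(1, 20000, (batch, seq_len)),
        model.input_y: np.eye(n_cls)[np.random.randint(0, n_cls, batch)],
        model.input_p1: np.random.randint(0, 200, (batch, seq_len)),
        model.input_p2: np.random.randint(0, 200, (batch, seq_len)),
        model.input_e1: np.random.randint(0, seq_len, batch),
        model.input_e2: np.random.randint(0, seq_len, batch),
        model.input_rw_x: np.random.randint(1, 5000, (batch, rw_len)),
        model.input_rw_pos_x: np.random.randint(1, 50, (batch, rw_len)),
        # Keep probabilities: below 1.0 during training, 1.0 at test time
        model.emb_dropout_keep_prob: 0.7,
        model.rnn_dropout_keep_prob: 0.7,
        model.dropout_keep_prob: 0.5,
    }
    _, loss_val, acc_val = sess.run([train_op, model.loss, model.accuracy],
                                    feed_dict=feed)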
def model_fn(features, labels, mode, params):
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    x = features
    unchanged_labels = labels  # non-smoothed labels, kept for the F1 metric
    if params.label_smooth and labels is not None:
        labels = tf.cast(labels, tf.float32)
        labels = label_smoothing(labels, epsilon=params.epsilon)

    # Build embedding vectors
    vector = word_embedding(x, params.vector_path, scale=False)

    # ! Reduce the fixed word-embedding dimension to the model dimension
    if params.hidden_size != vector.get_shape().as_list()[-1]:
        # The original paper uses a fully connected layer for this projection
        with tf.variable_scope("dimension_reduction"):
            vector = tf.layers.dense(
                vector, params.hidden_size, activation=None, use_bias=False,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0))

    # Scale the word embedding
    vector = vector * (params.hidden_size ** 0.5)

    # Add positional information to the word vectors
    vector += position_embedding(x, num_units=params.hidden_size, scale=False)

    # * Applying a dropout mask to the embedding vector may not be a good idea
    vector = tf.layers.dropout(vector, rate=params.dropout_rate,
                               training=tf.convert_to_tensor(is_training))

    # Transformer attention stacks
    for i in range(params.num_attention_stacks):
        with tf.variable_scope(f"num_attention_stacks_{i + 1}"):
            # Multi-head attention
            vector = multihead_attention(
                queries=vector,
                keys=vector,
                num_units=params.hidden_size,
                num_heads=params.num_heads,
                dropout_rate=params.dropout_rate,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0),
                is_training=is_training,
                causality=False)
            # Feed forward
            vector = feedforward(
                vector,
                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                num_units=[2 * params.hidden_size, params.hidden_size])

    attentions = vector
    # Add an innermost dimension to mimic a single (grayscale) channel
    # (N, attention_stacks*T, C, 1)
    attentions = tf.expand_dims(attentions, -1)

    # ************************************************************
    # Attention part complete; now the CNN feature-capture part
    # ************************************************************
    logits = []
    # One inception + max-pool classifier per category
    for topic in range(params.multi_categories):
        # cnn_features: (n, 1, 1, total_filter_num)
        cnn_features = inception(
            attentions,
            filter_size_list=params.filter_size_list,
            num_filters=params.num_filters,
            hidden_size=params.hidden_size,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0),
            scope=f"category_{topic + 1}_inception")
        total_feature_num = len(params.filter_size_list) * params.num_filters
        # cnn_features: (n, total_filter_num)
        cnn_features = tf.reshape(cnn_features, (-1, total_feature_num))
        # category_logits: (n, num_sentiment)
        category_logits = dense_logits(
            cnn_features,
            params.num_sentiment,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(1.0),
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            scope=f"category_{topic + 1}_logits",
            inner_dense_outshape=params.inner_dense_outshape,
            inner_dense_activation=tf.tanh,
            use_bias=True)
        # Append this category's logits to the list
        logits.append(category_logits)
    # logits: (n, multi_categories, num_sentiment)
    logits = tf.stack(logits, axis=1)

    # * Common part for train & eval
    if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
        gstep = tf.train.get_or_create_global_step()
        # loss: (n, multi_categories)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits)
        loss = tf.reduce_sum(loss, axis=1)   # (n,)
        loss = tf.reduce_mean(loss, axis=0)  # scalar
        if params.use_regularizer:
            loss_reg = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            loss += params.reg_const * loss_reg
        loss = tf.identity(loss, name="loss")

        # predictions = tf.nn.softmax(logits)
        predictions = tf.cast(
            tf.equal(tf.reduce_max(logits, axis=-1, keepdims=True), logits),
            tf.float32)
        avg_macro_f1, avg_macro_f1_update_op = average_macro_f1(
            labels=tf.cast(unchanged_labels, tf.float32),
            predictions=predictions)
        eval_metric_ops = {'avg_macro_f1': (avg_macro_f1, avg_macro_f1_update_op)}

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("f1", avg_macro_f1)
        summary_hook = tf.train.SummarySaverHook(
            save_steps=params.print_n_step,
            output_dir="./summary",
            summary_op=tf.summary.merge_all())
    else:
        loss = None
        eval_metric_ops = None

    # * Train-specific part
    if mode == tf.estimator.ModeKeys.TRAIN:
        learning_rate = tf.train.cosine_decay_restarts(
            learning_rate=params.learning_rate,
            global_step=gstep,
            first_decay_steps=params.first_decay_steps,
            t_mul=params.t_mul,
            m_mul=params.m_mul,
            alpha=params.alpha,
            name="learning_rate")
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=params.momentum)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(gradients, params.max_norm)
        train_op = optimizer.apply_gradients(zip(gradients, variables),
                                             global_step=gstep)
        # Add custom training logger
        custom_logger = _LoggerHook(loss, gstep, learning_rate, params.print_n_step)
    else:
        train_op = None

    # * Predict part
    if mode == tf.estimator.ModeKeys.PREDICT:
        # At prediction time logits has shape (multi_categories, num_sentiment);
        # pred: (multi_categories,)
        pred = tf.subtract(tf.argmax(logits, axis=-1), 2)
        predictions = {"classes": pred}
        export_outputs = {"classify": tf.estimator.export.PredictOutput(predictions)}
    else:
        predictions = None
        export_outputs = None

    training_hooks = [custom_logger, summary_hook] if is_training else None
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=eval_metric_ops,
                                      export_outputs=export_outputs,
                                      training_hooks=training_hooks)
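# A `model_fn` in this form plugs straight into the `tf.estimator` API. A minimal
# wiring sketch, assuming an HParams-style `params` object and `train_input_fn` /
# `eval_input_fn` callables defined elsewhere; every hyperparameter value below is
# illustrative, not taken from the project's real config.
import tensorflow as tf

params = tf.contrib.training.HParams(
    label_smooth=True, epsilon=0.1, vector_path="embeddings.npy",
    hidden_size=256, dropout_rate=0.1, num_attention_stacks=2, num_heads=8,
    multi_categories=20, num_sentiment=4, filter_size_list=[1, 2, 3, 4],
    num_filters=64, inner_dense_outshape=128, use_regularizer=True, reg_const=1e-4,
    learning_rate=0.01, first_decay_steps=1000, t_mul=2.0, m_mul=0.9, alpha=1e-4,
    momentum=0.9, max_norm=5.0, print_n_step=100)

estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir="./model_dir",
                                   params=params)
estimator.train(input_fn=train_input_fn, steps=10000)
print(estimator.evaluate(input_fn=eval_input_fn))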