Esempio n. 1
0
 def total_parameters_transformer(share_parameter_across_layers):
     """Build a transformer on an all-zero input and report the parameter count.

     The input shape and head count come from the enclosing scope
     (`batch_size`, `sequence_length`, `hidden_size`, `num_attention_heads`).

     Args:
         share_parameter_across_layers: forwarded unchanged to
             `transformer_model`.

     Returns:
         Whatever `get_total_parameters()` reports after the model is built.
     """
     input_tensor = tf.zeros((batch_size, sequence_length, hidden_size),
                             dtype=tf.float32)
     print("transformer_model. input:", input_tensor)
     transformer_result = transformer_model(
         input_tensor,
         hidden_size=hidden_size,
         num_attention_heads=num_attention_heads,
         share_parameter_across_layers=share_parameter_across_layers)
     print("transformer_result:", transformer_result)
     total_parameters = get_total_parameters()
     # The label used to read "not share" unconditionally, which mislabelled
     # the shared-parameter run; derive it from the flag instead.
     sharing_label = 'share' if share_parameter_across_layers else 'not share'
     print('total_parameters(%s):' % sharing_label, total_parameters)
     return total_parameters
Esempio n. 2
0
    def build_encoder(self, features):
        """Embed the entity sequence and run it through the encoder stack.

        Args:
            features: mapping that must provide 'sequence' (entity type ids)
                and 'attention_mask' (decoded here by `dec_to_bin_att_mask`).

        Returns:
            A `(sequence_output, cls_vector, attention_weights)` tuple.
            `sequence_output` and `attention_weights` come from the last
            encoder layer; `cls_vector` is position 0 of `sequence_output`
            with the sequence axis kept.
        """
        hp = self.hparams

        with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
            entity_ids = features['sequence']  # [batch, seq_len=128]
            # Entity types: Point, Line, Segment, Halfplane, etc.
            entity_embeddings, _ = modeling.embedding_lookup(
                input_ids=entity_ids,
                vocab_size=hp.entity_num_type,
                embedding_size=hp.hidden_size,
                initializer_range=hp.initializer_range,
                word_embedding_name='entity_type_embedding',
            )  # [batch, seq_len, hid_size]

            # Add a "type" marking which objects in the sequence belong to
            # the problem state and which one is the goal object.
            encoder_input = modeling.embedding_postprocessor(
                input_tensor=entity_embeddings,
                sequence_ids=entity_ids,
                hparams=hp)  # [batch, seq_len, hid_size]

        # Feed the embedded sequence to the encoder transformer together
        # with its attention mask.
        with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
            # [batch, seq_len, seq_len]
            binary_mask = dec_to_bin_att_mask(features['attention_mask'])
            activation_fn = modeling.get_activation(hp.hidden_act)
            encoder_layers = modeling.transformer_model(
                input_tensor=encoder_input,
                attention_mask=binary_mask,
                hidden_size=hp.hidden_size,
                num_hidden_layers=hp.num_encode_layers,
                num_attention_heads=hp.num_attention_heads,
                intermediate_size=hp.intermediate_size,
                intermediate_act_fn=activation_fn,
                hidden_dropout_prob=hp.dropout_prob,
                attention_probs_dropout_prob=hp.dropout_prob,
                initializer_range=hp.initializer_range,
                do_return_all_layers=True,
                attention_top_k=hp.attention_top_k,
                densify_attention_mask=hp.densify_attention_mask)

        # Each layer entry is an (output, attention weights) pair; keep the
        # last one.
        last_output, attention_weights = encoder_layers[-1]
        first_position = last_output[:, 0:1, :]  # [batch, 1, hid_size]

        return last_output, first_position, attention_weights
Esempio n. 3
0
def _encode_with_transformer(inputs, lengths):
    """Run one BERT-style transformer stack over `inputs`, masking padding.

    Variables are created in whatever variable scope is active at call time.

    Args:
        inputs: embedded sequence tensor; axis 1 is taken to be the padded
            sequence axis.
        lengths: per-example valid lengths passed to `tf.sequence_mask`.

    Returns:
        The output of the last transformer layer.
    """
    # Pin the mask width to the padded width of `inputs`. Without `maxlen`
    # the mask is only as wide as the longest length in the batch and no
    # longer lines up with the attention scores when every example is padded.
    padding_mask = tf.cast(
        tf.sequence_mask(lengths, maxlen=tf.shape(inputs)[1]), tf.float32)
    attention_mask = create_attention_mask_from_input_mask(
        from_tensor=inputs, to_mask=padding_mask)
    all_layers = transformer_model(input_tensor=inputs,
                                   attention_mask=attention_mask,
                                   hidden_size=64,
                                   num_hidden_layers=4,
                                   num_attention_heads=2,
                                   intermediate_size=128,
                                   intermediate_act_fn=gelu,
                                   hidden_dropout_prob=0.1,
                                   attention_probs_dropout_prob=0.1,
                                   initializer_range=0.02,
                                   do_return_all_layers=True)
    return all_layers[-1]


def transformer(input_q, input_d, len_q, len_d):
    """Encode the query and the document with separate transformer stacks.

    Uses the transformer code from Google BERT. The two encoders have the
    same hyper-parameters but live in different variable scopes ("embed_q"
    and "embed_d"), so they do not share weights.

    Args:
        input_q: embedded query tensor.
        input_d: embedded document tensor.
        len_q: valid lengths of the queries.
        len_d: valid lengths of the documents.

    Returns:
        A `(embed_q, embed_d)` tuple with the last-layer outputs of the two
        encoders.
    """
    with tf.variable_scope("embed_q"):
        embed_q = _encode_with_transformer(input_q, len_q)

    with tf.variable_scope("embed_d"):
        embed_d = _encode_with_transformer(input_d, len_d)

    return embed_q, embed_d
    def __init__(self, config, input_embedding, attention_mask):
        """Run the transformer encoder over already-embedded inputs.

        Args:
            config: object exposing the encoder hyper-parameters read below
                (`hidden_size`, `num_hidden_layers`, `hidden_act`, ...).
            input_embedding: embedded input passed as the encoder's
                `input_tensor`.
            attention_mask: mask forwarded unchanged to the encoder.

        The output of the last encoder layer is stored in
        `self.sequence_output`.
        """
        # The "bert"/"encoder" scopes keep variable names the same as BERT.
        with tf.variable_scope("bert"), tf.variable_scope("encoder"):
            activation_fn = modeling.get_activation(config.hidden_act)
            encoder_layers = modeling.transformer_model(
                input_tensor=input_embedding,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=activation_fn,
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            self.sequence_output = encoder_layers[-1]
    def feed_neural_work(self):
        """Build the encoder, pooling, output projection, loss and accuracy.

        Runs `modeling.transformer_model` over `self.embedded_chars_q`, pools
        the last layer according to `self.transformer_ret_pooling` ("mean",
        "last" or "max"), projects the pooled vector to
        `self.max_input_right` scores and defines `self.loss` and
        `self.accuracy`. When `self.dataset` contains 'adding_problem' an L2
        regression loss is used; otherwise softmax cross-entropy.

        Raises:
            ValueError: if `self.transformer_ret_pooling` is not one of the
                supported pooling modes.
        """
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers, self.context_bias = modeling.transformer_model(
            self.embedded_chars_q,
            attention_mask=self.attention_mask,
            hidden_size=self.config.hidden_size,
            num_hidden_layers=self.config.num_hidden_layers,
            num_attention_heads=self.config.num_attention_heads,
            intermediate_size=self.config.intermediate_size,
            intermediate_act_fn=modeling.get_activation(
                self.config.hidden_act),
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            initializer_range=self.config.initializer_range,
            do_return_all_layers=True,
            t5_relative_bias=self.t5_att_bias)
        self.sequence_output = self.all_encoder_layers[-1]
        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # The sequence axis (axis 1) is reduced by the configured mode.
            if self.transformer_ret_pooling == "mean":
                print('self.seq_lent:', self.seq_lent)
                print('tf.reduce_sum(self.sequence_output,axis=1):',
                      tf.reduce_sum(self.sequence_output, axis=1))

                # NOTE(review): the sum is *multiplied* by self.seq_lent, so
                # this is a mean only if seq_lent holds reciprocal lengths
                # -- confirm against where seq_lent is defined.
                self.pooled_output = tf.reduce_sum(self.sequence_output,
                                                   axis=1) * self.seq_lent
            elif self.transformer_ret_pooling == "last":
                self.pooled_output = self.sequence_output[:, -1, :]
            elif self.transformer_ret_pooling == "max":
                self.pooled_output = tf.reduce_max(self.sequence_output,
                                                   axis=1)
            else:
                # Fail loudly: the previous exit(0) reported success to the
                # calling process on what is a configuration error.
                raise ValueError(
                    'wrong transformer_ret_pooling: {!r}'.format(
                        self.transformer_ret_pooling))

            if 'adding_problem' not in self.dataset:
                # Dropout followed by layer norm on the pooled vector; this
                # is skipped for the adding-problem dataset.
                self.pooled_output = modeling.layer_norm(
                    tf.nn.dropout(self.pooled_output,
                                  keep_prob=1.0 - self.input_dropout_prob))

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[self.config.hidden_size, self.max_input_right],
                initializer=initializer())
            b = tf.Variable(tf.constant(0.1, shape=[self.max_input_right]),
                            name="b")
            # Only the projection matrix W is L2-regularised.
            l2_loss = tf.constant(0.0)
            l2_loss += tf.nn.l2_loss(W)
            self.scores = tf.nn.xw_plus_b(self.pooled_output,
                                          W,
                                          b,
                                          name="scores")
            print(self.scores)

            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        if 'adding_problem' not in self.dataset:
            # Calculate mean cross-entropy loss
            with tf.name_scope("loss"):
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.scores, labels=self.input_y)
                self.l2_loss = l2_loss * self.l2_reg_lambda
                self.loss = tf.reduce_mean(losses) + self.l2_loss
            # Accuracy
            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions,
                                               tf.argmax(self.input_y, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
        else:
            # Regression variant: L2 loss between the scores and the target.
            with tf.name_scope("loss"):
                losses = tf.nn.l2_loss(self.scores -
                                       tf.expand_dims(self.input_y, -1))
                print('losses:', losses)

                self.l2_loss = self.l2_reg_lambda * l2_loss
                # The regulariser is scaled by a further 1e-3 in this branch.
                self.loss = tf.reduce_mean(losses) + self.l2_loss * 1e-3

            with tf.name_scope("accuracy"):
                # A prediction counts as correct when the first score is
                # within 0.04 of the target.
                correct_predictions = tf.less_equal(
                    tf.abs(self.scores[:, 0] - self.input_y),
                    tf.constant([0.04]))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
Esempio n. 6
0
    def body(self, features):
        """Build the encoder/decoder graph and return logits and losses.

        When `features` has no 'targets' entry the method delegates to
        `self.greedy_decode_8steps` and returns its result, with the
        attention weights of query position 0 added to the logits dict.
        Otherwise it builds a causally masked decoder over
        [cls_vector, theorem embedding, shifted premise vectors] and
        computes the theorem and premise cross-entropy losses.

        Args:
            features: mapping read via `self.build_encoder`, plus
                'targets', 'theorem' and 'depth' on the teacher-forced path.

        Returns:
            A `(logits, losses)` pair of dicts. On the teacher-forced path
            `logits` holds theorem/premise logits and labels, and `losses`
            holds 'training', 'theorem_loss' and 'premise_loss'.
        """
        hparams = self.hparams
        if not self.is_training:
            # Turn dropout off outside training. `hparams` was bound to
            # self.hparams above, so this assignment is visible through it.
            hparams.dropout_prob = 0.0

        with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
            # attention_weights: [batch, n_head, from_len, to_len]
            sequence_output, cls_vector, attention_weights = self.build_encoder(
                features)

        if 'targets' not in features:
            # Decoding path: no ground truth available, dropout must be off.
            assert self.hparams.dropout_prob == 0.0
            logits, losses = self.greedy_decode_8steps(cls_vector,
                                                       sequence_output)
            # Expose the attention paid by query position 0 to every
            # position.
            logits.update(attention_weights=attention_weights[:, :, 0, :])
            return logits, losses

        with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
            with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE):
                premise = features[
                    'targets']  # [batch, premise_len=8]; the key name 'targets' is misleading
                # [batch, premise_len, hid_size]
                premise_vecs = premise_gather_nd(sequence_output, premise)

                batch_size = tf.shape(premise)[0]
                premise_len = premise.shape.as_list()[-1]
                theorem = features['theorem']  # batch, 1

                # [batch, 1, hid_size] and [num_theorems, hid_size]
                theorem_vec, theorem_emb_table = modeling.embedding_lookup(
                    input_ids=theorem,  # [batch, 1]
                    vocab_size=hparams.num_theorems,
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='theorem_embedding',
                )
                # NOTE(review): `depth` is read but never used below in this
                # method; the lookup still requires the key to be present.
                depth = features['depth']  # batch, 1

                # Teacher forcing: the decoder input is the target sequence
                # shifted right, i.e. the last premise vector is dropped.
                decoder_input = tf.concat(
                    [
                        cls_vector,  # [batch, 1, hid_size]
                        theorem_vec,  # [batch, 1, hid_size]
                        premise_vecs[:, :
                                     -1, :]  # [batch, premise_len-1, hid_size]
                    ],
                    axis=1)  # [batch, premise_len + 1, hid_size]
                decode_length = decoder_input.shape.as_list()[1]
                assert decode_length == premise_len + 1

                # [decode_length, hid_size]
                # NOTE(review): ids run up to decode_length - 1 = premise_len,
                # so presumably max_premise must be > premise_len -- confirm.
                pos_embedding, _ = modeling.embedding_lookup(
                    input_ids=tf.range(decode_length),  # [decode_length]
                    vocab_size=hparams.max_premise,  # >= premise_len
                    embedding_size=hparams.hidden_size,
                    initializer_range=hparams.initializer_range,
                    word_embedding_name='positional_embedding',
                )
                pos_embedding = tf.reshape(
                    pos_embedding, [1, decode_length, hparams.hidden_size])

                decoder_input = modeling.layer_norm_and_dropout(
                    decoder_input +  # [batch, decode_length, hid_size]
                    pos_embedding,  # [1,     decode_length, hid_size]
                    hparams.dropout_prob)  # [batch, decode_length, hid_size]

            with tf.variable_scope('transformer', reuse=tf.AUTO_REUSE):
                # Lower-triangular mask so each step only sees earlier steps.
                causal_attention_mask = t2t_model.common_layers.ones_matrix_band_part(
                    rows=decode_length,
                    cols=decode_length,
                    num_lower=-1,  # attend to everything before
                    num_upper=0,  # attend to nothing after
                    out_shape=[1, decode_length, decode_length
                               ])  # 1, decode_length, decode_length

                # [batch, decode_length, decode_length]
                causal_attention_mask = tf.tile(causal_attention_mask,
                                                [batch_size, 1, 1])

                all_decoder_layers = modeling.transformer_model(
                    input_tensor=decoder_input,
                    attention_mask=causal_attention_mask,
                    hidden_size=hparams.hidden_size,
                    num_hidden_layers=hparams.num_decode_layers,
                    num_attention_heads=hparams.num_attention_heads,
                    intermediate_size=hparams.intermediate_size,
                    intermediate_act_fn=modeling.get_activation(
                        hparams.hidden_act),
                    hidden_dropout_prob=hparams.dropout_prob,
                    attention_probs_dropout_prob=hparams.dropout_prob,
                    initializer_range=hparams.initializer_range,
                    do_return_all_layers=True,
                    attention_top_k=hparams.attention_top_k)

                # Each layer entry is an (output, attention) pair; the
                # decoder's attention weights are discarded.
                decoder_output, _ = all_decoder_layers[
                    -1]  # [batch, dec_len, hid_size]
                # Position 0 predicts the theorem; the remaining positions
                # predict the premises.
                theorem_feature = decoder_output[:, 0, :]  # [batch, hid_size]
                premise_feature = decoder_output[:,
                                                 1:, :]  # [batch, tar_len, hid_size]

        with tf.variable_scope('prediction', reuse=tf.AUTO_REUSE):
            theorem_logits = tf.keras.layers.Dense(  # [batch, num_theorems]
                name='theorem',
                units=hparams.num_theorems,
                use_bias=True,
                kernel_initializer=modeling.create_initializer(
                    hparams.initializer_range))(theorem_feature)

            # Premises are scored by dot product against every encoder
            # position.
            premise_logits = tf.matmul(
                a=premise_feature,  # [batch, premise_len, hid_size]
                b=sequence_output,  # [batch, sequence_len, hid_size]
                transpose_b=True,
            )  # [batch, premise_len, sequence_len]

            # [batch * premise_len, sequence_len]
            seq_len = premise_logits.shape.as_list()[-1]
            premise_logits = tf.reshape(premise_logits, [-1, seq_len])

            # Premise ids that are not > 0 get zero weight in the loss.
            premise_weights = tf.cast(premise > 0,
                                      tf.float32)  # [batch, prem_len]
            premise_weights = tf.reshape(premise_weights,
                                         [-1])  # [batch * prem_len]
            premise = tf.reshape(premise, [-1, 1])  # [batch * prem_len, 1]

            theorem_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=theorem,  # [batch, 1]
                logits=theorem_logits  # [batch, num_theorems]
            )
            premise_loss = tf.losses.sparse_softmax_cross_entropy(
                labels=premise,  # [batch * premise_len, 1]
                logits=premise_logits,  # [batch * premise_len, sequence_len]
                weights=premise_weights  # [batch * premise_len]
            )

            logits = dict(theorem_logits=theorem_logits,
                          theorem_labels=theorem,
                          premise_logits=premise_logits,
                          premise_labels=premise)

            # 'training' is the unweighted sum of the two losses.
            losses = dict(training=theorem_loss + premise_loss,
                          theorem_loss=theorem_loss,
                          premise_loss=premise_loss)

        return logits, losses
Esempio n. 7
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
  """Creates a BERT model topped by a small transformer and a linear layer.

  Returns:
    A `(ids, transformer_outputs)` tuple: the maximum of the linear-layer
    logits over axis 0, and the output of the extra transformer. Both are
    returned because the loss computation uses both.
  """
  bert = modeling.BertModel(
      config=bert_config,  # BERT is presumably built from this configuration
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  # Final layer of BERT.
  last_layer = bert.get_sequence_output()

  batch_size, seq_length, hidden_size = modeling.get_shape_list(
      last_layer, expected_rank=3)

  # Collapse the batch and sequence axes into one.
  flat_hidden = tf.reshape(last_layer, [batch_size * seq_length, hidden_size])

  # The original SQuAD start/end linear head that used to sit here has been
  # replaced by the transformer + linear layer below.

  # Transformer layer, deliberately much smaller than the one inside BERT.
  # NOTE(review): the input here is the flattened rank-2 tensor and
  # hidden_size is fixed to 5 -- confirm the project's transformer_model
  # accepts that.
  transformer_outputs = modeling.transformer_model(
      input_tensor=flat_hidden,
      attention_mask=None,
      hidden_size=5,
      num_hidden_layers=2,
      num_attention_heads=2,
      intermediate_size=20,
      intermediate_act_fn=modeling.gelu,
      hidden_dropout_prob=0.1,
      attention_probs_dropout_prob=0.1,
      initializer_range=0.02,
      do_return_all_layers=False)  # only False is supported for now

  # Linear layer. '/' in a variable name is a scope separator, so
  # "cls/squad/output_weights" is `output_weights` in scope `cls/squad`.
  projection_weights = tf.get_variable(
      "cls/squad/output_weights", [30000, 5],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  projection_bias = tf.get_variable(
      "cls/squad/output_bias", [30000], initializer=tf.zeros_initializer())
  logits = tf.nn.bias_add(
      tf.matmul(transformer_outputs, projection_weights, transpose_b=True),
      projection_bias)

  # Maximum over axis 0.
  # NOTE(review): this is the max logit value, not an argmax index, despite
  # the name `ids` -- confirm intent.
  ids = tf.reduce_max(logits, axis=0)

  return (ids, transformer_outputs)
    def feed_neural_work(self):
        """Build the encoder, pooling, output projection, loss and accuracy.

        Runs `modeling.transformer_model` over `self.embedded_chars_q`, pools
        the last layer according to `self.transformer_ret_pooling` ("mean",
        "last" or "max"), projects the pooled vector to
        `self.max_input_right` scores and defines `self.loss` and
        `self.accuracy`. When `self.dataset` contains 'adding_problem' an L2
        regression loss is used; otherwise softmax cross-entropy.
        `self.l2_loss` is computed but not added to `self.loss` here.

        Raises:
            ValueError: if `self.transformer_ret_pooling` is not one of the
                supported pooling modes.
        """
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers, self.context_bias = modeling.transformer_model(
            self.embedded_chars_q,
            attention_mask=self.attention_mask,
            hidden_size=self.config.hidden_size,
            num_hidden_layers=self.config.num_hidden_layers,
            num_attention_heads=self.config.num_attention_heads,
            intermediate_size=self.config.intermediate_size,
            intermediate_act_fn=modeling.get_activation(
                self.config.hidden_act),
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            initializer_range=self.config.initializer_range,
            do_return_all_layers=True,
            t5_relative_bias=self.t5_att_bias)
        self.sequence_output = self.all_encoder_layers[-1]

        with tf.variable_scope("pooler"):
            # The sequence axis (axis 1) is reduced by the configured mode.
            if self.transformer_ret_pooling == "mean":
                print('self.seq_lent:', self.seq_lent)
                print('tf.reduce_sum(self.sequence_output,axis=1):',
                      tf.reduce_sum(self.sequence_output, axis=1))

                # NOTE(review): the sum is *multiplied* by self.seq_lent, so
                # this is a mean only if seq_lent holds reciprocal lengths
                # -- confirm against where seq_lent is defined.
                self.pooled_output = tf.reduce_sum(self.sequence_output,
                                                   axis=1) * self.seq_lent
            elif self.transformer_ret_pooling == "last":
                self.pooled_output = self.sequence_output[:, -1, :]
            elif self.transformer_ret_pooling == "max":
                self.pooled_output = tf.reduce_max(self.sequence_output,
                                                   axis=1)
            else:
                # Fail loudly: the previous exit(0) reported success to the
                # calling process on what is a configuration error.
                raise ValueError(
                    'wrong transformer_ret_pooling: {!r}'.format(
                        self.transformer_ret_pooling))

            # Dropout followed by layer norm on the pooled vector; this is
            # skipped for the adding-problem dataset.
            if 'adding_problem' not in self.dataset:
                self.pooled_output = modeling.layer_norm(
                    tf.nn.dropout(self.pooled_output,
                                  keep_prob=1.0 - self.input_dropout_prob))

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[self.config.hidden_size, self.max_input_right],
                initializer=initializer(),
            )
            b = tf.Variable(tf.constant(0.1, shape=[self.max_input_right]),
                            name="b")
            # Only the projection matrix W contributes to the L2 term.
            l2_loss = tf.constant(0.0)
            l2_loss += tf.nn.l2_loss(W)

            self.scores = tf.nn.xw_plus_b(self.pooled_output,
                                          W,
                                          b,
                                          name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        if 'adding_problem' not in self.dataset:
            # Calculate mean cross-entropy loss
            with tf.name_scope("loss"):
                self.l2_loss = self.l2_reg_lambda * l2_loss
                losses = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.scores, labels=self.input_y)

                # The L2 term is intentionally left out of the loss here.
                self.loss = tf.reduce_mean(losses)
            # Accuracy
            with tf.name_scope("accuracy"):
                correct_predictions = tf.equal(self.predictions,
                                               tf.argmax(self.input_y, 1))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
        else:
            # Regression variant: L2 loss between the scores and the target.
            with tf.name_scope("loss"):
                self.l2_loss = self.l2_reg_lambda * l2_loss
                losses = tf.nn.l2_loss(self.scores -
                                       tf.expand_dims(self.input_y, -1))
                print('losses:', losses)
                # The L2 term is intentionally left out of the loss here.
                self.loss = tf.reduce_mean(losses)

            with tf.name_scope("accuracy"):
                # A prediction counts as correct when the first score is
                # within 0.04 of the target.
                correct_predictions = tf.less_equal(
                    tf.abs(self.scores[:, 0] - self.input_y),
                    tf.constant([0.04]))
                self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                       "float"),
                                               name="accuracy")
Esempio n. 9
0
def main(args):
    """Compare the standard BERT transformer with the effective transformer.

    Builds random inputs, runs both implementations with identical weights,
    prints the max/average absolute difference over the valid (unmasked)
    tokens and then the average latency per run of each implementation.

    Args:
        args: parsed options providing `config`, `batch_size`,
            `avg_seq_length`, `max_seq_length` and `precision`.
    """
    bert_config = modeling.BertConfig.from_json_file(args.config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    avg_seq_len = args.avg_seq_length
    max_seq_len = args.max_seq_length
    hidden_size = bert_config.hidden_size
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    # Fake input lengths, drawn so that their mean is about avg_seq_len.
    # The lower bound is clamped to 1: a zero or negative length would give
    # an empty or wrongly sliced mask row.
    min_seq_len = max(1, 2 * avg_seq_len - max_seq_len)
    input_len = np.random.randint(low=min_seq_len,
                                  high=max_seq_len + 1,
                                  size=batch_size,
                                  dtype=np.int32)
    valid_word_num = int(input_len.sum())

    # fake input id and mask
    input_ids = np.random.randint(low=0,
                                  high=bert_config.vocab_size,
                                  size=(batch_size, max_seq_len),
                                  dtype=np.int32)
    # Row b has ones in its first input_len[b] positions.
    positions = np.arange(max_seq_len)[np.newaxis, :]
    input_mask = (positions < input_len[:, np.newaxis]).astype(np.int32)

    input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

    # fake embedding output
    embed_output = np.random.randn(batch_size, max_seq_len, hidden_size)
    input_tensor = tf.convert_to_tensor(embed_output, dtype=tf_dtype)

    # Keep attention_mask for compatibility: each example's mask row is
    # repeated max_seq_len times to form [batch, seq, seq].
    att_mask = np.tile(input_mask, max_seq_len)
    att_mask = att_mask.reshape(batch_size, max_seq_len, max_seq_len)
    attention_mask = tf.convert_to_tensor(att_mask, dtype=tf_dtype)

    # input info
    print("Valid word num : {}/{}, avg sequence length : {:.6} ".format(
        valid_word_num, batch_size * max_seq_len, valid_word_num / batch_size))

    # bert with standard transformer
    std_bert = modeling.transformer_model(
        input_tensor=input_tensor,
        attention_mask=attention_mask,
        hidden_size=hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=False)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # init weights
        sess.run(tf.global_variables_initializer())

        # get transformer weights
        all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        transformer_vars = [v for v in all_vars if v.name.startswith('layer')]
        weights_value = sess.run(transformer_vars)

        # bert with effective transformer
        et_bert = effective_transformer.get_sequence_output(
            max_batch_size=batch_size,
            max_seq_length=max_seq_len,
            config=bert_config,
            attention_mask=attention_mask,
            input_mask=input_mask_tensor,
            from_tensor=input_tensor,
            weights_value=weights_value,
        )

        # Diff over valid tokens only. Rows are reshaped with the configured
        # hidden size; the previous hard-coded 768 broke for any other
        # model width.
        val1 = sess.run(std_bert).reshape(-1, hidden_size)
        val2 = sess.run(et_bert).reshape(-1, hidden_size)
        diff = []
        for b_idx, s_len in enumerate(input_len):
            row_start = b_idx * max_seq_len
            for idx in range(row_start, row_start + s_len):
                diff.append(np.fabs(val1[idx] - val2[idx]).max())
        print("max diff : {:.6}, avg diff : {:.6}.".format(
            max(diff),
            sum(diff) / len(diff)))

        def time_inference(output_tensor):
            """Return the mean time of one `sess.run`, in milliseconds."""
            iter_num = 128
            # warm up
            for _ in range(10):
                sess.run(output_tensor)

            beg = datetime.now()
            for _ in range(iter_num):
                sess.run(output_tensor)
            end = datetime.now()
            return (end - beg).total_seconds() * 1000 / iter_num  # ms

        print("xla cost : {:.6} ms".format(time_inference(std_bert)))
        print("et  cost : {:.6} ms".format(time_inference(et_bert)))
Esempio n. 10
0
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     mix_number=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     groups=None,
                     expansion=None,
                     drop_rate=None,
                     gating_reduction=None,
                     **unused_params):
        """Build a transformer encoder followed by a mixture of NeXtVLAD branches.

        The input is first passed through `modeling.transformer_model`
        (configured from `FLAGS.bert_config_file` plus flag overrides), then
        frames are sampled, and `mix_number` branches of
        `self.nextvlad_model` are evaluated. Branch logits are combined with
        softmax mixture weights computed from the sampled features.

        Args:
            model_input: frame-level features. The last axis is split at
                index 1024 into a video part and an audio part
                (assumes layout [batch, frames, features] -- TODO confirm).
            vocab_size: forwarded to `self.nextvlad_model`.
            num_frames: per-example frame counts; expanded on axis 1 and cast
                to float32 before being handed to the frame sampler.
            mix_number: number of branches; falls back to `FLAGS.mix_number`.
            cluster_size: falls back to `FLAGS.nextvlad_cluster_size`.
            hidden_size: falls back to `FLAGS.nextvlad_hidden_size`.
            is_training: Python bool. When False, transformer dropout is
                disabled and only predictions are returned.
            groups: falls back to `FLAGS.groups`.
            expansion: falls back to `FLAGS.expansion`.
            drop_rate: falls back to `FLAGS.drop_rate` only when None, so an
                explicit 0.0 is honoured.
            gating_reduction: falls back to `FLAGS.gating_reduction`.
            **unused_params: forwarded unchanged to `self.nextvlad_model`.

        Returns:
            A dict with key "predictions" (sigmoid of the mixed logits). When
            `is_training` is true it also contains "regularization_loss"
            (KL term between the mixed and per-branch temperature-softened
            distributions, scaled by `FLAGS.cl_lambda`) and
            "aux_predictions" (list of each branch's "predictions").
        """
        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)

        # Load the transformer config and work on a private copy before
        # overriding individual fields from flags.
        config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
        config = copy.deepcopy(config)

        config.num_hidden_layers = FLAGS.bert_hidden_layer
        config.num_attention_heads = FLAGS.bert_attention_heads
        config.hidden_dropout_prob = FLAGS.bert_dropout_prob
        config.attention_probs_dropout_prob = FLAGS.bert_dropout_prob

        # No dropout at evaluation / inference time.
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        with tf.variable_scope("encoder"):
            # attention_mask=None: no mask is supplied to the encoder here.
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=model_input,
                attention_mask=None,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        # Only the output of the last encoder layer feeds the rest of the model.
        model_input = self.all_encoder_layers[-1]

        if FLAGS.sample_random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   FLAGS.iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     FLAGS.iterations)

        # Size/count hyper-parameters: 0 is not a usable value, so any falsy
        # argument falls back to the corresponding flag.
        cluster_size = cluster_size or FLAGS.nextvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.nextvlad_hidden_size
        gating_reduction = gating_reduction or FLAGS.gating_reduction
        groups = groups or FLAGS.groups
        # 0.0 is a legitimate rate, so only an omitted (None) argument may be
        # replaced by the flag; `drop_rate or FLAGS.drop_rate` would silently
        # discard an explicit 0.0.
        drop_rate = FLAGS.drop_rate if drop_rate is None else drop_rate
        mix_number = mix_number or FLAGS.mix_number
        expansion = expansion or FLAGS.expansion

        max_frames = model_input.get_shape().as_list()[1]
        mask = tf.sequence_mask(FLAGS.iterations, max_frames, dtype=tf.float32)

        # Mixture weights: mean over the last axis -> batch norm -> linear
        # layer to `mix_number` units -> softmax.
        ftr_mean = tf.reduce_mean(model_input, axis=-1)
        ftr_mean = slim.batch_norm(ftr_mean,
                                   center=True,
                                   scale=True,
                                   fused=True,
                                   is_training=is_training,
                                   scope="mix_weights_bn")
        mix_weights = slim.fully_connected(
            ftr_mean,
            mix_number,
            activation_fn=None,
            weights_initializer=slim.variance_scaling_initializer(),
            scope="mix_weights")
        mix_weights = tf.nn.softmax(mix_weights, axis=-1)
        tf.summary.histogram("mix_weights", mix_weights)

        # One NeXtVLAD branch per mixture component, each in its own
        # variable scope so the branches do not share weights.
        results = []
        for n in range(mix_number):
            with tf.variable_scope("branch_%d" % n):
                res = self.nextvlad_model(video_ftr=model_input[:, :, 0:1024],
                                          audio_ftr=model_input[:, :, 1024:],
                                          vocab_size=vocab_size,
                                          max_frames=max_frames,
                                          cluster_size=cluster_size,
                                          groups=groups,
                                          expansion=expansion,
                                          drop_rate=drop_rate,
                                          hidden1_size=hidden1_size,
                                          is_training=is_training,
                                          gating_reduction=gating_reduction,
                                          mask=mask,
                                          **unused_params)
                results.append(res)

        aux_preds = [res["predictions"] for res in results]
        logits = [res["logits"] for res in results]
        logits = tf.stack(logits, axis=1)  # branches stacked on axis 1

        # Weighted sum of branch logits over the branch axis.
        mix_logit = tf.reduce_sum(tf.multiply(tf.expand_dims(mix_weights, -1),
                                              logits),
                                  axis=1)

        pred = tf.nn.sigmoid(mix_logit)

        if is_training:
            # Temperature-softened distribution of the mixture (with a
            # singleton branch axis) and of every individual branch.
            rank_pred = tf.expand_dims(tf.nn.softmax(tf.div(
                mix_logit, FLAGS.cl_temperature),
                                                     axis=-1),
                                       axis=1)
            aux_rank_preds = tf.nn.softmax(tf.div(logits,
                                                  FLAGS.cl_temperature),
                                           axis=-1)
            epsilon = 1e-8  # keeps tf.log away from log(0)
            # KL(mixture || branch), summed over the last axis.
            kl_loss = tf.reduce_sum(rank_pred *
                                    (tf.log(rank_pred + epsilon) -
                                     tf.log(aux_rank_preds + epsilon)),
                                    axis=-1)

            # Sum over branches, mean over the remaining (leading) axis.
            regularization_loss = FLAGS.cl_lambda * tf.reduce_mean(
                tf.reduce_sum(kl_loss, axis=-1), axis=-1)

            return {
                "predictions": pred,
                "regularization_loss": regularization_loss,
                "aux_predictions": aux_preds
            }
        else:
            return {"predictions": pred}
Esempio n. 11
0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
  """Creates a start/end span-logit model on top of a BERT encoder.

  The BERT sequence output is projected to two logits per token by one of
  several experimental heads, selected by the local `use_*` switches below
  (the CNN head is the one currently enabled).

  Args:
    bert_config: configuration object passed to `modeling.BertModel`.
    is_training: passed to `modeling.BertModel`.
    input_ids: passed to `modeling.BertModel`.
    input_mask: passed to `modeling.BertModel`.
    segment_ids: passed to `modeling.BertModel` as `token_type_ids`.
    use_one_hot_embeddings: passed to `modeling.BertModel`.

  Returns:
    A tuple `(start_logits, end_logits)`, each of shape
    [batch_size, seq_length].

  Raises:
    ValueError: if no output head is enabled, or if the CNN head is enabled
      and `hidden_size` is not `nb_channels * width**2` for an integer width.
  """
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  final_hidden = model.get_sequence_output()

  final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  batch_size = final_hidden_shape[0]
  seq_length = final_hidden_shape[1]
  hidden_size = final_hidden_shape[2]

  # Output-head switches. The CNN head and the plain projection both create
  # "cls/squad/output_weights" (with different shapes), so the CNN head takes
  # precedence over `use_dense` when both are set.
  use_cnn = True
  use_attention = False
  use_transformer = False
  use_dense = False

  # Without any head `logits` would never be assigned and the bias_add below
  # would fail with a NameError; fail early with a clear message instead.
  if not (use_cnn or use_attention or use_transformer or use_dense):
    raise ValueError("At least one output head must be enabled.")

  if use_cnn:
    # Treat every token vector as a square `nb_channels`-channel image.
    nb_channels = 3
    width = int(math.sqrt(hidden_size / nb_channels))
    height = width
    if nb_channels * width * height != hidden_size:
      raise ValueError(
          "CNN head requires hidden_size == %d * width^2, got hidden_size=%s"
          % (nb_channels, hidden_size))

    output_weights = tf.get_variable(
        "cls/squad/output_weights", [2, width * height],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    # 3x3 kernel, `nb_channels` input channels, a single output channel.
    filters = tf.get_variable(
        "cls/squad/filters", [3, 3, nb_channels, 1],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    final_hidden_matrix = tf.reshape(
        final_hidden, [batch_size * seq_length, width, height, nb_channels])

    # Stride 1 with SAME padding keeps the width x height spatial size.
    final_hidden_matrix = tf.nn.conv2d(
        input=final_hidden_matrix,
        filter=filters,
        strides=[1, 1, 1, 1],
        padding="SAME")

    final_hidden_matrix = tf.reshape(
        final_hidden_matrix, [batch_size, seq_length, width * height])

    # NOTE(review): the attention output is discarded, so this call only
    # creates variables and does not influence the logits. Kept as-is so the
    # graph and its variables stay unchanged -- confirm whether the result
    # was meant to replace `final_hidden_matrix`.
    with tf.variable_scope("attention_after_conv2D"):
      modeling.attention_layer(final_hidden_matrix, final_hidden_matrix)

    final_hidden_matrix = tf.reshape(
        final_hidden_matrix, [batch_size * seq_length, width * height])

    logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)

  if use_attention:
    # NOTE(review): as above, the three attention outputs are discarded and
    # `final_hidden` is left untouched -- confirm intent.
    with tf.variable_scope("layer_custom_%d" % 1):
      modeling.attention_layer(final_hidden, final_hidden)
    with tf.variable_scope("layer_custom_%d" % 2):
      modeling.attention_layer(final_hidden, final_hidden)
    with tf.variable_scope("layer_custom_%d" % 3):
      modeling.attention_layer(final_hidden, final_hidden)

  if use_transformer:
    with tf.variable_scope("custom_transformer"):
      final_hidden = modeling.transformer_model(
          final_hidden, num_hidden_layers=3, num_attention_heads=3)

  # Plain linear projection of the (possibly transformed) hidden states.
  # `use_dense` selects it directly; previously that switch was never read.
  if use_attention or use_transformer or (use_dense and not use_cnn):
    output_weights = tf.get_variable(
        "cls/squad/output_weights", [2, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    final_hidden_matrix = tf.reshape(final_hidden,
                                     [batch_size * seq_length, hidden_size])
    logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)

  output_bias = tf.get_variable(
      "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

  logits = tf.nn.bias_add(logits, output_bias)

  # [batch * seq, 2] -> [batch, seq, 2] -> [2, batch, seq], then split into
  # the start and end logits.
  logits = tf.reshape(logits, [batch_size, seq_length, 2])
  logits = tf.transpose(logits, [2, 0, 1])

  unstacked_logits = tf.unstack(logits, axis=0)

  (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

  return (start_logits, end_logits)