Example 1
    def SelfAttentionModel(self):

        # first bidirectional GRU block over the input signal
        with tf.variable_scope('GOU1'):
            cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw,
                                                        cellbw,
                                                        self.signal_input,
                                                        dtype=tf.float32)
        rnn_output = tf.concat(output, 2)  # concat forward/backward states

        # self-attention encoder stacked on top of the recurrent features
        sample_encoder = encode_model.Encoder(num_layers=FLAGS.num_layer,
                                              d_model=1,
                                              num_heads=FLAGS.num_head,
                                              dff=FLAGS.num_dff)
        sample_encoder_output = sample_encoder(rnn_output,
                                               training=False,
                                               mask=None)

        concat_output = tf.concat([rnn_output, sample_encoder_output], axis=2)
        concat_output = self.layernorm1(concat_output)

        with tf.variable_scope('GOU2'):
            cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw,
                                                        cellbw,
                                                        concat_output,
                                                        dtype=tf.float32)
        rnn_output2 = tf.concat(output, 2)

        sample_encoder2 = encode_model.Encoder(num_layers=FLAGS.num_layer,
                                               d_model=1,
                                               num_heads=FLAGS.num_head,
                                               dff=FLAGS.num_dff)
        sample_encoder_output2 = sample_encoder2(rnn_output2,
                                                 training=False,
                                                 mask=None)

        concat_output2 = tf.concat([rnn_output2, sample_encoder_output2],
                                   axis=2)
        concat_output2 = self.layernorm1(concat_output2)

        with tf.variable_scope('GOU3'):
            cellfw = tf.nn.rnn_cell.GRUCell(8)
            cellbw = tf.nn.rnn_cell.GRUCell(8)
            output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw,
                                                        cellbw,
                                                        concat_output2,
                                                        dtype=tf.float32)
        rnn_output3 = (output[0] + output[1]) / 2  # average the two directions

        # L2-normalized magnitude spectrum of the time-domain output
        fft_signal = tf.math.l2_normalize(tf.abs(tf.signal.rfft(rnn_output3)),
                                          axis=1)
        return rnn_output3, fft_signal
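A minimal usage sketch for this TF1-style graph, assuming a wrapper class Model that owns signal_input (a [batch, time, channels] float placeholder) and layernorm1; the class name, placeholder, and shapes below are assumptions, not part of the example:

import numpy as np
import tensorflow as tf

model = Model()  # hypothetical wrapper exposing signal_input and SelfAttentionModel
time_out, fft_out = model.SelfAttentionModel()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = np.random.randn(4, 128, 1).astype(np.float32)  # assumed shape
    t, f = sess.run([time_out, fft_out], feed_dict={model.signal_input: batch})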
Example 2
    def SelfAttentionModel(self):
        new_input = tf.expand_dims(self.signal_input, axis=2)  # add a channel axis

        with tf.variable_scope('GOU1'):
            cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, new_input, dtype=tf.float32)
        rnn_output = tf.concat(output, 2)
 
        sample_encoder = encode_model.Encoder(num_layers=FLAGS.num_layer, d_model=1,
                                              num_heads=FLAGS.num_head,
                                              dff=FLAGS.num_dff)
        sample_encoder_output = sample_encoder(rnn_output, training=False, mask=None)

        concat_output = tf.concat([rnn_output, sample_encoder_output], axis=2)
        concat_output = self.layernorm1(concat_output)

        with tf.variable_scope('GOU2'):
            cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, concat_output, dtype=tf.float32)
        rnn_output2 = tf.concat(output, 2)

        sample_encoder2 = encode_model.Encoder(num_layers=FLAGS.num_layer, d_model=1,
                                               num_heads=FLAGS.num_head,
                                               dff=FLAGS.num_dff)
        sample_encoder_output2 = sample_encoder2(rnn_output2, training=False, mask=None)

        concat_output2 = tf.concat([rnn_output2, sample_encoder_output2], axis=2)
        concat_output2 = self.layernorm1(concat_output2)

        with tf.variable_scope('GOU3'):
            cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
            output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, concat_output2, dtype=tf.float32)
        rnn_output3 = tf.concat(output, 2)

        # average-pool across the feature axis to get one value per time step
        rnn_output3 = tf.expand_dims(rnn_output3, axis=3)
        average_layer3 = tf.layers.average_pooling2d(rnn_output3, [1, rnn_output3.shape[2]], strides=[1, 1])
        squeeze_layer3 = tf.squeeze(average_layer3)
        fft_signal = tf.math.l2_normalize(tf.abs(tf.signal.rfft(squeeze_layer3)), axis=1)
        return squeeze_layer3, fft_signal
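The final pooling collapses the feature axis to one value per time step; the same shape arithmetic in isolation (the concrete sizes here are assumptions, since the snippet does not pin them down):

import tensorflow as tf

x = tf.zeros([4, 128, 16, 1])  # [batch, time, features, 1]; sizes assumed
pooled = tf.layers.average_pooling2d(x, [1, 16], strides=[1, 1])
print(pooled.shape)              # (4, 128, 1, 1)
print(tf.squeeze(pooled).shape)  # (4, 128): one value per time step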
Example 3
def greedy_transformer_decoder(features, labels, labels_length, params, mode):
    kernel_initializer = tf.contrib.layers.xavier_initializer(uniform=False)

    with tf.name_scope('encoder'):

        encoder = transformer.Encoder(
            params['encoder'],
            kernel_initializer=kernel_initializer,
            drop_out=lambda x: tf.layers.dropout(
                x,
                rate=params['drop_out'],
                training=mode == tf.estimator.ModeKeys.TRAIN
            ),
            num_layers=6,
            name='transformer_encoder'
        )

        encoder_outputs = encoder.apply(features)

    with tf.name_scope('decoder'):

        decoder = transformer.Decoder(
            params['decoder'],
            kernel_initializer=kernel_initializer,
            drop_out=lambda x: tf.layers.dropout(
                x,
                rate=params['drop_out'],
                training=mode == tf.estimator.ModeKeys.TRAIN
            ),
            num_layers=6,
            name='transformer_decoder'
        )

        # linear projection to output logits (one extra unit, presumably for a blank/end token)
        soft_layer = tf.layers.Dense(
            units=labels.get_shape()[-1] + 1,
            kernel_initializer=kernel_initializer,
            name='softmax_output_layer'
        )

        decoded_tuple = utils.transformer_decoding(
            decoder=decoder,
            encoder_outputs=encoder_outputs,
            labels=labels,
            labels_length=labels_length,
            soft_layer=soft_layer,
            mode=mode
        )

    return decoded_tuple
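Both blocks hand dropout to the model as a closure, so the transformer layers never need to see the Estimator mode; the same pattern in isolation (make_dropout is a hypothetical helper, not part of the project):

import tensorflow as tf

def make_dropout(rate, mode):
    # returns a callable the model applies wherever it wants dropout;
    # outside TRAIN mode tf.layers.dropout becomes the identity
    return lambda x: tf.layers.dropout(
        x, rate=rate, training=mode == tf.estimator.ModeKeys.TRAIN)

drop_out = make_dropout(0.1, tf.estimator.ModeKeys.TRAIN)
h = drop_out(tf.ones([2, 8]))  # randomly zeroes activations during training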
Example 4
    def build_model(self, word_inputs, char_inputs, labels, seq_len, char_len,
                    num_train_steps, char_mode):
        print("Building model!")
        if (char_mode == "no_char"):
            self.model_dim /= 2

        # Implements linear decay of the learning rate.
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.polynomial_decay(self.learning_rate,
                                                  global_step,
                                                  num_train_steps,
                                                  end_learning_rate=0.0,
                                                  power=1.0,
                                                  cycle=False)

        encoder = transformer.Encoder(num_layers=self.num_layers,
                                      num_heads=self.num_heads,
                                      linear_key_dim=self.linear_key_dim,
                                      linear_value_dim=self.linear_value_dim,
                                      model_dim=self.model_dim,
                                      ffn_dim=self.ffn_dim,
                                      dropout=self.dropout,
                                      n_class=self.n_class,
                                      batch_size=self.batch_size)
        encoder_emb = self.build_embed(word_inputs, char_inputs, char_len,
                                       char_mode)
        encoder_outputs = encoder.build(encoder_emb, seq_len)

        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=encoder_outputs, labels=labels))  # Softmax loss
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(
                loss, global_step=global_step)  # Adam Optimizer

        return loss, optimizer, encoder_outputs
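A hedged usage sketch for the three values returned above (the placeholder tensors and batch_feed are assumptions about the surrounding class, not part of the example); note that the returned "optimizer" is already the minimize op:

loss_op, train_op, logits_op = model.build_model(word_inputs, char_inputs, labels,
                                                 seq_len, char_len,
                                                 num_train_steps=10000,
                                                 char_mode="char")  # assumed mode name
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    loss_val, _ = sess.run([loss_op, train_op], feed_dict=batch_feed)  # batch_feed assumed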
Example 5
            # tail of TiedEmbeddingSoftmax.call (embed=False branch): project the
            # hidden states back through the tied embedding matrix to get logits
            return tf.tensordot(inputs, tf.transpose(self.w), 1) + self.b


# input for the keras model
tokens = tf.keras.layers.Input(shape=(seq_length, ), dtype='int32')

# instantiates a tied softmax class
tied_embedding_softmax = TiedEmbeddingSoftmax()

# embedded tokens, before passing it to the transformer
embedded = tied_embedding_softmax(tokens, embed=True)

# the activations after passing it through the transformer
# for some odd reason, TPUs don't play well with specifying the arguments of the Encoder() function
# so you have to leave them at their defaults
transformed = transformer.Encoder()(embedded, training=False)

# pass the activations through our tied-softmax layer
# this time with embed=False denoting that we are doing the softmax operation
# and not a lookup
logits = tied_embedding_softmax(transformed, embed=False)

# finally, define the Keras model with inputs as tokens and outputs as the logits we just computed
model = tf.keras.Model(inputs=tokens, outputs=logits)


# the loss function is a simple sparse categorical crossentropy between the logits and the labels
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels,
                                                           logits,
                                                           from_logits=True)
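A minimal sketch of training the model defined above with this loss; the optimizer choice and the dataset pipeline are assumptions, not part of the snippet:

model.compile(optimizer=tf.keras.optimizers.Adam(), loss=loss)
# `dataset` is a hypothetical tf.data.Dataset yielding (token_ids, next_token_ids)
model.fit(dataset, epochs=1)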
Example 6
def main():
    
    # fetch the training data
    data, vocab = prepare_data()
    
    # create the word embeddings and positional embeddings (we learn both of them)
    word_emb = nn.Embedding(len(vocab), EMBEDDING_SIZE)
    pos_emb = nn.Embedding(len(data[0]), EMBEDDING_SIZE)
    
    # turn the dataset into a tensor of word indices
    data = torch.LongTensor([[vocab[word] for word in sample] for sample in data])
    
    # create the encoder, the pretraining loss, and the optimizer
    encoder = transformer.Encoder(
            NUM_LAYERS,  # num_layers
            NUM_HEADS,  # num_heads
            *DIMENSIONS,  # dim_model / dim_keys / dim_values
            DROPOUT_RATE,  # residual_dropout
            DROPOUT_RATE,  # attention_dropout
            PAD.index  # pad_index
    )
    loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
            itertools.chain(encoder.parameters(), word_emb.parameters(), pos_emb.parameters()),
            lr=LEARNING_RATE
    )
    
    # move to GPU, if possible
    if GPU:
        data = data.cuda()
        encoder.cuda()
        word_emb.cuda()
        pos_emb.cuda()

    # create a mask that ensures that no future steps can be used
    mask = util.create_shifted_output_mask(data)[:, :-1, :-1]  # -> cut off final time step, which is never an input

    # create a tensor of indices, which is used to retrieve the according positional embeddings below
    index_seq = data.new(range(data.size(1) - 1)).unsqueeze(0).expand(data.size(0), -1)
    
    # pretrain the encoder
    for epoch in range(NUM_EPOCHS):
        
        # embed input sequence + add positional embeddings
        input_seq = word_emb(data[:, :-1]) + pos_emb(index_seq)
        
        # encode the input sequence
        enc = encoder(input_seq, mask)
        
        # compute (unnormalized) next-word predictions from the encoded input sequences
        logits = enc.matmul(word_emb.weight.transpose(0, 1))
        
        # compute the loss
        optimizer.zero_grad()
        current_loss = loss(logits.view(-1, logits.size(-1)), data[:, 1:].contiguous().view(-1))
        print(f"EPOCH {epoch + 1:>3}:  LOSS = {current_loss.item()}")
        
        # update the model
        current_loss.backward()
        optimizer.step()

    # evaluate the probabilities of the training samples
    encoder.eval()
    input_seq = word_emb(data[:, :-1]) + pos_emb(index_seq)
    enc = encoder(input_seq, mask)
    log_probs = torch.log_softmax(enc.matmul(word_emb.weight.transpose(0, 1)), 2)
    sample_probs = []
    for sample_idx, sample_log_probs in enumerate(log_probs):
        sample_data = data[sample_idx][1:].unsqueeze(1)
        sample_log_probs = sample_log_probs.gather(1, sample_data) * (sample_data != PAD.index).float()
        sample_probs.append(sample_log_probs.sum().exp().item())
    print("\nSAMPLE PROBABILITIES:")
    for p in sample_probs:
        print("*", p)
Example 7
def main():

    # fetch the training data
    data, vocab = prepare_data()

    # create the word embeddings with word2vec and positional embeddings
    emb_model = word2vec.Word2Vec(
            sentences=data,
            size=EMBEDDING_SIZE,
            min_count=1
    )
    for word in vocab.keys():
        if word not in emb_model.wv:
            emb_model.wv[word] = np.zeros((EMBEDDING_SIZE,))
    word_emb_mat = nn.Parameter(
        data=torch.FloatTensor([emb_model.wv[word] for word in vocab.keys()]),
        requires_grad=False
    )
    word_emb = nn.Embedding(len(vocab), EMBEDDING_SIZE)
    word_emb.weight = word_emb_mat
    pos_emb = nn.Embedding(len(data[0]), EMBEDDING_SIZE)
    pos_emb.weight.requires_grad = True  # positional embeddings stay trainable

    # turn the dataset into a tensor of word indices
    data = torch.LongTensor([[vocab[word] for word in sample] for sample in data])
    
    # create the encoder, the pretraining loss, and the optimizer
    encoder = transformer.Encoder(
            NUM_LAYERS,  # num_layers
            NUM_HEADS,  # num_heads
            *DIMENSIONS,  # dim_model / dim_keys / dim_values
            DROPOUT_RATE,  # residual_dropout
            DROPOUT_RATE,  # attention_dropout
            PAD.index  # pad_index
    )
    loss = bert.MLMLoss(
            encoder,
            word_emb,
            pos_emb,
            MASK.index
    )
    optimizer = optim.Adam(
            itertools.chain(encoder.parameters(), loss.parameters()),
            lr=LEARNING_RATE
    )

    # move to GPU, if possible
    if GPU:
        data = data.cuda()
        encoder.cuda()
        loss.cuda()  # -> also moves embeddings to the GPU

    # pretrain the encoder
    for epoch in range(NUM_EPOCHS):
        
        # compute the loss
        optimizer.zero_grad()
        current_loss = loss(data)
        print("EPOCH", epoch + 1, ": LOSS =", current_loss.item())
        
        # update the model
        current_loss.backward()
        optimizer.step()
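bert.MLMLoss is likewise project-specific; as a rough illustration of the BERT-style corruption it presumably applies, here is a generic masking step in plain PyTorch (the 15% rate and the helper are assumptions about the objective, not the class's actual code):

import torch

def mask_tokens(batch: torch.Tensor, mask_index: int, prob: float = 0.15):
    # replace a random subset of token ids with the mask id; the loss is then
    # the cross-entropy of recovering the originals at the chosen positions
    chosen = torch.rand(batch.shape) < prob
    corrupted = batch.clone()
    corrupted[chosen] = mask_index
    return corrupted, chosen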
Example 8

# input for the keras model
tokens = tf.keras.layers.Input(shape=(seq_length, ),
                               dtype='int32')  # equivalent to a tf placeholder

# instantiates a tied softmax class
tied_embedding_softmax = TiedEmbeddingSoftmax()

# embedded tokens, before passing it to the transformer
embedded = tied_embedding_softmax(tokens, embed=True)

# the activations after passing it through the transformer
# for some odd reason, TPUs don't play well with specifying the arguments of the Encoder() function
# so you have to leave them at their defaults
transformed = transformer.Encoder()(embedded,
                                    training=False)  # calling the instance invokes its call() method

# pass the activations through our tied-softmax layer
# this time with embed=False denoting that we are doing the softmax operation
# and not a lookup
logits = tied_embedding_softmax(transformed, embed=False)

# finally, define the Keras model with inputs as tokens and outputs as the logits we just computed
model = tf.keras.Model(inputs=tokens, outputs=logits)


# the loss function is a simple sparse categorical crossentropy between the logits and the labels
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels,
                                                           logits,
                                                           from_logits=True)