Example #1
 def _encoder_decoder_attention(self, q, k, v, bias):
     with tf.variable_scope("encoder-decoder-attention"):
         attention = Attention(num_heads=self.num_heads,
                               mode="encoder-decoder-attention",
                               linear_key_dim=self.linear_key_dim,
                               linear_value_dim=self.linear_value_dim,
                               model_dim=self.model_dim,
                               dropout=self.dropout)
         return attention.multi_head(q, k, v, bias)
Example #2
 def _self_attention(self, q, k, v, seq_len):
     with tf.variable_scope("self-attention"):
         attention = Attention(num_heads=self.num_heads,
                               mode="encoder",
                               linear_key_dim=self.linear_key_dim,
                               linear_value_dim=self.linear_value_dim,
                               model_dim=self.model_dim,
                               dropout=self.dropout)
         return attention.multi_head(q, k, v, seq_len)
Example #3
 def _masked_self_attention(self, q, k, v, bias):
     with tf.variable_scope("masked-self-attention"):
         attention = Attention(num_heads=self.num_heads,
                               mode="masked-self-attention",
                               linear_key_dim=self.linear_key_dim,
                               linear_value_dim=self.linear_value_dim,
                               model_dim=self.model_dim,
                               dropout=self.dropout)
         return attention.multi_head(q, k, v, bias)
Example #4
 def _self_attention(self, q, k, v, future, sos, seq_len):
     with tf.variable_scope("self-attention"):
         attention = Attention(num_heads=self.num_heads,
                               masked=True,
                               linear_key_dim=self.linear_key_dim,
                               linear_value_dim=self.linear_value_dim,
                               model_dim=self.model_dim,
                               dropout=self.dropout,
                               batch_size=self.batch_size)
         return attention.multi_head(q, k, v, future, sos, seq_len)
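Examples #1-#4 are thin wrappers that delegate to `Attention.multi_head`; the class itself is not shown in these snippets. As a rough orientation, here is a minimal TF1-style sketch of scaled dot-product multi-head attention using the constructor arguments seen above. This is a hypothetical reconstruction, not the library's actual class, and the extra `bias`, `seq_len`, `future`, and `sos` arguments of the real implementations are omitted:

import tensorflow as tf  # TF 1.x

class Attention:
    """Hypothetical sketch: scaled dot-product attention split across heads."""

    def __init__(self, num_heads=8, linear_key_dim=64, linear_value_dim=64,
                 model_dim=512, dropout=0.1, masked=False):
        # Assumes linear_key_dim and linear_value_dim are divisible by num_heads.
        self.num_heads = num_heads
        self.linear_key_dim = linear_key_dim
        self.linear_value_dim = linear_value_dim
        self.model_dim = model_dim
        self.dropout = dropout
        self.masked = masked

    def multi_head(self, q, k, v):
        # Linear projections, then split the last dimension into heads.
        q = tf.layers.dense(q, self.linear_key_dim, use_bias=False)
        k = tf.layers.dense(k, self.linear_key_dim, use_bias=False)
        v = tf.layers.dense(v, self.linear_value_dim, use_bias=False)
        q, k, v = (self._split_heads(x) for x in (q, k, v))

        # Scaled dot-product attention per head.
        logits = tf.matmul(q, k, transpose_b=True)
        logits /= tf.sqrt(tf.cast(self.linear_key_dim // self.num_heads, tf.float32))
        weights = tf.nn.softmax(logits)
        context = tf.matmul(weights, v)

        # Merge heads back and project to model_dim.
        output = self._merge_heads(context)
        output = tf.layers.dense(output, self.model_dim)
        return tf.nn.dropout(output, keep_prob=1.0 - self.dropout)

    def _split_heads(self, x):
        # [batch, seq, dim] -> [batch, heads, seq, dim // heads]
        batch, seq = tf.shape(x)[0], tf.shape(x)[1]
        depth = x.get_shape().as_list()[-1] // self.num_heads
        x = tf.reshape(x, [batch, seq, self.num_heads, depth])
        return tf.transpose(x, [0, 2, 1, 3])

    def _merge_heads(self, x):
        # [batch, heads, seq, depth] -> [batch, seq, heads * depth]
        x = tf.transpose(x, [0, 2, 1, 3])
        batch, seq = tf.shape(x)[0], tf.shape(x)[1]
        depth = x.get_shape().as_list()[-1]
        return tf.reshape(x, [batch, seq, self.num_heads * depth])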
Example #5
    def define_layer(self):
        # Different placeholders
        with tf.name_scope('Input'):
            self.batch_ph = tf.placeholder(tf.int32, [None, self.sequence_length], name='batch_ph')
            self.target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
            self.keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')
            
        # Embedding layer
        with tf.name_scope('Embedding_layer'):
            self.embeddings_var = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_dim], -1.0, 1.0), trainable=True)
            tf.summary.histogram('embeddings_var', self.embeddings_var)
            self.batch_embedded = tf.nn.embedding_lookup(self.embeddings_var, self.batch_ph)
            print('self.batch_embedded:', self.batch_embedded.shape)  # shape [?, self.sequence_length, self.embedding_dim]
            
        # Positional encoding
        with tf.name_scope('positional_encoding'):
            positional_encoded = self.positional_encoding(self.model_dim,
                                                          self.sequence_length,
                                                          dtype=tf.float32)

            # Index the position table by position (0..sequence_length-1) for every
            # example in the batch, not by token id; assumes model_dim == embedding_dim.
            position_inputs = tf.tile(tf.expand_dims(tf.range(self.sequence_length), 0),
                                      [tf.shape(self.batch_ph)[0], 1])
            position_embedded = tf.nn.embedding_lookup(positional_encoded, position_inputs)
            print('position_embedded.shape:', position_embedded.shape)

            encoded_inputs = tf.add(self.batch_embedded, position_embedded)

        encoder_emb_inp = tf.nn.dropout(encoded_inputs, 1.0 - self.dropout)
        
        # self-attention
        with tf.name_scope('self-attention'):
            o1 = tf.identity(encoder_emb_inp)
            attention_ = Attention(num_heads=self.num_head,
                                   masked=False,
                                   linear_key_dim=self.linear_key_dim,
                                   linear_value_dim=self.vocab_size,
                                   model_dim=self.model_dim,
                                   dropout=self.dropout)
            multi_head_output = attention_.multi_head(o1, o1, o1)

        o2 = tf.contrib.layers.layer_norm(tf.add(o1, multi_head_output))
        # ffn = FFN(w1_dim=self.model_dim, w2_dim=self.model_dim, dropout=self.dropout)
        o1 = tf.identity(o2)

        # multi-layers
        # with tf.name_scope('multi-layers'):
        #     for i in range(1,self.num_layers+1):
        #         with tf.variable_scope(f'layer-{i}'):
        #             o2 = tf.contrib.layers.layer_norm(tf.add(o1,multi_head_output))
        #             ffn = FFN(w1_dim=self.model_dim,w2_dim=self.model_dim,dropout=self.dropout)
        #             o21 = ffn.dense_relu_dense(o2)
        #             o3 = tf.contrib.layers.layer_norm(tf.add(o2,o21))
        #             o1 = tf.identity(o3)

        # Collapse the model dimension, then apply a single fully connected layer for the logit.
        o4 = tf.reduce_sum(o1, axis=2)
        print('o4.shape:', o4.shape)
        # add_layer
        with tf.name_scope('fully_layer'):
            inputsize = int(o4.shape[-1])
            w = tf.Variable(tf.random_normal([inputsize, 1], -0.05, 0.05))
            f = tf.matmul(o4, w)
            y_hat = tf.squeeze(f)
        
        with tf.name_scope('Metrics'):
            # Cross-entropy loss and optimizer initialization
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=self.target_ph))
            tf.summary.scalar('loss', loss)
            optimizer = tf.train.AdamOptimizer(learning_rate=1e-3,
                                               beta1=0.9,
                                               beta2=0.999,
                                               epsilon=1e-08,
                                               use_locking=True).minimize(loss)

            accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), self.target_ph), tf.float32))
            tf.summary.scalar('accuracy', accuracy)

        y_hat_ = tf.round(tf.sigmoid(y_hat))
        merged = tf.summary.merge_all()  # merge all summary ops
        # Batch generators
        train_batch_generator = self.batch_generator(self.X_train, self.y_train, self.batch_size)
        test_batch_generator = self.batch_generator(self.X_test, self.y_test, self.batch_size)

        session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
        saver = tf.train.Saver()

        train_writer = tf.summary.FileWriter('./logdir/train', accuracy.graph)
        test_writer = tf.summary.FileWriter('./logdir/test', accuracy.graph)

        with tf.Session(config=session_conf) as sess:
            sess.run(tf.global_variables_initializer())
            print('Start learning...')
            fp = open('./result.txt', 'w', encoding='utf8')
            for epoch in range(self.numb_epoch):
                fp.write(str(epoch) + '\n')
                print('epoch:%d' % epoch)
                loss_train = 0
                loss_test = 0
                accuracy_train = 0
                accuracy_test = 0

                print('epoch:{}\t'.format(epoch), end="")
                # Training
                num_batches = self.X_train.shape[0] // self.batch_size
                train_yT, train_pred, alphas_qq, alphas_qq1 = [], [], [], []
                for b in tqdm(range(num_batches)):
                    x_batch, y_batch, indice = next(train_batch_generator)
                    loss_tr, acc, _, summary, y_pred = sess.run([loss, accuracy, optimizer, merged, y_hat_],
                                                                feed_dict={self.batch_ph: x_batch,
                                                                           self.target_ph: y_batch,
                                                                           self.keep_prob_ph: self.keep_prob})

                    train_yT.extend(y_batch.tolist())
                    train_pred.extend(y_pred.tolist())
                    accuracy_train += acc
                    loss_train = loss_tr * self.delta + loss_train * (1 - self.delta)
                    train_writer.add_summary(summary, b + num_batches * epoch)
                accuracy_train /= num_batches
                precision_train = metrics.precision_score(train_yT, train_pred, average='macro')
                recall_train = metrics.recall_score(train_yT, train_pred)
                f1_train = metrics.f1_score(train_yT, train_pred)
                print('loss:{:.4f},acc:{:.4f},precision:{:.4f},recall:{:.4f},f1_score:{:.4f}'.format(
                    loss_train, accuracy_train, precision_train, recall_train, f1_train))
                fp.write('train_loss:' + str(loss_train) + ' ' + 'train_acc:' + str(
                    accuracy_train) + ' ' + 'train_precision:' + str(precision_train) +
                         ' ' + 'train_recall:' + str(recall_train) + ' ' + 'train_f1_score:' + str(f1_train) + '\n')

                # Testing
                test_yT, test_pred = [], []
                num_batches = self.X_test.shape[0] // self.batch_size
                for b in tqdm(range(num_batches)):
                    x_batch, y_batch, indice = next(test_batch_generator)
                    loss_test_batch, acc, summary, y_pred = sess.run([loss, accuracy, merged, y_hat_],
                                                                     feed_dict={self.batch_ph: x_batch,
                                                                                self.target_ph: y_batch,
                                                                                self.keep_prob_ph: 1.0})

                    test_yT.extend(y_batch.tolist())
                    test_pred.extend(y_pred.tolist())
                    accuracy_test += acc
                    loss_test += loss_test_batch
                    test_writer.add_summary(summary, b + num_batches * epoch)
                accuracy_test /= num_batches
                loss_test /= num_batches
                precision_test = metrics.precision_score(test_yT, test_pred)
                recall_test = metrics.recall_score(test_yT, test_pred)
                f1_test = metrics.f1_score(test_yT, test_pred)
                print(
                    'loss_test:{:.4f},accuracy_test:{:.4f},precision_test:{:.4f},recall_test:{:.4f},f1_score_test:{:.4f}'.format(
                        loss_test, accuracy_test, precision_test, recall_test, f1_test))
                fp.write('test_loss:' + str(loss_test) + ' ' + 'test_acc:' + str(
                    accuracy_test) + ' ' + 'test_precision:' + str(precision_test) +
                         ' ' + 'test_recall:' + str(recall_test) + ' ' + 'test_f1_score:' + str(f1_test) + '\n')
                saver.save(sess, self.model_path + str(epoch))

            train_writer.close()
            test_writer.close()
            fp.close()
            print("Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.")
Example #6
    def ocr_test(self, vocab_size, max_seq=25, batch_size=8):
        """
            Builds the graph, returns the "important" ops
        """
        inputs = tf.placeholder(tf.float32, [None, 128, 400, 3])
        output = tf.placeholder(tf.int32, [None, max_seq])
        length = tf.placeholder(tf.int32, [None])
        resnet_34 = ResNet(34, 10)

        def resnet_34_backbone(x):
            out = resnet_34.network(x, is_training=False)
            print(out)
            return out

        feature_map_resnet = resnet_34_backbone(
            inputs)  #feature map of resnet 34
        feature_map = transform_dimension(feature_map_resnet, 1024)
        # print("feature map: ", feature_map)
        for i in range(6):
            global_representation = bottle_resblock(
                feature_map_resnet if i == 0 else global_representation,
                512,
                is_training=False,
                scope='bottle_resblock_' + str(i))
        global_representation = global_avg_pooling(global_representation)
        global_representation = fully_conneted(global_representation, 512)

        ##########################################################DECODER########################################
        def decoder_embedding(y, vocab_size, embed_size=512, shifted=True):
            embeddings = tf.random_normal(shape=(vocab_size, embed_size))
            embedded = tf.nn.embedding_lookup(embeddings, y)
            return embedded

        def positional_encoding(x):
            """
                Not as described in the original paper, since it lacked a proper description of this step.
                This function is instead based on the "Attention Is All You Need" paper.
            """
            seq_len, dim = x.get_shape().as_list()[-2:]
            encoded_vec = np.array([
                pos / np.power(10000, 2 * i / dim) for pos in range(seq_len)
                for i in range(dim)
            ])
            encoded_vec[::2] = np.sin(encoded_vec[::2])
            encoded_vec[1::2] = np.cos(encoded_vec[1::2])
            encoded_vec_tensor = tf.convert_to_tensor(encoded_vec.reshape(
                [seq_len, dim]),
                                                      dtype=tf.float32)
            return tf.add(x, encoded_vec_tensor)

        def layer_norm(x):
            """
                Layer normalization as described in the paper (p. 4).
            """
            return tf.contrib.layers.layer_norm(x)

        y = decoder_embedding(output, vocab_size)

        y = tf.pad(y, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # shift right, as in the official Transformer
        # print("embedding: ", y)
        y = positional_encoding(y)
        # print("PE: ", y)               #(bs, seq_len, 512)

        #concatenate with global representation
        decoder_input = []
        for i in range(y.get_shape().as_list()[1]):
            decoder_input.append(
                tf.concat([global_representation, y[:, i, :]], 1))  # each step: (bs, 1024)
        decoder_input = tf.stack(decoder_input, 1)  # (bs, seq_len, 1024)

        ####MASKED SELF ATTENTION###
        masked_self_attention = Attention(dropout=0)
        decoder_output = masked_self_attention.multi_head(
            decoder_input, decoder_input, decoder_input)
        norm_1 = layer_norm(decoder_output)
        decoder_output = decoder_input + norm_1

        ###2D self attention###
        two_D_attention = Attention(masked=False, dropout=0)
        # Flatten the spatial feature map into a sequence of H*W positions.
        num_positions = (feature_map.get_shape().as_list()[1] *
                         feature_map.get_shape().as_list()[2])
        enc_reshape = tf.reshape(
            feature_map,
            [-1, num_positions, decoder_output.get_shape().as_list()[-1]])
        decoder_output_2 = two_D_attention.multi_head(decoder_output,
                                                      enc_reshape, enc_reshape)
        norm_2 = layer_norm(decoder_output_2)
        decoder_output = decoder_output + norm_2

        def position_wise_feed_forward_network(x):
            """
                Position-wise Feed-Forward Network as described in the paper (p. 4).
            """

            # First linear
            #linear_1 = tf.layers.dense(x, x.get_shape().as_list()[-1])
            linear_1 = tf.layers.conv1d(x, 2048, 1)

            # ReLU operation
            relu_1 = tf.nn.relu(linear_1)

            # Second linear
            linear_2 = tf.layers.conv1d(relu_1, x.get_shape().as_list()[-1], 1)

            return tf.nn.dropout(linear_2, 1)

        pwff = position_wise_feed_forward_network(decoder_output)
        norm_3 = layer_norm(pwff)
        decoder_output = decoder_output + norm_3

        # output_probabilities = tf.layers.dense(decoder_output, vocab_size, activation=tf.contrib.layers.softmax)
        output_probabilities = tf.layers.dense(decoder_output, vocab_size)

        ids, log_probs, scores = self.char_predictions(output_probabilities,
                                                       vocab_size, max_seq)
        probs = tf.nn.softmax(output_probabilities)
        init = tf.global_variables_initializer()
        return inputs, output, length, scores, ids, probs, init
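A hedged sketch of how the ops returned by `ocr_test` might be consumed at inference time; the class name `OCRModel` and the dummy arrays are assumptions, not part of the source:

import numpy as np
import tensorflow as tf  # TF 1.x

model = OCRModel()  # assumed wrapper class exposing ocr_test
inputs, output, length, scores, ids, probs, init = model.ocr_test(vocab_size=80)

images = np.zeros([8, 128, 400, 3], dtype=np.float32)  # dummy batch of images
targets = np.zeros([8, 25], dtype=np.int32)            # dummy target ids (decoder input)
seq_lens = np.full([8], 25, dtype=np.int32)

with tf.Session() as sess:
    sess.run(init)
    pred_ids, pred_scores = sess.run(
        [ids, scores],
        feed_dict={inputs: images, output: targets, length: seq_lens})
    print(pred_ids.shape, pred_scores.shape)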
Example #7
    def convolutional_attention_network(self,
                                        vocab_size,
                                        max_seq=25,
                                        batch_size=8):
        """
            Builds the graph
        """
        inputs = tf.placeholder(tf.float32, [batch_size, 128, 400, 3])
        output = tf.placeholder(tf.int32, [batch_size, max_seq])
        length = tf.placeholder(tf.int32, [batch_size])
        resnet_34 = ResNet(34, 10)

        def resnet_34_backbone(x):
            out = resnet_34.network(x)
            print(out)
            return out

        feature_map_resnet = resnet_34_backbone(
            inputs)  #feature map of resnet 34
        feature_map = transform_dimension(feature_map_resnet, 1024)
        for i in range(6):
            global_representation = bottle_resblock(
                feature_map_resnet if i == 0 else global_representation,
                512,
                scope='bottle_resblock_' + str(i))
        global_representation = global_avg_pooling(global_representation)
        global_representation = fully_conneted(global_representation, 512)

        ##########################################################DECODER########################################
        def decoder_embedding(y, vocab_size, embed_size=512, shifted=True):
            # NOTE: tf.random_normal yields a fresh, non-trainable tensor, so these
            # embeddings are re-sampled on every run and are not learned.
            embeddings = tf.random_normal(shape=(vocab_size, embed_size))
            embedded = tf.nn.embedding_lookup(embeddings, y)
            return embedded

        def positional_encoding(x):
            seq_len, dim = x.get_shape().as_list()[-2:]
            encoded_vec = np.array([
                pos / np.power(10000, 2 * i / dim) for pos in range(seq_len)
                for i in range(dim)
            ])
            encoded_vec[::2] = np.sin(encoded_vec[::2])
            encoded_vec[1::2] = np.cos(encoded_vec[1::2])
            encoded_vec_tensor = tf.convert_to_tensor(encoded_vec.reshape(
                [seq_len, dim]),
                                                      dtype=tf.float32)
            return tf.add(x, encoded_vec_tensor)

        def layer_norm(x):
            return tf.contrib.layers.layer_norm(x)

        y = decoder_embedding(output, vocab_size)

        y = tf.pad(y, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # shift right, as in the official Transformer
        y = positional_encoding(y)  #(bs, seq_len, 512)

        #concatenate with global representation
        decoder_input = []
        for i in range(y.get_shape().as_list()[1]):
            decoder_input.append(
                tf.concat([global_representation, y[:, i, :]], 1))  # each step: (bs, 1024)
        decoder_input = tf.stack(decoder_input, 1)  #(bs, seq_len, 1024)

        ####MASKED SELF ATTENTION###
        masked_self_attention = Attention(dropout=0)
        decoder_output = masked_self_attention.multi_head(
            decoder_input, decoder_input, decoder_input)
        norm_1 = layer_norm(decoder_output)
        decoder_output = decoder_input + norm_1

        ###2D self attention###
        two_D_attention = Attention(masked=False, dropout=0)
        enc_reshape = tf.reshape(feature_map, [
            decoder_output.get_shape().as_list()[0], -1,
            decoder_output.get_shape().as_list()[-1]
        ])
        decoder_output_2 = two_D_attention.multi_head(decoder_output,
                                                      enc_reshape, enc_reshape)
        norm_2 = layer_norm(decoder_output_2)
        decoder_output = decoder_output + norm_2

        def position_wise_feed_forward_network(x):  #using conv1D
            # First linear
            linear_1 = tf.layers.conv1d(x, 2048, 1)
            # ReLU operation
            relu_1 = tf.nn.relu(linear_1)
            # Second linear
            linear_2 = tf.layers.conv1d(relu_1, x.get_shape().as_list()[-1], 1)
            return tf.nn.dropout(linear_2, 1)

        pwff = position_wise_feed_forward_network(decoder_output)
        norm_3 = layer_norm(pwff)
        decoder_output = decoder_output + norm_3

        output_probabilities = tf.layers.dense(decoder_output, vocab_size)

        loss = self._compute_loss(output_probabilities, output, length,
                                  batch_size)
        ids, log_probs, scores = self.char_predictions(output_probabilities,
                                                       vocab_size, max_seq)
        char_acc = char_accuracy(ids, output, 0)
        word_acc = sequence_accuracy(ids, output, 0)

        with tf.name_scope('summaries'):
            tf.summary.scalar("loss", loss, collections=["train_summary"])
            tf.summary.scalar("character accuracy",
                              char_acc,
                              collections=["train_summary"])
            tf.summary.scalar("word accuracy",
                              word_acc,
                              collections=["train_summary"])

        summary_op = tf.summary.merge_all(key='train_summary')

        optimizer = tf.train.AdadeltaOptimizer(learning_rate=1).minimize(loss)

        init = tf.global_variables_initializer()
        return inputs, output, length, loss, optimizer, output_probabilities, summary_op, init, word_acc
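In both decoder graphs above, the first attention block relies on the `Attention` class's default masking (`Attention(dropout=0)`), while the second block passes `masked=False`. The mask itself is hidden inside the class, which is not shown; below is a minimal sketch of how a causal (look-ahead) mask is commonly built and applied to the pre-softmax scores in TF1, purely as an illustration of the technique rather than the class's actual code:

import tensorflow as tf  # TF 1.x

def apply_causal_mask(logits):
    # logits: [batch, heads, seq_q, seq_k] attention scores before softmax.
    # Positions to the right of the diagonal (future tokens) get a large
    # negative value so that softmax assigns them ~zero weight.
    seq_len = tf.shape(logits)[-1]
    lower_tri = tf.linalg.band_part(tf.ones([seq_len, seq_len]), -1, 0)  # lower-triangular ones
    mask = tf.reshape(lower_tri, [1, 1, seq_len, seq_len]) * tf.ones_like(logits)
    return tf.where(tf.equal(mask, 0), -1e9 * tf.ones_like(logits), logits)

Inside a `multi_head` implementation, such a mask would be applied to the q·kᵀ logits just before the softmax whenever masking is enabled.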