Example 1
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>
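            # e.g. if y = [[4, 7, 9, 0]] (0 = <PAD>), decoder_inputs = [[2, 4, 7, 9]]:
            # the targets are shifted right one step and prefixed with the <S> token (id 2).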

            # Load vocabulary
            en2idx, idx2en = load_en_vocab()
            zh2idx, idx2zh = load_zh_vocab()

            # initialize transformer
            transformer = vanilla_transformer(hp, is_training)
            self.enc = transformer.encode(self.x, len(en2idx))

            # Decoder
            self.dec = transformer.decode(self.decoder_inputs, self.enc,
                                          len(zh2idx), hp.maxlen)

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(zh2idx))
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
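            # Accuracy is computed over non-<PAD> positions only: istarget zeroes out
            # padding, so acc = (#correctly predicted real tokens) / (#real tokens).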
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # Loss
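                # label_smoothing softens the one-hot targets before the softmax
                # cross-entropy; the per-token loss is then averaged over non-<PAD>
                # positions only, mirroring the masked accuracy above.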
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(zh2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example 2
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        self.vocab_size = len(
            load_doc_vocab()[0])  # load_doc_vocab returns: de2idx, idx2de

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32,
                                        shape=(None, hp.article_maxlen))
                self.y = tf.placeholder(tf.int32,
                                        shape=(None, hp.summary_maxlen))

            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
                -1)  # 2:<S> # define decoder inputs

            self._add_encoder(is_training=is_training)
            self.ml_loss = self._add_ml_loss(is_training=is_training)
            self.loss = self.ml_loss

            if is_training:
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)

                grads_and_vars_ml = self.optimizer.compute_gradients(
                    loss=self.ml_loss)
                grad_ml, vars_ml = zip(
                    *grads_and_vars_ml)  # parse grad and var

                # add gradient clipping
                clipped_grad_ml, globle_norm_ml = tf.clip_by_global_norm(
                    grad_ml, hp.maxgradient)
                self.globle_norm_ml = globle_norm_ml
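                # tf.clip_by_global_norm rescales all gradients jointly so that their
                # combined L2 norm is at most hp.maxgradient; globle_norm_ml is the
                # pre-clipping norm, logged below to watch for exploding gradients.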
                self.train_op_ml = self.optimizer.apply_gradients(
                    grads_and_vars=zip(clipped_grad_ml, vars_ml),
                    global_step=self.global_step)
                '''
                # training without gradient clipping
                self.train_op_ml  = self.optimizer.apply_gradients(grads_and_vars=grads_and_vars_ml,
                                                                   global_step=self.global_step)
                '''

                # Summary
                tf.summary.scalar('globle_norm_ml', globle_norm_ml)
                tf.summary.scalar('loss', self.loss)

                self.merged = tf.summary.merge_all()

        self.filewriter = tf.summary.FileWriter(hp.tb_dir + '/train',
                                                self.graph)
Example 3
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            # inputs
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, 9, 9)
            else:
                self.x = tf.placeholder(tf.float32, (None, 9, 9))
                self.y = tf.placeholder(tf.int32, (None, 9, 9))
            self.enc = tf.expand_dims(self.x, axis=-1)  # (N, 9, 9, 1)
            self.istarget = tf.to_float(tf.equal(self.x, tf.zeros_like(
                self.x)))  # 0: blanks

            # network
            for i in range(hp.num_blocks):
                with tf.variable_scope("conv2d_{}".format(i)):
                    self.enc = conv(self.enc,
                                    filters=hp.num_filters,
                                    size=hp.filter_size,
                                    is_training=is_training,
                                    norm_type="bn",
                                    activation_fn=tf.nn.relu)

            # outputs
            self.logits = conv(self.enc, 10, 1, scope="logits")  # (N, 9, 9, 10)
            self.probs = tf.reduce_max(tf.nn.softmax(self.logits),
                                       axis=-1)  #(N, 9, 9)
            self.preds = tf.to_int32(tf.arg_max(self.logits,
                                                dimension=-1))  #(N, 9, 9)

            # accuracy
            self.hits = tf.to_float(tf.equal(self.preds,
                                             self.y)) * self.istarget
            self.acc = tf.reduce_sum(
                self.hits) / (tf.reduce_sum(self.istarget) + 1e-8)
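            # Only blank cells (zeros in the input grid) count toward accuracy; the
            # 1e-8 term guards against division by zero if a batch contains no blanks.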
            tf.summary.scalar("acc", self.acc)

            if is_training:
                # Loss
                self.ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.y, logits=self.logits)
                self.loss = tf.reduce_sum(
                    self.ce * self.istarget) / (tf.reduce_sum(self.istarget))

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                self.train_op = self.optimizer.minimize(
                    self.loss, global_step=self.global_step)
                tf.summary.scalar("loss", self.loss)

            self.merged = tf.summary.merge_all()
Example 4
    def Graph(self):
        graph = tf.Graph()
        with graph.as_default():
            if self.is_training:
                next_element, iterator, num_batch = get_batch_data(self.is_training)
                self.X, self.Y, self.seq_len = next_element["X"], next_element["Y"], next_element["seq_len"]

            else:
                self.X = tf.placeholder(tf.int32, shape=(None, config.maxlen))
                self.Y = tf.placeholder(tf.int32, shape=(None, config.maxlen))
                self.seq_len = tf.placeholder(tf.int32, shape=(None,))
            idx2word, word2idx, idx2labl, labl2idx = load_vocab()
            embed = embedding(self.X, len(word2idx), config.embed_dim,
                              config.use_pretrain)

            if config.embeddig_mode == "concat":
                assert config.embed_dim == config.position_dim
                # TODO: not yet sure how best to complete this part (see the sketch below)

            elif config.embeddig_mode == "add":
                embed += position_encoding(self.X, config.position_dim,
                                           config.sinusoid)
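            # A minimal sketch (an assumption, not the author's implementation) of the
            # "concat" mode referenced in the TODO above: concatenate the positional
            # encoding along the feature axis instead of adding it, e.g.
            #   pos = position_encoding(self.X, config.position_dim, config.sinusoid)
            #   embed = tf.concat([embed, pos], axis=-1)
            # Downstream layers would then see config.embed_dim + config.position_dim
            # features per token.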
            # Input embedding dropout
            embed = tf.layers.dropout(embed, rate=config.dropout_rate,
                                      training=self.is_training)
            # Multi-layer BiLSTM
            outputs = multibilstm(embed, self.seq_len, config.num_units,
                                  config.num_layer, self.is_training, config.cell)

            # Fully connected layers
            # Two fully connected layers are used here; residual connections and the
            # activation can be configured.
            outputs = feedforward(outputs, outputs.get_shape().as_list()[2],
                                  scope="first")  # residual used by default
            outputs = feedforward(outputs, config.num_class, residual=False,
                                  scope="second")
            noutput = tf.reshape(outputs, [-1, config.maxlen, config.num_class])

            # crf layer
            if config.use_crf:
                loss, acc, predicts, true_labels = crf_layer(
                    self.Y, noutput, config.num_class, self.seq_len,
                    self.is_training)
            else:
                loss, acc, predicts, true_labels = loss_layer(
                    self.Y, noutput, config.num_class)
            tf.summary.scalar('acc', acc)
            global_step = tf.Variable(0, name='global_step', trainable=False)
            if self.is_training:
                # use exponential_decay to help the model converge faster
                if config.exponential_decay:
                    learning_rate = tf.train.exponential_decay(
                        config.lr, global_step, 200, 0.96, staircase=True)
                else:
                    learning_rate = config.lr
                # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.99, epsilon=1e-8)
                optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
                train_op = optimizer.minimize(loss, global_step=global_step)
                tf.summary.scalar('mean_loss', loss)
            else:
                train_op = None
            return graph, train_op, loss, acc, predicts, true_labels, global_step
Example 5
 def __init__(self, is_training):
     self.de2idx, _idx2de = load_de_vocab()
     self.en2idx, _idx2en = load_en_vocab()
     self.is_training = is_training
     self.graph = tf.Graph()
     with self.graph.as_default():
         if self.is_training:
             self.x, self.y, self.num_batch = get_batch_data() # (N, T)
         else: # inference
             self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
             self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
         self.x_len = tf.reduce_sum(tf.to_int32(tf.not_equal(self.x, 0)), axis=-1)  # number of non-<PAD> tokens
         self.y_len = tf.reduce_sum(tf.to_int32(tf.not_equal(self.y, 0)), axis=-1)
         self.global_step = tf.Variable(0, name='global_step', trainable=False)
         self.batch_size = tf.shape(self.x)[0]
Example 6
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Load data
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, 60))

            # Load vocabulary
            nucl2idx, idx2nucl = load_vocab()

            # Encoder
            ## Embedding
            enc = embedding(self.x,
                            zero_pad=False,
                            vocab_size=len(nucl2idx),
                            num_units=hp.hidden_units,
                            scale=False,
                            scope="enc_embed")

            # Encoder pre-net
            prenet_out = prenet(
                enc,
                num_units=[hp.hidden_units, hp.hidden_units // 2],
                dropout_rate=hp.dropout_rate,
                is_training=is_training)  # (N, T, E/2)

            # Encoder CBHG
            ## Conv1D bank
            enc = conv1d_banks(prenet_out,
                               K=hp.encoder_num_banks,
                               num_units=hp.hidden_units // 2,
                               norm_type=hp.norm_type,
                               is_training=is_training)  # (N, T, K * E / 2)

            # ### Max pooling
            # enc = tf.layers.max_pooling1d(enc, 2, 2, padding="same")  # (N, T, K * E / 2)

            ### Conv1D projections
            enc = conv1d(enc, hp.hidden_units // 2, 3,
                         scope="conv1d_1")  # (N, T, E/2)
            enc = normalize(enc,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=tf.nn.relu,
                            scope="norm1")
            enc = conv1d(enc, hp.hidden_units // 2, 3,
                         scope="conv1d_2")  # (N, T, E/2)
            enc = normalize(enc,
                            type=hp.norm_type,
                            is_training=is_training,
                            activation_fn=tf.nn.relu,
                            scope="norm2")
            enc += prenet_out  # (N, T, E/2) # residual connections

            ### Highway Nets
            for i in range(hp.num_highwaynet_blocks):
                enc = highwaynet(
                    enc,
                    num_units=hp.hidden_units // 2,
                    scope='highwaynet_{}'.format(i))  # (N, T, E/2)

            # Final linear projection
            _, T, E = enc.get_shape().as_list()
            enc = tf.reshape(enc, (-1, T * E))
            self.logits = tf.squeeze(tf.layers.dense(enc, 1))
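            # Flattening the (T, E) features into one vector and projecting to a single
            # unit yields one scalar prediction per input sequence.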

            if is_training:
                # Loss
                if hp.loss_type == "l1":
                    self.loss = tf.reduce_mean(tf.abs(self.logits - self.y))
                else:  # l2
                    self.loss = tf.reduce_mean(
                        tf.squared_difference(self.logits, self.y))

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr)
                self.train_op = self.optimizer.minimize(
                    self.loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('loss', self.loss)
                self.merged = tf.summary.merge_all()
Example 7
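        # Fragment of a TF2-style training step. Presumably (an assumption) it sits
        # inside something like
        #   @tf.function
        #   def train_step(inputs, targets):
        #       tar_inp, tar_real = targets[:, :-1], targets[:, 1:]
        #       ... build encoder_padding_mask, look_ahead_mask, decoder_padding_mask ...
        #       with tf.GradientTape() as tape:
        # so that `tape`, `tar_inp`, `tar_real` and the masks used below are defined.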
        pred, _ = transformer(inputs, tar_inp, True, encoder_padding_mask,
                              look_ahead_mask, decoder_padding_mask)
        loss = loss_fun(tar_real, pred)
        # compute gradients
        gradients = tape.gradient(loss, transformer.trainable_variables)
        # backpropagation: apply the gradients
        optimizer.apply_gradients(
            zip(gradients, transformer.trainable_variables))
        # record loss and accuracy
        train_loss(loss)
        train_acc(tar_real, pred)


for epoch in range(hp.EPOCHS):
    start_time = time.time()
    # reset the metrics
    train_loss.reset_states()
    train_acc.reset_states()
    for step, (inputs, targets) in enumerate(get_batch_data()):
        print(inputs)
        train_step(inputs, targets)
        if step % 10 == 0:
            print(' epoch{},step:{}, loss:{:.4f}, acc:{:.4f}'.format(
                epoch, step, train_loss.result(), train_acc.result()))
    if epoch % 2 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('epoch{}, save model at {}'.format(epoch, ckpt_save_path))
    print('epoch:{}, loss:{:.4f}, acc:{:.4f}'.format(epoch,
                                                     train_loss.result(),
                                                     train_acc.result()))
    print('time in one epoch:{}'.format(time.time() - start_time))
Example 8
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

            # Load vocabulary
            en2idx, idx2en = load_en_vocab()
            ch2idx, idx2ch = load_ch_vocab()

            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)

                        ### Feed Forward
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(ch2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                ## Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            # Project the last dimension to vocabulary size to get per-token logits; the
            # argmax index over the vocabulary is the predicted token.
            self.logits = tf.layers.dense(self.dec,
                                          len(ch2idx))  #(N, T, vocab_len)
            self.preds = tf.to_int32(tf.arg_max(self.logits,
                                                dimension=-1))  # (N, T)
            # Every non-<PAD> position in y becomes 1.0, all others 0.0.
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            # acc = (number of correctly predicted non-<PAD> tokens in the batch)
            #       / (number of non-<PAD> tokens in the batch)
            # tip: tf.reduce_sum() with no axis sums over all dimensions.
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / (tf.reduce_sum(self.istarget))
            # Log acc to the summary to monitor training.
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # Loss
                # tf.one_hot(tensor, depth) builds a tensor with 1.0 at the index given by
                # each value and 0.0 elsewhere; an index >= depth yields an all-zero row.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(ch2idx)))  # one_hot makes y_smoothed (N, T, vocab_len)
                # tf.nn.softmax_cross_entropy_with_logits:
                # 1. applies softmax to the logits;
                # 2. takes the cross-entropy between that distribution over vocab_len and
                #    the labels, yielding an (N, T) tensor, i.e. one loss value per token.
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)  # (N, T)
                # Drop the loss at <PAD> positions, then take the mean over real tokens.
                self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / (
                    tf.reduce_sum(self.istarget))  # scalar

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example 9
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data() # (N, T)
            else: # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1])*2, self.y[:, :-1]), -1) # 2:<S>

            # Load vocabulary    
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()
            
            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x, 
                                      vocab_size=len(de2idx), 
                                      num_units=hp.hidden_units, 
                                      scale=True,
                                      scope="enc_embed")
                
                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="enc_pe")
                    
                 
                ## Dropout
                self.enc = tf.layers.dropout(self.enc, 
                                            rate=hp.dropout_rate, 
                                            training=tf.convert_to_tensor(is_training))
                
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(queries=self.enc, 
                                                        keys=self.enc, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads, 
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training,
                                                        causality=False)
                        
                        ### Feed Forward
                        self.enc = feedforward(self.enc, num_units=[4*hp.hidden_units, hp.hidden_units])
            
            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs, 
                                      vocab_size=len(en2idx), 
                                      num_units=hp.hidden_units,
                                      scale=True, 
                                      scope="dec_embed")
                
                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                      num_units=hp.hidden_units,
                                      zero_pad=False,
                                      scale=False,
                                      scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen, 
                                      num_units=hp.hidden_units, 
                                      zero_pad=False, 
                                      scale=False,
                                      scope="dec_pe")
                
                ## Dropout
                self.dec = tf.layers.dropout(self.dec, 
                                            rate=hp.dropout_rate, 
                                            training=tf.convert_to_tensor(is_training))
                
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(queries=self.dec, 
                                                        keys=self.dec, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads, 
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training,
                                                        causality=True, 
                                                        scope="self_attention")
                        
                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(queries=self.dec, 
                                                        keys=self.enc, 
                                                        num_units=hp.hidden_units, 
                                                        num_heads=hp.num_heads,
                                                        dropout_rate=hp.dropout_rate,
                                                        is_training=is_training, 
                                                        causality=False,
                                                        scope="vanilla_attention")
                        
                        ## Feed Forward
                        self.dec = feedforward(self.dec, num_units=[4*hp.hidden_units, hp.hidden_units])
                
            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y))*self.istarget)/ (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)
                
            if is_training:  
                # Loss
                self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(self.loss*self.istarget) / (tf.reduce_sum(self.istarget))
               
                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
                   
                # Summary 
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example 10
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x1, self.x2, self.y, self.num_batch = get_batch_data()
                #self.x, self.label, self.num_batch = get_batch_data() # (N, T)
                #self.y = tf.one_hot(self.label, depth = hp.n_class)

            else:  # inference
                self.x1 = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.x2 = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                #self.label = tf.placeholder(tf.int32, shape = (None, hp.n_class))
                #self.y = tf.placeholder(tf.int32, shape = (None, hp.n_class))
                #self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            self.l2_loss = tf.constant(0.0)
            # define decoder inputs
            # For the sentence-relationship task we want to encode sent1 into e1 and then
            # decode (e1 + sent2) to capture a richer semantic relationship across the corpus.
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.x2[:, :1]) * 2, self.x2[:, :-1]),
                -1)  # 2:<S>

            # Load vocabulary
            word2idx, idx2word = load_vocabs()

            # initialize transformer
            transformer = vanilla_transformer(hp, is_training)

            #encode
            self.encode1, self.encode2 = transformer.encode(self.x1, len(word2idx)), \
                transformer.encode(self.x2, len(word2idx))

            # combine the two encodings (element-wise average), then normalize
            self.enc = tf.divide(tf.add(self.encode1, self.encode2), 2)
            self.enc = normalize(self.enc)

            # As above: encode sent1 to e1, then decode (e1 + sent2) for a richer
            # cross-sentence semantic relationship.

            # Decoder
            self.dec = transformer.decode(self.decoder_inputs, self.enc,
                                          len(word2idx), hp.p_maxlen)

            self.logits = tf.add(self.enc, tf.multiply(self.enc, self.dec))
            #self.logits = self.enc

            #self.logits = tf.layers.dense(self.logits, 64, activation = 'tanh')
            self.logits = tf.layers.flatten(self.logits)
            #self.logits = tf.reshape(self.logits, [64, -1])
            self.h_drop = tf.nn.dropout(self.logits, hp.dropout_keep_prob)

            with tf.name_scope("output_logit"):
                W = tf.get_variable(
                    "W",
                    shape=[hp.maxlen * hp.hidden_units,
                           len(hp.relations)],
                    initializer=tf.contrib.layers.xavier_initializer())

                b = tf.Variable(tf.constant(0.1, shape=[len(hp.relations)]),
                                name="b")
                self.l2_loss += tf.nn.l2_loss(W)
                self.l2_loss += tf.nn.l2_loss(b)
                self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name="logit")
                #self.preds = tf.argmax(self.scores, 1, name="predictions")

            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))

            if is_training:
                self.y_hotting = tf.one_hot(self.y, depth=len(hp.relations))

                #Accuracy
                self.cpl = tf.equal(tf.convert_to_tensor(self.y, tf.int32),
                                    self.preds)
                self.cpl = tf.to_int32(self.cpl)
                self.acc = tf.reduce_sum(self.cpl) / tf.to_int32(
                    tf.reduce_sum(self.y_hotting))
                tf.summary.scalar('acc', self.acc)

                # Loss
                #self.y_smoothed = label_smoothing(self.y_hotting)
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_hotting)
                self.mean_loss = (tf.reduce_sum(
                    self.loss) + self.l2_loss * hp.reg_lambda) / tf.reduce_sum(
                        self.y_hotting)

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example 11
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.q, self.p, self.q_length, self.p_length, \
                self.start_label, self.end_label, self.num_batch = get_batch_data() 
                self.dropout_keep_prob = hp.dropout_keep_prob

            else: # inference
                self.q = tf.placeholder(tf.int32, [None, hp.q_maxlen])
                self.p = tf.placeholder(tf.int32, [None, hp.p_maxlen])
                self.q_length = tf.placeholder(tf.int32, [None])
                self.p_length = tf.placeholder(tf.int32, [None])
                self.start_label = tf.placeholder(tf.int32, [None])
                self.end_label = tf.placeholder(tf.int32, [None])

            self.dropout_keep_prob = hp.dropout_keep_prob
            self.l2_loss = tf.constant(0.0)
            # define decoder input
            self.decoder_inputs = tf.concat((tf.ones_like(self.p[:, :1])*2, self.p[:, :-1]), -1) # 2:<S>

            # Load vocabulary    
            word2idx, idx2word = load_vocabs()

            # initialize transformer
            transformer = vanilla_transformer(hp, is_training)
            ### encode
            self.q_encodes, self.p_encodes = transformer.encode(self.q, len(word2idx)), \
                transformer.encode(self.p, len(word2idx))

            # Concatenate features so p can attend to q:
            # first pad q_encodes to the length of p_encodes
            pad_dim = hp.p_maxlen - hp.q_maxlen
            pad_ = tf.zeros([tf.shape(self.q_encodes)[0], pad_dim, hp.hidden_units], dtype = self.q_encodes.dtype)
            self.padded_q_encodes = tf.concat([self.q_encodes, pad_,], 1)
            #normalization
            self.padded_q_encodes = normalize(self.padded_q_encodes)

            # Decoder
            self.dec = transformer.decode(self.decoder_inputs, self.padded_q_encodes, len(word2idx), hp.p_maxlen)

            # fix paragraph tensor with self.dec
            self.p_encodes = self.dec

            """
            The core of the RC model: get the question-aware passage encoding
            """
            match_layer = AttentionFlowMatchLayer(hp.hidden_units)
            self.match_p_encodes, _ = match_layer.match(self.p_encodes, self.q_encodes,
                                                        self.p_length, self.q_length)

            # pooling or bi-rnn to fuse the passage encodings
            if hp.Passage_fuse == 'Pooling':
                #pooling layer
                self.match_p_encodes = \
                tf.keras.layers.MaxPool1D(pool_size=4, strides=None, padding='valid')\
                                        (self.match_p_encodes)

                self.match_p_encodes = tf.reshape(self.match_p_encodes, [-1, hp.p_maxlen, hp.hidden_units])
                #normalization
                self.match_p_encodes = tf.layers.batch_normalization(self.match_p_encodes)
                if hp.use_dropout:
                    self.match_p_encodes = tf.nn.dropout(self.match_p_encodes, self.dropout_keep_prob)
            elif hp.Passage_fuse == 'bi-rnn':
                self.fuse_p_encodes, _ = rnn('bi-lstm', self.match_p_encodes, self.p_length,
                                             hp.hidden_units, layer_num=1, concat = False)
                if hp.use_dropout:
                    self.fuse_p_encodes = tf.nn.dropout(self.fuse_p_encodes, self.dropout_keep_prob)


            decoder = PointerNetDecoder(hp.hidden_units)
            self.start_probs, self.end_probs = decoder.decode(self.match_p_encodes,
                                                              self.q_encodes)
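            # The pointer-network decoder outputs two distributions over passage
            # positions, start_probs and end_probs, marking the answer-span boundaries
            # that are supervised by start_label and end_label.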

                
            if is_training:  
                self.start_loss = self.sparse_nll_loss(probs=self.start_probs, labels=self.start_label)
                self.end_loss = self.sparse_nll_loss(probs=self.end_probs, labels=self.end_label)
                self.all_params = tf.trainable_variables()
                self.loss = tf.reduce_mean(tf.add(self.start_loss, self.end_loss))
                if hp.weight_decay > 0:
                    with tf.variable_scope('l2_loss'):
                        l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.all_params])
                    self.loss += hp.weight_decay * l2_loss



                # Training Scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.loss, global_step=self.global_step)
                   
                # Summary 
                tf.summary.scalar('mean_loss', self.loss)
                self.merged = tf.summary.merge_all()
Example 12
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            # id 2 is <S>, the decoder's initial input. This step shifts y right by one, e.g.
            # y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]] becomes
            # [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]]; this is what the
            # decoder's self-attention sees first.
            # At training time decoder_inputs is built as above; at inference time the true y is
            # unknown, so an all-zero tensor of shape [batch_size, max_length] is fed instead,
            # which becomes [["<s>", 0, 0, 0]]; the first prediction is taken and fed back in,
            # then the first two, and so on.
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # row 0 is the <PAD> embedding; True zeroes that row (random init would not be zero)
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks: stack hp.num_blocks encoder blocks (6 in the original setup)
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                # Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                # Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection: classification over the vocabulary, so the number
            # of classes is the vocabulary size
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # Label smoothing replaces the 0s in the one-hot targets with a small
                # value and the 1s with a value slightly below 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example 13
    def __init__(self, is_training):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.is_training = is_training

            if self.is_training:
                self.next_element, num_batch = get_batch_data(self.is_training)
                self.X, self.Y, self.seq_len = self.next_element[
                    "X"], self.next_element["Y"], self.next_element["seq_len"]
                self.X.set_shape([None, config.maxlen, config.bert_dim])
                self.Y.set_shape([None, config.maxlen])
                self.seq_len.set_shape([None])
            else:
                self.X = tf.placeholder(tf.float32,
                                        shape=(None, config.maxlen,
                                               config.bert_dim))
                self.Y = tf.placeholder(tf.int32, shape=(None, config.maxlen))
                self.seq_len = tf.placeholder(tf.int32, shape=(None,))

            idx2word, word2idx, idx2labl, labl2idx = load_vocab()
            embed = tf.convert_to_tensor(self.X)
            # input embedding Dropout
            embed = tf.layers.dropout(embed,
                                      rate=config.dropout_rate,
                                      training=self.is_training)
            # Multi-layer BiLSTM
            outputs = multibilstm(embed, self.seq_len, config.num_units,
                                  config.num_layer, self.is_training,
                                  config.cell)

            # Fully connected layers
            # Two fully connected layers are used here; residual connections and the
            # activation can be configured.
            outputs = feedforward(outputs,
                                  outputs.get_shape().as_list()[2],
                                  scope="first")  #residual default used
            outputs = feedforward(outputs,
                                  config.num_class,
                                  residual=False,
                                  scope="second")
            noutput = tf.reshape(outputs,
                                 [-1, config.maxlen, config.num_class])

            # crf layer
            if config.use_crf:
                self.loss, self.acc, self.predicts, self.true_labels = crf_layer(
                    self.Y, noutput, config.num_class, self.seq_len,
                    self.is_training)
            else:
                self.loss, self.acc, self.predicts, self.true_labels = loss_layer(
                    self.Y, noutput, config.num_class)
            tf.summary.scalar('acc', self.acc)
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            if self.is_training:
                # use exponential_decay to help the model converge faster
                if config.exponential_decay:
                    learning_rate = tf.train.exponential_decay(
                        config.lr, self.global_step, 200, 0.96, staircase=True)
                else:
                    learning_rate = config.lr
                # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.99, epsilon=1e-8)
                optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
                self.train_op = optimizer.minimize(
                    self.loss, global_step=self.global_step)
                tf.summary.scalar('mean_loss', self.loss)
            else:
                self.train_op = None
Example 14
    def __init__(self, hp, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.x_image, self.y_image, self.x_length, self.y, self.num_batch, self.source, self.target, self.x_turn_number, self.x_emotion, self.y_emotion, self.speaker, self.A = get_batch_data(
                    hp)  # (N, T)
            else:  # inference
                self.x = tf.placeholder(
                    tf.int32,
                    shape=(None, hp.max_turn, hp.maxlen))  # shape=(16, 15, 50)
                self.x_image = tf.placeholder(tf.float32,
                                              shape=(None, hp.max_turn, 17))
                self.y_image = tf.placeholder(tf.float32, shape=(None, 17))
                self.x_length = tf.placeholder(tf.int32,
                                               shape=(None, hp.max_turn))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.x_emotion = tf.placeholder(tf.int32,
                                                shape=(None, hp.max_turn))
                self.y_emotion = tf.placeholder(tf.int32, shape=(None, ))
                self.speaker = tf.placeholder(tf.int32, shape=(None, ))
                self.A = tf.placeholder(tf.float32, shape=(None, 7, 90, 90))
                self.x_turn_number = tf.placeholder(tf.int32, shape=(None, ))

            # define decoder inputs
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

            # Load vocabulary
            de2idx, idx2de = load_de_vocab(hp)  # de2idx is needed for the encoder embedding below
            en2idx, idx2en = load_en_vocab(hp)
            speaker_memory = tf.get_variable(
                'speaker_memory',
                dtype=tf.float32,
                shape=[13, hp.hidden_units],
                initializer=tf.contrib.layers.xavier_initializer())
            emotion_memory = tf.get_variable(
                'emotion_memory',
                dtype=tf.float32,
                shape=[7, hp.hidden_units],
                initializer=tf.contrib.layers.xavier_initializer())
            outputs_speaker = tf.nn.embedding_lookup(speaker_memory,
                                                     self.speaker)
            outputs_speaker_ = tf.tile(tf.expand_dims(outputs_speaker, 1),
                                       [1, 50, 1])
            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                embeddingsize = hp.hidden_units // 2  # integer number of embedding units
                self.enc_embed = embedding(
                    tf.reshape(
                        self.x,
                        [-1, hp.maxlen
                         ]),  #batch_size*max_turn=240 shape=(240, 50, 256)
                    vocab_size=len(de2idx),
                    num_units=embeddingsize,
                    scale=True,
                    scope="enc_embed")
                # one fresh GRUCell per layer (reusing a single cell instance across
                # layers would make them share variables)
                self.rnn_cell = tf.nn.rnn_cell.MultiRNNCell([
                    tf.nn.rnn_cell.GRUCell(hp.hidden_units)
                    for _ in range(hp.num_layers)
                ])
                print(self.enc_embed.get_shape())
                self.sequence_length = tf.reshape(self.x_length, [-1])
                print(self.sequence_length.get_shape())
                self.uttn_outputs, self.uttn_states = tf.nn.dynamic_rnn(
                    cell=self.rnn_cell,
                    inputs=self.enc_embed,
                    sequence_length=self.sequence_length,
                    dtype=tf.float32,
                    swap_memory=True)

                print(hp.batch_size, hp.max_turn, hp.hidden_units)

                self.enc = tf.reshape(
                    self.uttn_states,
                    [hp.batch_size, hp.max_turn, hp.hidden_units
                     ])  #shape=(16, 15, 512)
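                # The per-utterance GRU states are reshaped to
                # (batch_size, max_turn, hidden_units), i.e. one vector per
                # dialogue turn, so the attention blocks below operate over
                # turns rather than over individual tokens.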

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,  #shape=(32, 15, 512), 
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))
                print('self.enc=', self.enc)
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc, _ = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)

                        ### Feed Forward
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units,
                                       hp.hidden_units])  #shape=(32, 15, 512),
                        #code.interact(local=locals())
            matrix = tf.get_variable("transform", [
                self.x_image.shape.as_list()[-1],
                self.enc.shape.as_list()[-1]
            ],
                                     dtype=tf.float32)
            self.x_ima = tf.map_fn(lambda x: tf.matmul(x, matrix),
                                   self.x_image,
                                   dtype=tf.float32)
            #code.interact(local=locals())
            self.enc = tf.concat((self.enc, self.x_ima), -2)
            s_m = tf.tile(tf.expand_dims(speaker_memory, 0),
                          [hp.batch_size, 1, 1])
            e_m = tf.tile(tf.expand_dims(emotion_memory, 0),
                          [hp.batch_size, 1, 1])
            self.enc = tf.concat((self.enc, e_m), -2)
            self.enc = tf.concat((self.enc, s_m), -2)
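            # The encoder sequence is extended along the time axis: projected
            # image features (x_ima), then the 7 emotion memory slots and the
            # 13 speaker memory slots are appended, and the result is fed with
            # the adjacency tensor A through the HGraph layers (a graph
            # convolution-style layer defined elsewhere in this project).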
            self.H1 = HGraph(256, activation='relu')([self.enc, self.A])
            self.H1 = Dropout(hp.dropout_rate)(self.H1)
            self.H2 = HGraph(256, activation='relu')([self.H1, self.A])
            self.enc = Dropout(hp.dropout_rate)(self.H2)
            self.enc = tf.map_fn(lambda x: x, self.enc, dtype=tf.float32)
            self.enc = feedforward(
                self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units])
            with tf.variable_scope("emotion"):
                x3 = tf.reduce_max(self.enc, axis=1)
                self.emotion_logits = linear(x3,
                                             7,
                                             True,
                                             False,
                                             scope="softmax")
                outputs_emotion = tf.matmul(self.emotion_logits,
                                            emotion_memory)
                outputs_emotion_ = tf.tile(tf.expand_dims(outputs_emotion, 1),
                                           [1, 50, 1])  #shape=(50, 50, 128)
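                # Emotion head: max-pool the graph-enhanced encoder states into
                # one vector, project it to 7 emotion logits, and use the logits
                # as (unnormalised) attention weights over emotion_memory. The
                # result is tiled across decoder time steps (again assuming
                # hp.maxlen == 50).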

            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    vocab_size=hp.maxlen,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                ## Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))
                print('self.dec', self.dec)  #shape=(50, 50, 512)
                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec, _ = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")
                        print('self.dec', self.dec)  #shape=(50, 50, 512)
                        ## Multihead Attention ( vanilla attention)
                        self.dec, self.attn = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")
                        ## Feed Forward
                        print('self.dec', self.dec)  #shape=(50, 50, 512)
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])
                        #code.interact(local=locals())

            self.dec_emo = tf.concat((outputs_emotion_, outputs_speaker_), -1)
            self.dec_emo_spe = tf.concat((self.dec, self.dec_emo), -1)
            g = tf.nn.sigmoid(
                layer_norm(linear(self.dec_emo_spe,
                                  256,
                                  False,
                                  False,
                                  scope="context_gate"),
                           name="context_gate_ln"))

            self.dec_emo_spe = self.dec + g * outputs_emotion_ + (
                1 - g) * outputs_speaker_
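            # Context gate: g, computed from the decoder states concatenated
            # with the emotion and speaker vectors, interpolates between the
            # emotion and speaker representations, which are added residually
            # to the decoder output.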
            self.dec_emo_spe = tf.layers.dropout(
                self.dec_emo_spe,  #shape=(32, 50, 512), 
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))
            # Final linear projection
            self.logits = tf.layers.dense(self.dec_emo_spe,
                                          len(en2idx))  #shape=(128, 50, 5124)
            self.preds = tf.to_int32(tf.arg_max(
                self.logits, dimension=-1))  #shape=(128, 50)
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)

            #            if is_training:
            # Loss
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)  #shape=(256, 50)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

            if is_training:
                tgt_emotion = label_smoothing(
                    tf.one_hot(self.y_emotion, depth=7))
                emotion_loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.emotion_logits, labels=tgt_emotion)
                emotion_loss = tf.reduce_mean(emotion_loss)
                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    (1 - hp.alpha) * self.mean_loss + hp.alpha * emotion_loss,
                    global_step=self.global_step)
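                # Joint objective: (1 - alpha) weights the token-level
                # generation loss, alpha weights the utterance-level emotion
                # classification loss. Note the 'mean_loss' summary below logs
                # the plain sum of the two losses, not this weighted objective.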

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss + emotion_loss)
                self.merged = tf.summary.merge_all()
Esempio n. 15
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        # de2idx, idx2de = load_doc_vocab()
        # self.vocab_size = len(de2idx)
        self.vocab_size = len(load_doc_vocab()[0])

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32,
                                        shape=(None, hp.article_maxlen))
                self.y = tf.placeholder(tf.int32,
                                        shape=(None, hp.summary_maxlen))

            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
                -1)  # 2:<S> # define decoder inputs

            self._add_encoder(is_training=is_training)
            self.ml_loss = self._add_ml_loss(is_training=is_training)

            if is_training:
                self.eta = tf.Variable(initial_value=hp.eta_init,
                                       dtype=tf.float32,
                                       trainable=False,
                                       name='eta')
                # eta is not trainable (never updated by the loss); it is only
                # changed manually via update_eta
                self.update_eta = tf.assign(self.eta, self.eta + 0.1)

                self.rl_loss = self._add_rl_loss()
                self.loss = self.eta * self.rl_loss + (1 -
                                                       self.eta) * self.ml_loss
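                # Mixed objective: eta interpolates between the RL loss and the
                # maximum-likelihood loss. It starts at hp.eta_init and grows by
                # 0.1 each time update_eta is run, gradually shifting weight
                # from ML towards RL training.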

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)

                grads_and_vars_mix = self.optimizer.compute_gradients(
                    loss=self.loss)
                grads_and_vars_ml = self.optimizer.compute_gradients(
                    loss=self.ml_loss)

                grad_mix, vars_mix = zip(
                    *grads_and_vars_mix)  # parse grad and var
                grad_ml, vars_ml = zip(
                    *grads_and_vars_ml)  # parse grad and var

                # add gradient clipping
                clipped_grad_mix, globle_norm_mix = tf.clip_by_global_norm(
                    grad_mix, hp.maxgradient)
                clipped_grad_ml, globle_norm_ml = tf.clip_by_global_norm(
                    grad_ml, hp.maxgradient)
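                # tf.clip_by_global_norm rescales each gradient list so that its
                # joint L2 norm is at most hp.maxgradient while preserving the
                # update direction; one clipped set is kept for the mixed loss
                # and one for the pure ML loss.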
                self.globle_norm_ml = globle_norm_ml
                self.train_op_mix = self.optimizer.apply_gradients(
                    grads_and_vars=zip(clipped_grad_mix, vars_mix),
                    global_step=self.global_step)
                self.train_op_ml = self.optimizer.apply_gradients(
                    grads_and_vars=zip(clipped_grad_ml, vars_ml),
                    global_step=self.global_step)
                '''
                # below: training without gradient clipping
                self.train_op_mix = self.optimizer.apply_gradients(grads_and_vars=grads_and_vars_mix,
                                                                   global_step=self.global_step)
                self.train_op_ml  = self.optimizer.apply_gradients(grads_and_vars=grads_and_vars_ml,
                                                                   global_step=self.global_step)
                '''

                # Summary
                tf.summary.scalar('globle_norm_ml', globle_norm_ml)
                tf.summary.histogram(name='reward_diff',
                                     values=self.reward_diff)
                tf.summary.histogram(name='clipped_reward_diff',
                                     values=self.clipped_reward_diff)
                tf.summary.scalar('rl_loss', self.rl_loss)
                tf.summary.scalar('ml_loss', self.ml_loss)
                tf.summary.scalar('loss', self.loss)
                self.merged = tf.summary.merge_all()

                # prepare a Saver that restores all variables except eta
                all_var = tf.get_collection(key=tf.GraphKeys.GLOBAL_VARIABLES)
                all_var.remove(self.eta)
                self.subset_saver = tf.train.Saver(var_list=all_var)

        self.filewriter = tf.summary.FileWriter(hp.tb_dir + '/train',
                                                self.graph)
Esempio n. 16
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Load data
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)

            # Load vocabulary
            char2idx, idx2char = load_vocab()

            # Encoder
            ## Embedding
            enc = embedding(self.x,
                            vocab_size=len(char2idx),
                            num_units=hp.hidden_units,
                            scale=False,
                            scope="enc_embed")

            # Encoder pre-net
            prenet_out = prenet(
                enc,
                num_units=[hp.hidden_units, hp.hidden_units // 2],
                dropout_rate=hp.dropout_rate,
                is_training=is_training)  # (N, T, E/2)

            # Encoder CBHG
            ## Conv1D bank
            enc = conv1d_banks(prenet_out,
                               K=hp.encoder_num_banks,
                               num_units=hp.hidden_units // 2,
                               norm_type="ins",
                               is_training=is_training)  # (N, T, K * E / 2)

            ### Max pooling
            enc = tf.layers.max_pooling1d(enc, 2, 1,
                                          padding="same")  # (N, T, K * E / 2)

            ### Conv1D projections
            enc = conv1d(enc, hp.hidden_units // 2, 3,
                         scope="conv1d_1")  # (N, T, E/2)
            enc = normalize(enc,
                            type="ins",
                            is_training=is_training,
                            activation_fn=tf.nn.relu)
            enc = conv1d(enc, hp.hidden_units // 2, 3,
                         scope="conv1d_2")  # (N, T, E/2)
            enc += prenet_out  # (N, T, E/2) # residual connections

            ### Highway Nets
            for i in range(hp.num_highwaynet_blocks):
                enc = highwaynet(
                    enc,
                    num_units=hp.hidden_units // 2,
                    scope='highwaynet_{}'.format(i))  # (N, T, E/2)

            ### Bidirectional GRU
            enc = gru(enc, hp.hidden_units // 2, True)  # (N, T, E)

            # Final linear projection
            self.logits = tf.layers.dense(enc,
                                          2)  # 0 for non-space, 1 for space

            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.x, 0))  # masking
            self.num_hits = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget)
            self.num_targets = tf.reduce_sum(self.istarget)
            self.acc = self.num_hits / self.num_targets
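            # Unlike the translation models above, padding is detected from the
            # input x rather than from the labels, because y holds 0/1
            # space/non-space classes and 0 is a valid label, not padding.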

            if is_training:
                # Loss
                self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)
Esempio n. 17
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data(
                )  # shape=[batch_size, max_seq_len]
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.max_seq_len))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.max_seq_len))
            # decoder_inputs
            '''Compared with self.y, decoder_inputs drops the final end-of-sentence token and prepends the id 2, i.e. <S>, which marks the start of each sentence.'''
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), axis=-1)
            # load_vocab
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            # encoder
            with tf.variable_scope('encoder'):
                # input - word embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(de2idx),
                                     d_model=hp.d_model,
                                     scale=True,
                                     scope='enc_embed')
                # input - positional encoding
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.max_seq_len,
                                      d_model=hp.d_model,
                                      zero_pad=False,
                                      scale=False,
                                      scope='enc_pe')
                # Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))
                # 3. num_layers multi-head attention
                for i in range(hp.num_layers):
                    with tf.variable_scope('num_layers_{}'.format(i)):
                        # multi head attention + Add and Norm
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            d_model=hp.d_model,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        # feed forward + Add and Norm
                        self.enc = feedforward(
                            self.enc, dff=[4 * hp.d_model, hp.d_model])

            # decoder
            with tf.variable_scope('decoder'):
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     d_model=hp.d_model,
                                     scale=True,
                                     scope='dec_embed')
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]),
                                   0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.max_seq_len,
                                      d_model=hp.d_model,
                                      zero_pad=False,
                                      scale=False,
                                      scope='dec_pe')
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))
                for i in range(hp.num_layers):
                    with tf.variable_scope('num_layers_{}'.format(i)):
                        # masked multi-head attention
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            d_model=hp.d_model,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope='self-attention')
                        # multi-head attention
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            d_model=hp.d_model,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope='vanilla-attention')
                        self.dec = feedforward(
                            self.dec,
                            dff=[4 * hp.d_model, hp.d_model
                                 ])  # shape=[batch_size, seq_len, d_model]

            # final linear projection
            self.logits = tf.layers.dense(
                self.dec,
                len(en2idx))  # shape=[batch_size, seq_len, target_vocab_size]
            self.preds = tf.to_int32(tf.arg_max(
                self.logits, dimension=-1))  # predictions, shape=[batch_size, seq_len]
            self.istarget = tf.to_float(tf.not_equal(
                self.y, 0))  # mask of real (non-padding) targets, shape=[batch_size, seq_len]
            # padding positions are excluded from the accuracy
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / tf.reduce_sum(self.istarget)
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # loss
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                # padding positions are excluded from the loss
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))
                # training scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)
                # summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Esempio n. 18
0
                self.prob_c = tf.nn.softmax(self.logits_c)  # (N, T_q, vocab_size)
                self.prob_t = tf.nn.softmax(self.logits_t)  # (N, T_q, tw_vocab_size)
                self.prob_t = tf.einsum('nlt,tv->nlv', self.prob_t, self.tw_vocab_overlap)  # (N, T_q, vocab_size)
                self.prob = self.prob_c + self.prob_t * hp.penalty # (N, T_q, vocab_size)
                self.preds = tf.to_int32(tf.argmax(self.prob, axis=-1))  # (N, T_q)
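                # prob_t is a distribution over the topic-word vocabulary;
                # multiplying by tw_vocab_overlap (tw_vocab_size x vocab_size)
                # maps it onto the full vocabulary, and hp.penalty weights its
                # contribution to the mixture used for the final prediction.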



if __name__ == '__main__':
    # Load vocabulary
    token2idx, idx2token = load_de_en_vocab()
    tw2idx, idx2tw = load_tw_vocab()
    token2idx_len = len(token2idx)
    tw2idx_len = len(tw2idx)

    X, X_length, Y, YTWD, Y_DI, TW, num_batch = get_batch_data()

    # Construct graph
    g = Graph(True, token2idx_len, tw2idx_len, None)
    print("Graph loaded")

    # Start session
    sv = tf.train.Supervisor(graph=g.graph, 
                             logdir=hp.logdir,
                             save_model_secs=0)


    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs+1): 
            if sv.should_stop(): break
            loss=[]
Esempio n. 19
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                # x: (32, 10)  y: (32, 10): one batch of 32 sentences, each of length 10
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            """
            定义decoder部分的input
            
             假设真实翻译后的输出为 i am a student </S>
             
             decoder部分的input应为: <S> i am a student
            """
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
                -1)  # 2 stands for <S>, the initial decoder input

            # Load vocabularies
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # keep the padding embedding fixed at zero
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ##Drop out
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    vocab_size=hp.maxlen,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / tf.reduce_sum(self.istarget)

            if is_training:
                # Loss
                # Label smoothing: the 0s in the one-hot targets become a small
                # positive value and the 1s become a value slightly below 1.
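                # A common implementation is
                #   (1 - epsilon) * one_hot + epsilon / num_classes
                # with a small epsilon such as 0.1.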
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Esempio n. 20
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
            else:  # inference
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

            # Load vocabulary
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            # Encoder
            with tf.variable_scope("encoder"):
                ## Embedding
                self.enc = embedding(self.x,
                                     vocab_size=len(de2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="enc_embed")

                key_masks = tf.expand_dims(
                    tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)
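                # key_masks is 1 for real tokens and 0 for padding positions
                # (assuming the embedding maps the pad index to the all-zero
                # vector via zero_pad); it is multiplied back in after the
                # positional encoding is added (self.enc *= key_masks below)
                # so padded positions stay exactly zero.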

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="enc_pe")
                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                self.enc *= key_masks

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)

                        ### Feed Forward
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Decoder
            with tf.variable_scope("decoder"):
                ## Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                key_masks = tf.expand_dims(
                    tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    vocab_size=hp.maxlen,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")
                self.dec *= key_masks

                ## Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)

            if is_training:
                # Loss
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                # Training Scheme
                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Esempio n. 21
0
    print("Create preprocessed data.....")
    make_vocab(FLAGS.source_train, "input.vocab")
    make_vocab(FLAGS.target_train, "output.vocab")
    print("....Done\n")


    # Load vocabulary    
    input2idx, idx2input = load_input_vocab()
    output2idx, idx2output = load_output_vocab()
    
    # Construct graph
    g = Graph("train")
    print("Graph loaded\n")

    print("Loading batch data.....")
    x, y, _ = get_batch_data()
    print(len(x))
    print(len(y))
    print("........Done")

    x = np.array(x)
    y = np.array(y)


    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        batches = batch_iter_seq2seq(x, y, FLAGS.batch_size, FLAGS.num_epochs)
        print("num batches")