Example 1
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

        # train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
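Note: every snippet in this collection calls helpers such as label_smoothing and noam_scheme without showing them. For reference, here is a minimal sketch of the two helpers, written to be consistent with how they are called in these examples; the exact implementation in each repository may differ.

import tensorflow as tf

def label_smoothing(inputs, epsilon=0.1):
    # Smooth one-hot labels: each 1 becomes about 1 - epsilon, each 0 becomes epsilon / V.
    V = inputs.get_shape().as_list()[-1]  # vocabulary size (last dimension)
    return ((1 - epsilon) * inputs) + (epsilon / V)

def noam_scheme(init_lr, global_step, warmup_steps=4000.):
    # Noam schedule: linear warmup for warmup_steps, then decay proportional to 1/sqrt(step).
    step = tf.cast(global_step + 1, dtype=tf.float32)
    return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)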
Example 2
 def loss_function(self, inputs):
     logits, label = inputs
     istarget = tf.to_float(tf.not_equal(label, 0))
     y_smoothed = label_smoothing(tf.one_hot(label, depth=self.out_vocab_len))
     loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_smoothed)
     mean_loss = tf.reduce_sum(loss * istarget) / (tf.reduce_sum(istarget))
     return mean_loss
Example 3
    def train(self, xs, ys):
        # forward
        loss_weight = ys[-1]
        ys = ys[:-1]
        
        memory, sents1 = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory)

        # train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
        # weight each token's cross entropy by (1 + loss_weight) and mask out padding
        a = ce * nonpadding * (1 + loss_weight)
        b = nonpadding
        loss = tf.reduce_sum(a) / (tf.reduce_sum(b) + 1e-7)
        
        # y and preds are both (N, T2) token ids; compare them directly and ignore padding
        correct_prediction = tf.cast(tf.equal(y, preds), tf.float32)
        accuracy = tf.reduce_sum(correct_prediction * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.997, epsilon=1e-9)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("accuracy", accuracy)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example 4
 def init(self):
     with self.graph.as_default():
         self.build_model()
         
         self.istarget = tf.to_float(tf.not_equal(self.y, 0))
         norm_len = tf.reduce_sum(self.istarget)
         equal_val = tf.to_float(tf.equal(self.preds, self.y))
         self.acc = tf.reduce_sum(equal_val * self.istarget) / norm_len
         tf.summary.scalar('acc', self.acc)
         tf.summary.scalar('target_norm_len', norm_len)
         
         if self.is_training:
             # Loss
             self.y_smoothed = label_smoothing(
                 tf.one_hot(self.y, depth=len(self.en2idx)))
             self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                 logits=self.logits, labels=self.y_smoothed)
             a = tf.reduce_sum(self.loss * self.istarget)
             self.loss = a / (tf.reduce_sum(self.istarget))
             
             # Training Scheme
             self.get_train_op()
             
             # Summary
             tf.summary.scalar('mean_loss', self.loss)
             self.merged = tf.summary.merge_all()
Example 5
    def train(self, xs, decode_inputs, y):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, src_masks, outputs, scopes = self.encode(xs)
        dec, outputs1, scopes1 = self.decode(decode_inputs, memory, src_masks)
        # Final linear projection (embedding weights are shared)

        outputs = outputs + outputs1
        scopes = scopes + scopes1
        with tf.variable_scope("logits", reuse=tf.AUTO_REUSE):
            weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
            logits = tf.einsum('ntd,dk->ntk', dec,
                               weights)  # (N, T2, vocab_size)
            y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
            loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                              labels=y_)
            #nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
            #loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(loss)

        return loss, outputs, scopes
Example 6
    def train(self, xs, ys):
        # Forward
        memory, sents1 = self.encode(xs)
        logits, preds, y, sent2 = self.decode(ys, memory)

        # Train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx['<PAD>']))
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('global_step', global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example 7
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

        # train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        if self.hp.io_tie and self.hp.ortho_embedding:

            lmb = self.hp.ortho_lambda
            normlevel = self.hp.ortho_reg_norm
            if not self.hp.fac_embed:

                real_embedding = self.embeddings[1:, :]
                if not (self.hp.norm_embedding
                        or self.embedding_normalization):
                    loss = loss + (tf.norm(tf.subtract(
                        tf.matmul(tf.transpose(real_embedding),
                                  real_embedding),
                        tf.scalar_mul(tf.constant(2.0, dtype=tf.float32),
                                      tf.eye(self.hp.d_model))),
                                           ord=normlevel)**2) * lmb
                else:
                    wtw = tf.matmul(tf.transpose(real_embedding),
                                    real_embedding)
                    wtw_diag = tf.linalg.diag(tf.linalg.diag_part(wtw))
                    loss = loss + (tf.norm(tf.subtract(wtw, wtw_diag))**
                                   2) * lmb
            else:
                loss = loss + (tf.norm(tf.subtract(
                    tf.matmul(self.embeddings2, tf.transpose(
                        self.embeddings2)), tf.eye(self.hp.d_embed)),
                                       ord=normlevel)**2) * lmb
                #loss=loss+tf.norm(tf.subtract( tf.matmul( tf.transpose(self.embeddings1), self.embeddings1),tf.eye(self.hp.d_embed) ) ,ord=normlevel)*lmb#if not good, delete this

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
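The orthogonality term added to the loss above can be read more easily as a small helper. This is only a sketch of the non-factorized, non-normalized branch (io_tie and ortho_embedding on, fac_embed off), using the same tensors and hyperparameters the example uses:

def ortho_penalty(embeddings, d_model, lmb, normlevel):
    # || W^T W - 2I ||^2 over the embedding matrix W with the <pad> row dropped.
    W = embeddings[1:, :]                      # (vocab_size - 1, d_model)
    gram = tf.matmul(W, W, transpose_a=True)   # (d_model, d_model)
    return lmb * tf.norm(gram - 2.0 * tf.eye(d_model), ord=normlevel) ** 2

With this helper, the branch above reduces to loss = loss + ortho_penalty(self.embeddings, self.hp.d_model, lmb, normlevel).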
Example 8
    def train(self, xs, ys, x_paraphrased_dict, synonym_label=None):
        # forward
        memory, sents1 = self.encode(xs)
        _, _, synonym_label_loss = self.labeling(synonym_label, memory)
        logits, preds, y, sents2 = self.decode(ys, x_paraphrased_dict, memory)

        # train scheme
        # generation loss
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        # multi task loss
        tloss = self.hp.l_alpha * loss + (1.0-self.hp.l_alpha) * synonym_label_loss

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(tloss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("tloss", tloss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example 9
    def build_model(self, inputs,labels=None):
        batch_size = tf.shape(inputs)[0]
        # forward
        memory = self.encode(inputs,self.train_mode)
        
        
        if self.train_mode or labels is not None:
            decoder_inputs = labels[:,:-1]
            decoder_targets = labels[:,1:]
            logits, self.preds = self.decode(decoder_inputs, memory, self.train_mode)  # self.preds <-- the argmax over the logits
            
            
            # train scheme
            if self.hp.label_smoothing:
                y_ = label_smoothing(tf.one_hot(decoder_targets, depth=self.hp.VOCAB_SIZE))
                ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)  # labels are the smoothed one-hot targets
                nonpadding = tf.to_float(tf.not_equal(decoder_targets, self.token2idx["<pad>"]))  # 0: <pad>
                self.loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
            else:
                weights = tf.ones(shape=[batch_size,self.hp.OUTPUT_LENGTH])
                self.loss = tf.contrib.seq2seq.sequence_loss(logits=logits, targets=decoder_targets, weights=weights)   # targets(not one-hot)

        else:
    
            init_decoder_inputs = tf.ones((batch_size, 1), tf.int32) * self.token2idx["<sos>"]
            decoder_inputs = init_decoder_inputs
    
            for _ in range(self.hp.OUTPUT_LENGTH):
                logits, y_hat = self.decode(decoder_inputs, memory, training=False)
                decoder_inputs = tf.concat([init_decoder_inputs, y_hat], axis=-1)  # keep only the last generated token, or the whole generated sequence?
            self.preds = y_hat
Example 10
    def __init__(self, hp):
        self.hp = hp
        self.token2idx, self.idx2token = load_vocab(hp.vocab)
        self.embeddings = get_token_embeddings(self.hp.vocab_size,
                                               self.hp.d_model,
                                               zero_pad=True)

        self.input_x = tf.placeholder(dtype=tf.int32,
                                      shape=(None, None),
                                      name="input_x")
        self.decoder_input = tf.placeholder(dtype=tf.int32,
                                            shape=(None, None),
                                            name="decoder_input")
        self.target = tf.placeholder(dtype=tf.int32,
                                     shape=(None, None),
                                     name="target")
        self.is_training = tf.placeholder(dtype=tf.bool, name="is_training")

        # encoder
        self.encoder_hidden = self.encode(self.input_x,
                                          training=self.is_training)

        # decoder
        self.logits = self.decode(self.decoder_input,
                                  self.encoder_hidden,
                                  training=self.is_training)

        self.y_hat = tf.to_int32(tf.argmax(self.logits, axis=-1),
                                 name="y_predict_v2")

        # loss
        self.smoothing_y = label_smoothing(
            tf.one_hot(self.target, depth=self.hp.vocab_size))
        self.ce_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits, labels=self.smoothing_y)
        nonpadding = tf.to_float(
            tf.not_equal(self.target, self.token2idx["<pad>"]))
        self.loss = tf.reduce_sum(
            self.ce_loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        # optimize
        self.global_step = tf.train.get_or_create_global_step()
        self.lr = noam_scheme(self.hp.lr, self.global_step,
                              self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.minimize(self.loss,
                                           global_step=self.global_step)

        # tensorboard
        tf.summary.scalar('lr', self.lr)
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("global_step", self.global_step)
        self.summaries = tf.summary.merge_all()

        # predict part
        self.y_predict = tf.identity(self.greedy_search(), name="y_predict")
Example 11
 def _loss_op(self, l2_lambda=0.0001):
     with tf.name_scope('cost'):
         self.y_smoothed = label_smoothing(self.y)
         losses = tf.nn.sigmoid_cross_entropy_with_logits(
             labels=self.y_smoothed, logits=self.logits)
         loss = tf.reduce_mean(losses, name='loss_val')
         weights = [
             v for v in tf.trainable_variables()
             if ('w' in v.name) or ('kernel' in v.name)
         ]
         l2_loss = tf.add_n([tf.nn.l2_loss(w) for w in weights]) * l2_lambda
         loss += l2_loss
     return loss
Example 12
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1 = self.encode(xs)
        # memory = tf.Print(memory, [memory], message='memory =', summarize=10)
        logits, preds, y, sents2 = self.decode(ys, memory)
        # logits = tf.Print(logits, [logits], message='logits =', summarize=10)

        print('train logits.shape, y.shape =', logits.shape, y.shape)
        # train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        # logits = tf.Print(logits, [logits], message='logits =', summarize=10)
        # y_ = tf.Print(y_, [y_], message='y_ =', summarize=10)
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        # nonpadding = tf.Print(nonpadding, [nonpadding], message='nonpadding =',
        #     summarize=100)
        # nonpadding_print = tf.print('nonpadding =', tf.shape(nonpadding)
        #     , summarize=20)
        # with tf.control_dependencies([nonpadding_print]):
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        # loss = tf.Print(loss, [loss], message='loss =', summarize=10)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        # gradients = optimizer.compute_gradients(loss)
        # # print_grad = tf.print('gradients =', gradients, summarize=10)
        # # with tf.control_dependencies([print_grad]):
        # clip_grads = [(tf.clip_by_value(grad, -100., 100.), var) for grad, var in gradients]
        # train_op = optimizer.apply_gradients(clip_grads, global_step=global_step)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example 13
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

        # train scheme
        # y:  (N, T2); token ids, e.g. [[5768, 7128, 7492, 7128, 7492, 4501, 7128, 7128, 14651], [5768, 7128, 7492, 7128, 7492, 4501, 7128, 7128, 14651]]
        # y_: (N, T2, vocab_size); smoothed one-hot rows like [0, 0.999, 0, ..., 0]
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        # cross entropy between the predictions and the labels produces the loss
        # logits: predicted per-id scores, (N, T2, vocab_size)
        # ce: (N, T2), e.g. (4, 42); per-token losses such as [[6.825, 6.602, 6.552, ..., 9.604, 10.001], [6.850, 6.645, ...]]
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        # nonpadding: (N, T2), e.g. (4, 42); values like [[1., 1., 1., 0., 0., 0.], [1., 1., 1., 1., 1., 1., ...]]
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        # tf.reduce_sum sums along the given axis; without axis it sums over all dimensions
        # ce * nonpadding keeps only the loss of non-padded tokens; tf.reduce_sum(nonpadding) counts them so the result is an average
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        # adjust the learning rate dynamically according to the training step
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        # define the optimizer
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)

        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example 14
    def train(self, xs, ys):  # trains the model
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        # call encode() and decode() to get the outputs of each part
        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

        # train scheme
        # one_hot marks each token's index within the vocabulary, which builds the target label to train against;
        # the aim is to make the value at the target index of the vocab_size-sized logits vector as large as
        # possible while keeping the values at all other positions small.
        # Once the outputs and labels are built, training uses tf.nn.softmax_cross_entropy_with_logits().
        y_ = label_smoothing(tf.one_hot(
            y,
            depth=self.hp.vocab_size))  # label_smoothing smooths the one-hot labels
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        # Before computing the loss there is one more step: sentences shorter than maxlen were padded,
        # so the error at those padded positions is zeroed out.
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        # the loss is a cross entropy, with the contribution of padding removed from the computation.
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        # adjust the learning rate with warmup: lr rises gradually at the start and decays later in training
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)
        # finally the loss is optimized with the Adam optimizer.
        # tf.summary.scalar() stores values as key-value pairs for visualization in TensorBoard.
        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
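As a concrete illustration of the padding-masked averaging described in the comments above, here is a toy sketch with made-up numbers (not taken from any real model):

import tensorflow as tf

# Toy batch of 2 sequences of length 4, where id 0 marks <pad>.
y = tf.constant([[5, 7, 2, 0],
                 [3, 0, 0, 0]])
ce = tf.constant([[1.0, 2.0, 3.0, 9.0],
                  [4.0, 9.0, 9.0, 9.0]])      # per-token cross entropy
nonpadding = tf.to_float(tf.not_equal(y, 0))  # [[1, 1, 1, 0], [1, 0, 0, 0]]
loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
# loss == (1 + 2 + 3 + 4) / 4 == 2.5; the 9.0 values at padded positions never contribute.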
Example 15
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # build the encoder and decoder
        memory, sents1 = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory)

        # train scheme
        y_ = label_smoothing(tf.one_hot(
            y, depth=self.hp.vocab_size))  # batch_size * T * vocab_size
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, labels=y_)  # logits are not softmaxed here; the function applies softmax internally
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"])
                                 )  # 0: <pad>; acts as a mask so padding in variable-length sequences is excluded from the loss
        # compute the average loss over the batch; 1e-7 guards against a zero denominator
        # ce * nonpadding keeps only the non-padding loss
        # dividing by tf.reduce_sum(nonpadding) gives the per-token average over the whole batch
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example 16
    def batch_split_train(self, xs, ys, split_num=4):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        #xs_split=tf.split(xs,gpu_num)
        #ys_split=tf.split(ys,gpu_num)
        #print(xs)
        #print(ys)
        #xs_split=[]
        #ys_split=[]
        #batchsize=self.hp.batch_size
        '''
        divided_batch_size=batchsize//split_num
        for i in range(split_num):
            start=divided_batch_size*i
            end=start+divided_batch_size
            xs_split.append((xs[0][start:end],xs[1][start:end],xs[2][start:end]))
            ys_split.append((ys[0][start:end],ys[1][start:end],ys[2][start:end],ys[3][start:end]))
        '''
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step // split_num,
                         self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        #models=[]
        #for i in range(split_num):

        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)
        # train scheme

        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        if self.hp.io_tie and self.hp.ortho_embedding:
            lmb = self.hp.ortho_lambda
            normlevel = self.hp.ortho_reg_norm
            if not self.hp.fac_embed:
                real_embedding = self.embeddings[1:, :]
                if not (self.hp.norm_embedding
                        or self.embedding_normalization):
                    loss = loss + (tf.norm(tf.subtract(
                        tf.matmul(tf.transpose(real_embedding),
                                  real_embedding),
                        tf.scalar_mul(tf.constant(2.0, dtype=tf.float32),
                                      tf.eye(self.hp.d_model))),
                                           ord=normlevel)**2) * lmb
                else:

                    wtw = tf.matmul(tf.transpose(real_embedding),
                                    real_embedding)
                    wtw_diag = tf.linalg.diag(tf.linalg.diag_part(wtw))
                    loss = loss + (tf.norm(tf.subtract(wtw, wtw_diag))**
                                   2) * lmb
            else:
                loss = loss + (tf.norm(tf.subtract(
                    tf.matmul(self.embeddings2, tf.transpose(
                        self.embeddings2)), tf.eye(self.hp.d_embed)),
                                       ord=normlevel)**2) * lmb
                #loss=loss+tf.norm(tf.subtract( tf.matmul( tf.transpose(self.embeddings1), self.embeddings1),tf.eye(self.hp.d_embed) ) ,ord=normlevel)*lmb#if not good, delete this

        grads = optimizer.compute_gradients(loss)
        self.steps.append((loss, grads))
        if len(self.steps) == split_num:
            tower_losses, tower_grads = zip(*self.steps)

            train_op = optimizer.apply_gradients(
                average_gradients(tower_grads), global_step=global_step)
            self.steps = []
        else:
            train_op = optimizer.apply_gradients([], global_step=global_step)
        #aver_loss=tf.reduce_mean(tower_losses)
        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)
        summaries = tf.summary.merge_all()
        return loss, train_op, global_step, summaries
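batch_split_train above relies on an average_gradients helper that is not shown. Below is a minimal sketch consistent with how it is called here, assuming dense gradients; the helper actually used in the repository may differ.

def average_gradients(tower_grads):
    # tower_grads: one list of (gradient, variable) pairs per split,
    # as returned by optimizer.compute_gradients(). Returns a single averaged list.
    averaged = []
    for grads_and_vars in zip(*tower_grads):   # group the entries variable by variable
        var = grads_and_vars[0][1]
        grads = [g for g, _ in grads_and_vars if g is not None]
        if not grads:
            averaged.append((None, var))
            continue
        mean_grad = tf.reduce_mean(tf.stack(grads, axis=0), axis=0)
        averaged.append((mean_grad, var))
    return averaged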
Example 17
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            # id = 2 stands for <S>, the initial decoder input. This step shifts the normal y tensor, e.g.
            # y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]] becomes
            # [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]], which is the first thing fed to the decoder's self-attention.
            # At training time decoder_inputs is as above; at inference time the true y is unknown, so y is an all-zero
            # tensor of shape [batch_size, max_length]. After the shift it becomes [["<s>", 0, 0, 0]]; each step keeps the first
            # prediction, feeds it back in, then keeps the first two, and so on.
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # row id 0 is the padding embedding; True forces that row to zeros (a random init may not be zero)
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks, stacked num_blocks (6) times
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                # Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                # Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection: a classification whose number of classes is the target vocabulary size
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # label smoothing turns the 0s of the one-hot vector into a small value and the 1 into a value close to 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
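The decoder-input construction described in the comments above is just a right shift of the target sequence with the <S> id (2) prepended; a toy sketch:

import tensorflow as tf

y = tf.constant([[11, 12, 13, 14],
                 [21, 22, 23, 24]])
decoder_inputs = tf.concat((tf.ones_like(y[:, :1]) * 2, y[:, :-1]), -1)
# decoder_inputs == [[2, 11, 12, 13],
#                    [2, 21, 22, 23]]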
Example 18
    def __init__(self, training=True):
        self.graph = tf.Graph()
        with self.graph.as_default():
            if training:
                self.x, self.y, self.num_batch = get_batch()
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.max_len))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.max_len))
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

            de2idx, idx2de = load_data.load_vocab(
                './preprocessed/de.vocab.tsv')
            en2idx, idx2en = load_data.load_vocab(
                './preprocessed/en.vocab.tsv')

            self.embedding = get_token_embeddings(len(de2idx),
                                                  hp.hidden_units,
                                                  zero_pad=True)

            with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
                self.enc = tf.nn.embedding_lookup(self.embedding, self.x)
                # scale
                self.enc *= hp.hidden_units**0.5
                # positional encoding
                self.enc += positional_encoding(self.enc)
                self.enc = tf.layers.dropout(self.enc,
                                             hp.dropout_rate,
                                             training=training)
                for i in range(hp.num_blocks):
                    with tf.variable_scope('num_blocks_{}'.format(i),
                                           reuse=tf.AUTO_REUSE):
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            values=self.enc,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            training=training,
                            causality=False)
                        self.enc = ff(self.enc,
                                      num_units=[hp.d_ff, hp.hidden_units])

            with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
                self.dec = tf.nn.embedding_lookup(self.embedding,
                                                  self.decoder_inputs)
                self.dec *= hp.hidden_units**0.5
                self.dec += positional_encoding(self.dec)
                self.dec = tf.layers.dropout(self.dec,
                                             hp.dropout_rate,
                                             training=training)
                for i in range(hp.num_blocks):
                    with tf.variable_scope('num_block_{}'.format(i),
                                           reuse=tf.AUTO_REUSE):
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            values=self.dec,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            training=training,
                            causality=True,
                            scope='self_attention')
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            values=self.enc,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            training=training,
                            causality=False,
                            scope='vanilla_attention')
                        self.dec = ff(self.dec,
                                      num_units=[hp.d_ff, hp.hidden_units])
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) *
                self.istarget) / (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)
            if training:
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / tf.reduce_sum(self.istarget)

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example 19
    def build(self):
        position_encoding_outputs = modules.position_encoding(
            self.x_input, args.position_size)
        if args.position_encoding_type == 'add':
            outputs = position_encoding_outputs + self.x_input
        elif args.position_encoding_type == "concat":
            outputs = tf.concat([self.x_input, position_encoding_outputs],
                                axis=2)

        for i in range(6):
            sublayer1 = modules.multi_head_attention(
                outputs,
                outputs,
                outputs,
                args.head_num,
                args.head_size,
                self.dropout,
                self.training,
                type=args.attention_unit_type)
            self.mhas.append(sublayer1)
            outputs = modules.residual_connection(outputs, sublayer1,
                                                  self.training)
            sublayer2 = modules.feed_forward(outputs, args.feed_forward_size,
                                             self.dropout, self.training)
            outputs = modules.residual_connection(outputs, sublayer2,
                                                  self.training)

        outputs = tf.layers.dense(outputs,
                                  1,
                                  use_bias=True,
                                  name='last_output')
        outputs = tf.squeeze(outputs, -1)  # (batch_size, seqlen)
        outputs = tf.layers.dense(outputs, args.nlabel, name='output_logit')

        self.logits = outputs
        self.logits_softmax = tf.nn.softmax(outputs,
                                            name='output_logit_softmax')

        if self.training is not None:
            util.params_usage(tf.trainable_variables())

        y = tf.one_hot(self.y_true, args.nlabel)
        self.y_smooth = modules.label_smoothing(y)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y_smooth,
                                                          logits=self.logits)
        self.loss = tf.reduce_mean(loss)
        self.global_step = tf.train.get_or_create_global_step()
        self.lr = modules.noam_scheme(args.eta,
                                      global_step=self.global_step,
                                      warmup_steps=args.warmup)
        train_op = tf.compat.v1.train.AdamOptimizer(self.lr).minimize(
            self.loss, global_step=self.global_step)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        self.train_op = tf.group([train_op, update_ops])

        self.y_pred = tf.argmax(self.logits_softmax, axis=1, name="y_pred")
        pred_prob = tf.equal(tf.cast(self.y_pred, tf.int32), self.y_true)
        self.accuracy = tf.reduce_mean(tf.cast(pred_prob, tf.float32),
                                       name="accuracy")

        tf.compat.v1.summary.scalar('accuracy', self.accuracy)
        tf.compat.v1.summary.scalar('loss', self.loss)
        tf.compat.v1.summary.scalar('learning rate', self.lr)
        self.merged_summary_op = tf.compat.v1.summary.merge_all()
Example 20
    def eval(self, xs, ys):
        '''Predicts autoregressively
        At inference, input ys is ignored.
        Returns
        y_hat: (N, T2)
        '''
        decoder_inputs, y, y_seqlen, sents2 = ys
        # decoder_inputs (N, 1)
        decoder_inputs = tf.ones(
            (tf.shape(xs[0])[0], 1), tf.int32) * self.token2idx["<s>"]
        ys = (decoder_inputs, y, y_seqlen, sents2)

        memory, sents1 = self.encode(xs, False)

        logging.info("Inference graph is being built. Please be patient.")
        for _ in tqdm(range(self.hp.maxlen2)):
            logits, y_hat, y, sents2 = self.decode(ys, memory, False)
            # if tf.reduce_sum(y_hat, 1) == self.token2idx["<pad>"] or \
            #     tf.reduce_sum(y_hat, 1) == self.token2idx["<s>"]: break
            # # # print('y_hat.shape = ', y_hat.shape)

            _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)

            # print('_decoder_inputs.shape =', _decoder_inputs.shape)
            _decoder_inputs = tf.cond(
                tf.cast(
                    tf.reduce_sum(y_hat, 1) == self.token2idx["<pad>"],
                    tf.bool), lambda: _decoder_inputs, lambda: tf.concat(
                        (decoder_inputs, y_hat), 1))
            ys = (_decoder_inputs, y, y_seqlen, sents2)
            # print('ys =', ys)
        # loss
        # logits, y_hat, y, sents2 = self.decode(ys, memory, False)
        # _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)

        # def cond(_decoder_inputs, y, y_seqlen, sents2, memory, y_hat, logits):
        #     return tf.reduce_sum(y_hat, 1) == self.token2idx["<pad>"] or \
        #     tf.reduce_sum(y_hat, 1) == self.token2idx["<s>"]
        # def body(_decoder_inputs, y, y_seqlen, sents2, memory, y_hat, logits):
        #     _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)
        #     ys = (_decoder_inputs, y, y_seqlen, sents2)
        #     logits, y_hat, y, sents2 = self.decode(ys, memory, False)
        #     return _decoder_inputs, y, y_seqlen, sents2, memory, y_hat, logits

        # _decoder_inputs, y, y_seqlen, sents2, memory, y_hat, logits = \
        #     tf.while_loop(cond, body,
        #     [_decoder_inputs, y, y_seqlen, sents2, memory, y_hat, logits],
        #     shape_invariants=[
        #     tf.TensorShape([None, None]), y.get_shape(), y_seqlen.get_shape(),
        #     sents2.get_shape(), memory.get_shape(), tf.TensorShape([None, None]),
        #     tf.TensorShape([None, None, self.hp.vocab_size])
        #     ])

        shape_pri = tf.print('eval logits.shape, y.shape =', tf.shape(logits),
                             tf.shape(y))
        # with tf.control_dependencies([shape_pri]):
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        # logits = tf.Print(logits, [logits], message='logits =', summarize=10)
        # y_ = tf.Print(y_, [y_], message='y_ =', summarize=10)
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits[:, :tf.shape(y_)[1], :], labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        # monitor a random sample
        n = tf.random_uniform((), 0, tf.shape(y_hat)[0] - 1, tf.int32)
        sent1 = sents1[n]
        pred = convert_idx_to_token_tensor(y_hat[n], self.idx2token)
        sent2 = sents2[n]

        tf.summary.text("sent1", sent1)
        tf.summary.text("pred", pred)
        tf.summary.text("sent2", sent2)
        summaries = tf.summary.merge_all()

        return y_hat, summaries, loss
Example 21
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                # x: (32, 10), y: (32, 10): one batch of 32 sentences, each of length 10
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            """
            定义decoder部分的input
            
             假设真实翻译后的输出为 i am a student </S>
             
             decoder部分的input应为: <S> i am a student
            """
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
                -1)  # 2代表<S>,是decoder的初始输入

            # vocabularies
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # keep the padding embedding at zero
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    vocab_size=hp.maxlen,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # label smoothing turns the 0s of the one-hot vector into a small value and the 1 into a value close to 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
Example 22
    def construct_network(self):
        """
        Constructs a variant of the multi-head attention labeller (MHAL)
        that does not use keys, queries and values, but only a simple form
        of additive attention, as proposed by Yang et al. (2016).
        """
        self.word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids")
        self.char_ids = tf.placeholder(tf.int32, [None, None, None],
                                       name="char_ids")
        self.sentence_lengths = tf.placeholder(tf.int32, [None],
                                               name="sentence_lengths")
        self.word_lengths = tf.placeholder(tf.int32, [None, None],
                                           name="word_lengths")
        self.sentence_labels = tf.placeholder(tf.float32, [None],
                                              name="sentence_labels")
        self.word_labels = tf.placeholder(tf.float32, [None, None],
                                          name="word_labels")

        self.word_objective_weights = tf.placeholder(
            tf.float32, [None, None], name="word_objective_weights")
        self.sentence_objective_weights = tf.placeholder(
            tf.float32, [None], name="sentence_objective_weights")

        self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
        self.is_training = tf.placeholder(tf.int32, name="is_training")
        self.loss = 0.0

        if self.config["initializer"] == "normal":
            self.initializer = tf.random_normal_initializer(stddev=0.1)
        elif self.config["initializer"] == "glorot":
            self.initializer = tf.glorot_uniform_initializer()
        elif self.config["initializer"] == "xavier":
            self.initializer = tf.glorot_normal_initializer()

        zeros_initializer = tf.zeros_initializer()

        self.word_embeddings = tf.get_variable(
            name="word_embeddings",
            shape=[len(self.word2id), self.config["word_embedding_size"]],
            initializer=(zeros_initializer if self.config["emb_initial_zero"]
                         else self.initializer),
            trainable=(True if self.config["train_embeddings"] else False))
        word_input_tensor = tf.nn.embedding_lookup(self.word_embeddings,
                                                   self.word_ids)

        if self.config["char_embedding_size"] > 0 and self.config[
                "char_recurrent_size"] > 0:
            with tf.variable_scope("chars"), tf.control_dependencies([
                    tf.assert_equal(tf.shape(self.char_ids)[2],
                                    tf.reduce_max(self.word_lengths),
                                    message="Char dimensions don't match")
            ]):
                self.char_embeddings = tf.get_variable(
                    name="char_embeddings",
                    shape=[
                        len(self.char2id), self.config["char_embedding_size"]
                    ],
                    initializer=self.initializer,
                    trainable=True)
                char_input_tensor = tf.nn.embedding_lookup(
                    self.char_embeddings, self.char_ids)

                char_input_tensor_shape = tf.shape(char_input_tensor)
                char_input_tensor = tf.reshape(
                    char_input_tensor,
                    shape=[
                        char_input_tensor_shape[0] *
                        char_input_tensor_shape[1], char_input_tensor_shape[2],
                        self.config["char_embedding_size"]
                    ])
                _word_lengths = tf.reshape(self.word_lengths,
                                           shape=[
                                               char_input_tensor_shape[0] *
                                               char_input_tensor_shape[1]
                                           ])

                char_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(
                    self.config["char_recurrent_size"],
                    use_peepholes=self.config["lstm_use_peepholes"],
                    state_is_tuple=True,
                    initializer=self.initializer,
                    reuse=False)
                char_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(
                    self.config["char_recurrent_size"],
                    use_peepholes=self.config["lstm_use_peepholes"],
                    state_is_tuple=True,
                    initializer=self.initializer,
                    reuse=False)

                # Concatenate the final forward and the backward character contexts
                # to obtain a compact character representation for each word.
                _, ((_, char_output_fw),
                    (_, char_output_bw)) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw=char_lstm_cell_fw,
                        cell_bw=char_lstm_cell_bw,
                        inputs=char_input_tensor,
                        sequence_length=_word_lengths,
                        dtype=tf.float32,
                        time_major=False)

                char_output_tensor = tf.concat(
                    [char_output_fw, char_output_bw], axis=-1)
                char_output_tensor = tf.reshape(
                    char_output_tensor,
                    shape=[
                        char_input_tensor_shape[0], char_input_tensor_shape[1],
                        2 * self.config["char_recurrent_size"]
                    ])

                # Include a char-based language modelling loss, LMc.
                if self.config["lm_cost_char_gamma"] > 0.0:
                    self.loss += self.config["lm_cost_char_gamma"] * \
                                 self.construct_lm_cost(
                                     input_tensor_fw=char_output_tensor,
                                     input_tensor_bw=char_output_tensor,
                                     sentence_lengths=self.sentence_lengths,
                                     target_ids=self.word_ids,
                                     lm_cost_type="separate",
                                     name="lm_cost_char_separate")

                if self.config["lm_cost_joint_char_gamma"] > 0.0:
                    self.loss += self.config["lm_cost_joint_char_gamma"] * \
                                 self.construct_lm_cost(
                                     input_tensor_fw=char_output_tensor,
                                     input_tensor_bw=char_output_tensor,
                                     sentence_lengths=self.sentence_lengths,
                                     target_ids=self.word_ids,
                                     lm_cost_type="joint",
                                     name="lm_cost_char_joint")

                if self.config["char_hidden_layer_size"] > 0:
                    char_output_tensor = tf.layers.dense(
                        inputs=char_output_tensor,
                        units=self.config["char_hidden_layer_size"],
                        activation=tf.tanh,
                        kernel_initializer=self.initializer)

                if self.config["char_integration_method"] == "concat":
                    word_input_tensor = tf.concat(
                        [word_input_tensor, char_output_tensor], axis=-1)
                elif self.config["char_integration_method"] == "none":
                    word_input_tensor = word_input_tensor
                else:
                    raise ValueError("Unknown char integration method")

        if self.config["dropout_input"] > 0.0:
            dropout_input = (self.config["dropout_input"] *
                             tf.cast(self.is_training, tf.float32) +
                             (1.0 - tf.cast(self.is_training, tf.float32)))
            word_input_tensor = tf.nn.dropout(word_input_tensor,
                                              dropout_input,
                                              name="dropout_word")

        word_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(
            self.config["word_recurrent_size"],
            use_peepholes=self.config["lstm_use_peepholes"],
            state_is_tuple=True,
            initializer=self.initializer,
            reuse=False)
        word_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(
            self.config["word_recurrent_size"],
            use_peepholes=self.config["lstm_use_peepholes"],
            state_is_tuple=True,
            initializer=self.initializer,
            reuse=False)

        with tf.control_dependencies([
                tf.assert_equal(tf.shape(self.word_ids)[1],
                                tf.reduce_max(self.sentence_lengths),
                                message="Sentence dimensions don't match")
        ]):
            (lstm_outputs_fw, lstm_outputs_bw), ((_, lstm_output_fw), (_, lstm_output_bw)) = \
                tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=word_lstm_cell_fw, cell_bw=word_lstm_cell_bw, inputs=word_input_tensor,
                    sequence_length=self.sentence_lengths, dtype=tf.float32, time_major=False)

        lstm_output_states = tf.concat([lstm_output_fw, lstm_output_bw],
                                       axis=-1)

        if self.config["dropout_word_lstm"] > 0.0:
            dropout_word_lstm = (self.config["dropout_word_lstm"] *
                                 tf.cast(self.is_training, tf.float32) +
                                 (1.0 - tf.cast(self.is_training, tf.float32)))
            lstm_outputs_fw = tf.nn.dropout(
                lstm_outputs_fw,
                dropout_word_lstm,
                noise_shape=tf.convert_to_tensor([
                    tf.shape(self.word_ids)[0], 1,
                    self.config["word_recurrent_size"]
                ],
                                                 dtype=tf.int32))
            lstm_outputs_bw = tf.nn.dropout(
                lstm_outputs_bw,
                dropout_word_lstm,
                noise_shape=tf.convert_to_tensor([
                    tf.shape(self.word_ids)[0], 1,
                    self.config["word_recurrent_size"]
                ],
                                                 dtype=tf.int32))
            lstm_output_states = tf.nn.dropout(lstm_output_states,
                                               dropout_word_lstm)

        # The forward and backward states are concatenated at every token position.
        lstm_outputs_states = tf.concat([lstm_outputs_fw, lstm_outputs_bw],
                                        axis=-1)

        if self.config["whidden_layer_size"] > 0:
            lstm_outputs_states = tf.layers.dense(
                lstm_outputs_states,
                self.config["whidden_layer_size"],
                activation=tf.tanh,
                kernel_initializer=self.initializer)

        if self.config["model_type"] == "last":
            processed_tensor = lstm_output_states
            token_scores = tf.layers.dense(
                lstm_outputs_states,
                units=len(self.label2id_tok),
                kernel_initializer=self.initializer,
                name="token_scores_last_lstm_outputs_ff")
            if self.config["hidden_layer_size"] > 0:
                processed_tensor = tf.layers.dense(
                    processed_tensor,
                    units=self.config["hidden_layer_size"],
                    activation=tf.tanh,
                    kernel_initializer=self.initializer)
            sentence_scores = tf.layers.dense(
                processed_tensor,
                units=len(self.label2id_sent),
                kernel_initializer=self.initializer,
                name="sentence_scores_last_lstm_outputs_ff")
        else:
            with tf.variable_scope("attention"):
                token_scores_list = []
                sentence_scores_list = []

                for i in range(len(self.label2id_tok)):
                    keys = tf.layers.dense(
                        lstm_outputs_states,
                        units=self.config["attention_evidence_size"],
                        activation=tf.tanh,
                        kernel_initializer=self.initializer)
                    values = tf.layers.dense(
                        lstm_outputs_states,
                        units=self.config["attention_evidence_size"],
                        activation=tf.tanh,
                        kernel_initializer=self.initializer)

                    token_scores_head = tf.layers.dense(
                        keys, units=1,
                        kernel_initializer=self.initializer)  # [B, M, 1]
                    token_scores_head = tf.reshape(
                        token_scores_head,
                        shape=tf.shape(self.word_ids))  # [B, M]
                    token_scores_list.append(token_scores_head)

                    if self.config["attention_activation"] == "sharp":
                        attention_weights_unnormalized = tf.exp(
                            token_scores_head)
                    elif self.config["attention_activation"] == "soft":
                        attention_weights_unnormalized = tf.sigmoid(
                            token_scores_head)
                    elif self.config["attention_activation"] == "linear":
                        attention_weights_unnormalized = token_scores_head
                    else:
                        raise ValueError(
                            "Unknown/unsupported token scoring method: %s" %
                            self.config["attention_activation"])
                    attention_weights_unnormalized = tf.where(
                        tf.sequence_mask(self.sentence_lengths),
                        attention_weights_unnormalized,
                        tf.zeros_like(attention_weights_unnormalized))
                    attention_weights = attention_weights_unnormalized / tf.reduce_sum(
                        attention_weights_unnormalized, axis=1,
                        keep_dims=True)  # [B, M]
                    processed_tensor = tf.reduce_sum(
                        values * attention_weights[:, :, numpy.newaxis],
                        axis=1)  # [B, E]

                    if self.config["hidden_layer_size"] > 0:
                        processed_tensor = tf.layers.dense(
                            processed_tensor,
                            units=self.config["hidden_layer_size"],
                            activation=tf.tanh,
                            kernel_initializer=self.initializer)

                    sentence_score_head = tf.layers.dense(
                        processed_tensor,
                        units=1,
                        kernel_initializer=self.initializer,
                        name="output_ff_head_%d" % i)  # [B, 1]
                    sentence_score_head = tf.reshape(
                        sentence_score_head,
                        shape=[tf.shape(processed_tensor)[0]])  # [B]
                    sentence_scores_list.append(sentence_score_head)

                token_scores = tf.stack(token_scores_list,
                                        axis=-1)  # [B, M, H]
                all_sentence_scores = tf.stack(sentence_scores_list,
                                               axis=-1)  # [B, H]

                if len(self.label2id_tok) != len(self.label2id_sent):
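                    # With exactly two sentence labels, combine the default head's score with
                    # the maximum over the non-default heads; otherwise a dense layer maps the
                    # per-head scores to the sentence label space.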
                    if len(self.label2id_sent) == 2:
                        default_sentence_score = tf.gather(all_sentence_scores,
                                                           indices=[0],
                                                           axis=1)  # [B, 1]
                        maximum_non_default_sentence_score = tf.gather(
                            all_sentence_scores,
                            indices=list(range(1, len(self.label2id_tok))),
                            axis=1)  # [B, num_heads-1]
                        maximum_non_default_sentence_score = tf.reduce_max(
                            maximum_non_default_sentence_score,
                            axis=1,
                            keep_dims=True)  # [B, 1]
                        sentence_scores = tf.concat(
                            [
                                default_sentence_score,
                                maximum_non_default_sentence_score
                            ],
                            axis=-1,
                            name="sentence_scores_concatenation")  # [B, 2]
                    else:
                        sentence_scores = tf.layers.dense(
                            all_sentence_scores,
                            units=len(self.label2id_sent),
                            kernel_initializer=self.initializer
                        )  # [B, num_sent_labels]
                else:
                    sentence_scores = all_sentence_scores

        # Mask the token scores that do not fall in the range of the true sentence length.
        # Do this for each head (change shape from [B, M] to [B, M, num_heads]).
        tiled_sentence_lengths = tf.tile(
            input=tf.expand_dims(tf.sequence_mask(self.sentence_lengths),
                                 axis=-1),
            multiples=[1, 1, len(self.label2id_tok)])
        self.token_probabilities = tf.nn.softmax(token_scores, axis=-1)
        self.token_probabilities = tf.where(
            tiled_sentence_lengths, self.token_probabilities,
            tf.zeros_like(self.token_probabilities))
        self.token_predictions = tf.argmax(self.token_probabilities, axis=2)

        self.sentence_probabilities = tf.nn.softmax(sentence_scores)
        self.sentence_predictions = tf.argmax(self.sentence_probabilities,
                                              axis=1)

        if self.config["word_objective_weight"] > 0:
            word_objective_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=token_scores,
                labels=tf.cast(self.word_labels, tf.int32))
            word_objective_loss = tf.where(
                tf.sequence_mask(self.sentence_lengths), word_objective_loss,
                tf.zeros_like(word_objective_loss))
            self.loss += self.config["word_objective_weight"] * tf.reduce_sum(
                self.word_objective_weights * word_objective_loss)

        if self.config["sentence_objective_weight"] > 0:
            self.loss += self.config[
                "sentence_objective_weight"] * tf.reduce_sum(
                    self.sentence_objective_weights *
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=sentence_scores,
                        labels=tf.cast(self.sentence_labels, tf.int32)))

        max_over_token_heads = tf.reduce_max(self.token_probabilities,
                                             axis=1)  # [B, H]
        one_hot_sentence_labels = tf.one_hot(tf.cast(self.sentence_labels,
                                                     tf.int32),
                                             depth=len(self.label2id_sent))
        if self.config["enable_label_smoothing"]:
            one_hot_sentence_labels_smoothed = label_smoothing(
                one_hot_sentence_labels,
                epsilon=self.config["smoothing_epsilon"])
        else:
            one_hot_sentence_labels_smoothed = one_hot_sentence_labels

        # At least one token has a label corresponding to the true sentence label.
        # This loss also pushes the maximums over the other heads towards 0 (but smoothed).
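        # Concretely: L1 = sum_b w_b * sum_c (m_{b,c} - y~_{b,c})^2, where m is the per-head
        # maximum token probability (collapsed to [default, max over non-default heads] when
        # the label sets differ), y~ the (smoothed) one-hot sentence label and w_b the
        # per-sentence objective weight.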
        if self.config["type1_attention_objective_weight"] > 0:
            this_max_over_token_heads = max_over_token_heads
            if len(self.label2id_tok) != len(self.label2id_sent):
                if len(self.label2id_sent) == 2:
                    max_default_head = tf.gather(max_over_token_heads,
                                                 indices=[0],
                                                 axis=-1)  # [B, 1]
                    max_non_default_head = tf.reduce_max(
                        tf.gather(max_over_token_heads,
                                  indices=list(range(1,
                                                     len(self.label2id_tok))),
                                  axis=-1),
                        axis=1,
                        keep_dims=True)  # [B, 1]
                    this_max_over_token_heads = tf.concat(
                        [max_default_head, max_non_default_head],
                        axis=-1)  # [B, 2]
                else:
                    raise ValueError(
                        "Unsupported attention loss for num_heads != num_sent_lables "
                        "and num_sentence_labels != 2.")
            self.loss += self.config["type1_attention_objective_weight"] * (
                tf.reduce_sum(self.sentence_objective_weights * tf.reduce_sum(
                    tf.square(this_max_over_token_heads -
                              one_hot_sentence_labels_smoothed),
                    axis=-1)))

        # The predicted distribution over the token labels (heads) should be similar to the
        # predicted distribution over the sentence representations.
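        # Concretely: L2 = sum_b w_b * sum_h (m_{b,h} - softmax(s_b)_h)^2, comparing the
        # per-head maximum token probability with the softmax over the per-head sentence scores.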
        if self.config["type2_attention_objective_weight"] > 0:
            all_sentence_scores_probabilities = tf.nn.softmax(
                all_sentence_scores)  # [B, H]
            self.loss += self.config["type2_attention_objective_weight"] * (
                tf.reduce_sum(self.sentence_objective_weights * tf.reduce_sum(
                    tf.square(max_over_token_heads -
                              all_sentence_scores_probabilities),
                    axis=-1)))

        # At least one token has a label corresponding to the true sentence label.
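        # Concretely: L3 = sum_b w_b * sum_c (m_{b,c} * y_{b,c} - y~_{b,c})^2, so only the head
        # of the true sentence label receives a gradient pushing its maximum towards ~1.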
        if self.config["type3_attention_objective_weight"] > 0:
            this_max_over_token_heads = max_over_token_heads
            if len(self.label2id_tok) != len(self.label2id_sent):
                if len(self.label2id_sent) == 2:
                    max_default_head = tf.gather(max_over_token_heads,
                                                 indices=[0],
                                                 axis=-1)  # [B, 1]
                    max_non_default_head = tf.reduce_max(
                        tf.gather(max_over_token_heads,
                                  indices=list(range(1,
                                                     len(self.label2id_tok))),
                                  axis=-1),
                        axis=1,
                        keep_dims=True)  # [B, 1]
                    this_max_over_token_heads = tf.concat(
                        [max_default_head, max_non_default_head],
                        axis=-1)  # [B, 2]
                else:
                    raise ValueError(
                        "Unsupported attention loss for num_heads != num_sent_lables "
                        "and num_sentence_labels != 2.")
            self.loss += self.config["type3_attention_objective_weight"] * (
                tf.reduce_sum(
                    self.sentence_objective_weights * tf.reduce_sum(tf.square(
                        (this_max_over_token_heads * one_hot_sentence_labels) -
                        one_hot_sentence_labels_smoothed),
                                                                    axis=-1)))

        # A sentence that has a default label, should only contain tokens labeled as default.
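        # Concretely: L4 = sum_b w_b * [label_b == default] * sum_m (p_{b,m,0} - 1)^2, pushing
        # the default head's probability towards 1 at every token of a default-label sentence.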
        if self.config["type4_attention_objective_weight"] > 0:
            default_head = tf.gather(self.token_probabilities,
                                     indices=[0],
                                     axis=-1)  # [B, M, 1]
            default_head = tf.squeeze(default_head, axis=-1)  # [B, M]
            self.loss += self.config["type4_attention_objective_weight"] * (
                tf.reduce_sum(
                    self.sentence_objective_weights *
                    tf.cast(tf.equal(self.sentence_labels, 0.0), tf.float32) *
                    tf.reduce_sum(
                        tf.square(default_head - tf.ones_like(default_head)),
                        axis=-1)))

        # Every sentence has at least one default label.
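        # Concretely: L5 = sum_b w_b * (max_m p_{b,m,0} - 1)^2, so each sentence should contain
        # at least one token that the default head labels with probability close to 1.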
        if self.config["type5_attention_objective_weight"] > 0:
            default_head = tf.gather(self.token_probabilities,
                                     indices=[0],
                                     axis=-1)  # [B, M, 1]
            max_default_head = tf.reduce_max(tf.squeeze(default_head, axis=-1),
                                             axis=-1)  # [B]
            self.loss += self.config["type5_attention_objective_weight"] * (
                tf.reduce_sum(self.sentence_objective_weights *
                              tf.square(max_default_head -
                                        tf.ones_like(max_default_head))))

        # Include a word-based language modelling loss, LMw.
        if self.config["lm_cost_lstm_gamma"] > 0.0:
            self.loss += self.config[
                "lm_cost_lstm_gamma"] * self.construct_lm_cost(
                    input_tensor_fw=lstm_outputs_fw,
                    input_tensor_bw=lstm_outputs_bw,
                    sentence_lengths=self.sentence_lengths,
                    target_ids=self.word_ids,
                    lm_cost_type="separate",
                    name="lm_cost_lstm_separate")

        if self.config["lm_cost_joint_lstm_gamma"] > 0.0:
            self.loss += self.config[
                "lm_cost_joint_lstm_gamma"] * self.construct_lm_cost(
                    input_tensor_fw=lstm_outputs_fw,
                    input_tensor_bw=lstm_outputs_bw,
                    sentence_lengths=self.sentence_lengths,
                    target_ids=self.word_ids,
                    lm_cost_type="joint",
                    name="lm_cost_lstm_joint")

        self.train_op = self.construct_optimizer(
            opt_strategy=self.config["opt_strategy"],
            loss=self.loss,
            learning_rate=self.learning_rate,
            clip=self.config["clip"])
        print("Notwork built.")