Example #1
    def _create_loss(self):
        print(
            'Creating loss... \nIt might take a couple of minutes depending on how many buckets you have.'
        )
        start = time.time()

        def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                self.cell,
                num_encoder_symbols=config.ENC_VOCAB,
                num_decoder_symbols=config.DEC_VOCAB,
                embedding_size=config.HIDDEN_SIZE,
                output_projection=self.output_projection,
                feed_previous=do_decode)

        if self.fw_only:
            self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                self.targets,
                self.decoder_masks,
                config.BUCKETS,
                lambda x, y: _seq2seq_f(x, y, True),
                softmax_loss_function=self.softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if self.output_projection:
                for bucket in range(len(config.BUCKETS)):
                    self.outputs[bucket] = [
                        tf.matmul(output, self.output_projection[0]) +
                        self.output_projection[1]
                        for output in self.outputs[bucket]
                    ]
        else:
            self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                self.targets,
                self.decoder_masks,
                config.BUCKETS,
                lambda x, y: _seq2seq_f(x, y, False),
                softmax_loss_function=self.softmax_loss_function)
        print('Time:', time.time() - start)

    def build_model(self):
        # Training outputs and losses.
        self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            self.targets,
            self.target_weights,
            self.buckets,
            lambda x, y: self.seq2seq_f(x, y, self.fwd_only),
            softmax_loss_function=self.softmax_loss_function
        )

        if self.fwd_only:
            # If we use output projection, we need to project outputs for decoding.
            if self.output_projection is not None:
                for b in xrange(len(self.buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, self.output_projection[0]) + self.output_projection[1]
                        for output in self.outputs[b]
                    ]

        # Gradients and SGD update operation for training the model.
        trainables = tf.trainable_variables()
        Seq2SeqModelTF.print_trainables(trainables)
        if not self.fwd_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.RMSPropOptimizer(self.lr)
            for b in xrange(len(self.buckets)):
                gradients = tf.gradients(self.losses[b], trainables)

                clipped_gradients, norm = tf.clip_by_global_norm(gradients, self.mx_grad_nrm)

                self.gradient_norms.append(norm)

                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, trainables), global_step=self.global_step)
                )
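
The two methods above assume that self.output_projection and self.softmax_loss_function were built earlier in the class. As a hedged illustration only (the helper name and shapes below are assumptions, not part of the snippet), such a pair is typically created with a sampled-softmax setup like this, mirroring the later examples on this page:

import tensorflow as tf

def build_sampled_softmax(hidden_size, vocab_size, num_samples):
    # Hypothetical helper: the projection maps decoder outputs (hidden_size) up to the
    # full vocabulary, and the loss samples only num_samples classes per step.
    w = tf.get_variable('proj_w', [hidden_size, vocab_size])
    w_t = tf.transpose(w)
    b = tf.get_variable('proj_b', [vocab_size])

    def sampled_loss(labels, logits):
        labels = tf.reshape(labels, [-1, 1])
        return tf.nn.sampled_softmax_loss(
            weights=w_t, biases=b, labels=labels, inputs=logits,
            num_sampled=num_samples, num_classes=vocab_size)

    return (w, b), sampled_loss
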
Example #3
    def __init__(self, buckets, dataset, params):

        logging.basicConfig(level=logging.INFO)
        logger = logging.getLogger('ChatBotLogger')
        super(ChatBot, self).__init__(logger=logger,
                                      buckets=buckets,
                                      dataset=dataset,
                                      params=params)

        if len(buckets) > 1:
            self.log.error(
                "ChatBot requires len(buckets) be 1 since tensorflow's"
                " model_with_buckets function is now deprecated and BROKEN. The only"
                "workaround is ensuring len(buckets) == 1. ChatBot apologizes."
                "ChatBot also wishes it didn't have to be this way. "
                "ChatBot is jealous that DynamicBot does not have these issues."
            )
            raise ValueError(
                "Not allowed to pass buckets with len(buckets) > 1.")

        # ==========================================================================================
        # Define basic components: cell(s) state, encoder, decoder.
        # ==========================================================================================

        #cell =  tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.GRUCell(state_size)for _ in range(num_layers)])
        cell = tf.contrib.rnn.GRUCell(self.state_size)
        self.encoder_inputs = ChatBot._get_placeholder_list(
            "encoder", buckets[-1][0])
        self.decoder_inputs = ChatBot._get_placeholder_list(
            "decoder", buckets[-1][1] + 1)
        self.target_weights = ChatBot._get_placeholder_list(
            "weight", buckets[-1][1] + 1, tf.float32)
        target_outputs = [
            self.decoder_inputs[i + 1]
            for i in range(len(self.decoder_inputs) - 1)
        ]

        # If specified, sample from subset of full vocabulary size during training.
        softmax_loss, output_proj = None, None
        if 0 < self.num_samples < self.vocab_size:
            softmax_loss, output_proj = ChatBot._sampled_loss(
                self.num_samples, self.state_size, self.vocab_size)

        # ==========================================================================================
        # Combine the components to construct desired model architecture.
        # ==========================================================================================

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs):
            # Note: the returned function uses separate embeddings for encoded/decoded sets.
            #           Maybe try implementing same embedding for both.
            # Question: the outputs are projected to vocab_size NO MATTER WHAT.
            #           i.e. if output_proj is None, it uses its own OutputProjectionWrapper instead
            #           --> How does this affect our model?? A bit misleading imo.
            #with tf.variable_scope(scope or "seq2seq2_f") as seq_scope:
            return embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=self.vocab_size,
                num_decoder_symbols=self.vocab_size,
                embedding_size=self.state_size,
                output_projection=output_proj,
                feed_previous=self.is_chatting,
                dtype=tf.float32)

        # Note that self.outputs and self.losses are lists of length len(buckets).
        # This allows us to identify which outputs/losses to compute given a particular bucket.
        # Furthermore, \forall i < j, len(self.outputs[i])  < len(self.outputs[j]). (same for loss)
        self.outputs, self.losses = model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            target_outputs,
            self.target_weights,
            buckets,
            seq2seq_f,
            softmax_loss_function=softmax_loss)

        # If decoding, apply the output projection to the model outputs.
        if self.is_chatting and output_proj is not None:
            self.outputs = ChatBot._get_projections(len(buckets), self.outputs,
                                                    output_proj)

        with tf.variable_scope("summaries"):
            self.summaries = {}
            for i, loss in enumerate(self.losses):
                name = "loss{}".format(i)
                self.summaries[name] = tf.summary.scalar(
                    "loss{}".format(i), loss)
Example #5
    def setup_model(self):
        # sampled_softmax_loss function
        output_projection = None
        softmax_loss_function = None
        if num_samples < vocabulary_size:
            # w = tf.get_variable('proj_w', [hidden_size, vocabulary_size])
            w = tf.Variable(
                tf.truncated_normal([hidden_size, vocabulary_size], -0.1, 0.1))
            w_t = tf.transpose(w)
            # b = tf.get_variable('proj_b', [vocabulary_size])
            b = tf.Variable(tf.zeros([vocabulary_size]))
            output_projection = (w, b)

            def sampled_loss(labels, logits):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(weights=w_t,
                                                  biases=b,
                                                  labels=labels,
                                                  inputs=logits,
                                                  num_sampled=num_samples,
                                                  num_classes=vocabulary_size)

            softmax_loss_function = sampled_loss

        # multi-layer rnn cell
        # cell = rnn.BasicLSTMCell(hidden_size)
        cell = rnn.GRUCell(hidden_size)
        if num_layers > 1:
            cell = rnn.MultiRNNCell([cell] * num_layers)

        # feeds
        for i in xrange(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # seq2seq model structure
        def seq2seq_function(encoder_inputs, decoder_inputs, do_decode):
            return legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=vocabulary_size,
                num_decoder_symbols=vocabulary_size,
                embedding_size=hidden_size,
                output_projection=output_projection,
                feed_previous=do_decode)

        with tf.variable_scope("reusable_model"):
            self.outputs_train, self.losses_train = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_function(x, y, False),
                softmax_loss_function=softmax_loss_function)

        with tf.variable_scope("reusable_model", reuse=True):
            self.outputs_feedpre, self.losses_feedpre = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_function(x, y, True),
                softmax_loss_function=softmax_loss_function)
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    self.outputs_feedpre[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1]
                        for output in self.outputs_feedpre[b]
                    ]

        # train op
        params = tf.trainable_variables()
        for b in xrange(len(buckets)):
            gradients = tf.gradients(self.losses_train[b], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.train_ops.append(
                tf.train.GradientDescentOptimizer(
                    self.learning_rate).apply_gradients(
                        zip(clipped_gradients, params)))

        # saver
        self.saver = tf.train.Saver(max_to_keep=1)
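
A rough usage sketch for one training step of the model above, following the feed pattern of the original translate tutorial; model, session and the *_batch arrays are assumed names, not part of the snippet:

import numpy as np

# model: an object whose setup_model() above has run; session: a tf.Session;
# encoder_batch / decoder_batch / weight_batch: lists of [batch_size] numpy arrays.
bucket_id = 0
encoder_size, decoder_size = buckets[bucket_id]
feed = {}
for i in range(encoder_size):
    feed[model.encoder_inputs[i].name] = encoder_batch[i]
for i in range(decoder_size):
    feed[model.decoder_inputs[i].name] = decoder_batch[i]
    feed[model.target_weights[i].name] = weight_batch[i]
# Targets are decoder_inputs shifted by one, so the extra decoder slot must still be fed.
feed[model.decoder_inputs[decoder_size].name] = np.zeros(encoder_batch[0].shape, dtype=np.int32)
_, step_loss = session.run([model.train_ops[bucket_id], model.losses_train[bucket_id]],
                           feed_dict=feed)
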
Example #6
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False):

        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        output_projection = None
        softmax_loss_function = None

        if num_samples > 0 and num_samples < self.target_vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.target_vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (w, b)

            def sampled_loss(labels, logits):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(weights=w_t,
                                                      biases=b,
                                                      labels=labels,
                                                      inputs=logits,
                                                      num_sampled=num_samples,
                                                      num_classes=self.target_vocab_size)

            softmax_loss_function = sampled_loss

        #single_cell = rnn_cell.GRUCell(size)
        single_cell = rnn.GRUCell(size)
        if use_lstm:
            #single_cell = rnn_cell.BasicLSTMCell(size)
            single_cell = rnn.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            #cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
            cell = rnn.MultiRNNCell([single_cell] * num_layers)

        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            #return seq2seq.embedding_attention_seq2seq(
            return legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                source_vocab_size,
                target_vocab_size,
                256,
                output_projection=output_projection,
                feed_previous=do_decode)

        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        if forward_only:
            #self.outputs, self.losses = seq2seq.model_with_buckets(
            self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                #self.target_weights, buckets, self.target_vocab_size,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)

            if output_projection is not None:
                for b in xrange(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            #self.outputs, self.losses = seq2seq.model_with_buckets(
            self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                #self.target_weights, buckets, self.target_vocab_size,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
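
A small hedged usage note: the learning_rate_decay_op defined above is usually run from the training loop when the loss plateaus, as in the original translate.py tutorial; previous_losses, step_loss, sess and model are assumed names:

# Decay the learning rate if the loss did not improve over the last few checkpoints.
if len(previous_losses) > 2 and step_loss > max(previous_losses[-3:]):
    sess.run(model.learning_rate_decay_op)
previous_losses.append(step_loss)
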
    def __init__(
            self,
            source_vocab_size,  # size of the source vocabulary
            target_vocab_size,  # size of the target vocabulary
            buckets,  # list of (encoder length, decoder length) bucket sizes
            size,  # number of units per LSTM layer, i.e. the LSTM output dimension
            dropout,  # dropout keep probability
            num_layers,  # number of network layers
            max_gradient_norm,  # maximum gradient norm for clipping
            batch_size,  # batch size
            learning_rate,  # learning rate
            num_samples,  # number of sampled classes for the sampled softmax
            forward_only=False,  # forward pass only, i.e. no training
            dtype=tf.float32):
        # init member variables
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        # LSTM cells (MultiRNNCell with num_layers layers)
        cell = rnn.BasicLSTMCell(size)
        cell = rnn.DropoutWrapper(cell, output_keep_prob=dropout)
        cell = rnn.MultiRNNCell([cell] * num_layers)
        # Dictionary whose default value is an empty list (bucket index -> summary ops)
        self.bucket_to_summary_list = defaultdict(list)

        # Output projection
        output_projection = None
        # Cross-entropy loss function, using sampled softmax (negative sampling)
        softmax_loss_function = None

        # If the vocabulary is too large, computing the softmax over the full vocabulary would blow up memory
        if num_samples > 0 and num_samples < self.target_vocab_size:
            print('Enabling output projection, num_samples: {}'.format(num_samples))
            # Projection weights: [target_vocab_size, size]
            w_t = tf.get_variable("proj_w",
                                  dtype=dtype,
                                  shape=[self.target_vocab_size, size])
            # Transpose to shape [size, target_vocab_size]
            w = tf.transpose(w_t)
            # Projection bias b
            b = tf.get_variable("proj_b", [self.target_vocab_size],
                                dtype=dtype)

            # During inference the projection maps from the small hidden size up to the vocabulary
            output_projection = (w, b)  # only used at inference time, not during training

            # Loss function
            def sampled_loss(labels, logits):
                labels = tf.reshape(labels, [-1, 1])
                # Training may be configured for fp16, so cast everything to fp32 here for numerical stability
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(logits, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        weights=local_w_t,
                        biases=local_b,
                        labels=labels,
                        inputs=local_inputs,  # logits
                        num_sampled=num_samples,
                        num_classes=self.target_vocab_size),
                    dtype)

            softmax_loss_function = sampled_loss

        # seq2seq_f: forward operation of the seq2seq model
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            print("Building the seq2seq graph for the current bucket...")
            # Deep-copy the cell first: encoder and decoder are two structurally identical
            # models that do not share parameters, so they must use two separate RNN cells
            tmp_cell = copy.deepcopy(cell)

            # cell: any of the usual RNNCell definitions can be used
            # num_encoder_symbols: source vocab size, used to define the encoder embedding matrix
            # num_decoder_symbols: target vocab size, used to define the decoder embedding matrix
            # embedding_size: dimensionality of the embedding vectors
            # num_heads: number of attention heads, i.e. how many attention weightings are combined to form the attention vector
            # output_projection: output mapping layer; the decoder output has dimension output_size, so a projection is needed to reach num_decoder_symbols (inference only)
            # feed_previous: whether to feed the previous step's output as the next input; usually True at test time, in which case only the first element of decoder_inputs is used (inference only)

            return legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                tmp_cell,  # custom cell: GRU/LSTM, possibly multi-layer, etc.
                num_encoder_symbols=source_vocab_size,  # source vocabulary size
                num_decoder_symbols=target_vocab_size,  # target vocabulary size
                embedding_size=size,  # embedding dimension
                # Without output_projection the output dimension can be huge (vocabulary-sized);
                # with it, decoder outputs stay low-dimensional and are projected up afterwards.
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype)

        print("开始构建模型输入占位符.....")
        # inputs
        self.encoder_inputs = []  # 编码器输入
        self.decoder_inputs = []  # 解码器输入
        self.decoder_weights = []  # Loss损失函数计算的权重系数
        # encoder_inputs 这个列表对象中的每一个元素表示一个占位符,起名字分别为enconder0,encoder1....encoder{i}的几何意义是编码器再时刻i的输入
        # buckets中的最后一个是最大的(即第“-1”个)
        for i in range(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='encoder_input_{}'.format(i)))
        # The decoder gets one extra position so that targets below can be shifted left by one (leaving room for the end-of-sequence symbol)
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='decoder_input_{}'.format(i)))
            self.decoder_weights.append(
                tf.placeholder(dtype,
                               shape=[None],
                               name='decoder_weight_{}'.format(i)))
        targets = [self.decoder_inputs[i + 1] for i in range(buckets[-1][1])]

        print("开始构建模型....")
        # 跟language model类似,targets变量是decoder inputs 平移一个单位的结果,
        # encoder Inputs :encoder的输入,一个tensor的列表,列表中每一项都是encoder时的一个词(batch)
        # decoder_inpits :decoder的输入,同上
        # targets :目标值,与decoder_inputs只相差一个<eos>符号,int32型
        # buckets :就是定义的bucket的值(编码器数据长度,解码器数据长度),是一个列表
        # seq2seq:定义好的seq2seq模型,可以使用后面介绍的embedding_attention_seq2seq,embedding_rnn_seq2seq,basic_rnn_seq2等
        # softmax_loss_fuction:计算误差的函数(labels,logits)默认为sqarse_softmax_cross_entroy_with_logits
        # per_example_loss:如果为真,则调用sequence_loss_by_example,返回一个列表,其每个元素就是一个样本的loss值,
        # 如果为假,则调用sequence_loss函数,对一个
        if forward_only:  # 测试阶段
            self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,  # encoder inputs
                self.decoder_inputs,  # decoder inputs
                targets,  # ground-truth values, only used when building the loss
                self.decoder_weights,  # decoder weights
                buckets,  # buckets
                lambda x, y: seq2seq_f(x, y, True),  # seq2seq function (feed_previous=True)
                softmax_loss_function=softmax_loss_function  # loss function
            )
            if output_projection is not None:
                for b in range(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            # Training stage.
            # Input lengths are split into intervals so that each sample only needs to be padded
            # up to its bucket's length rather than to the global maximum.
            # For example, with buckets = [(5, 10), (10, 20), (20, 30), ...] the first number of each
            # bucket is the padded source length and the second the padded target length,
            # e.g. '我爱你' -> 'I love you' would be assigned to the first bucket, with the source
            # padded to length 5 and the target to length 10. Each bucket is effectively one model
            # configuration: a graph is built per bucket, sequences are routed to the matching
            # bucket during training, and the per-bucket models share parameters.
            # Compare with dynamic_rnn, which pads each batch to the longest sample in that batch,
            # whereas bucketing clusters sequences by length during preprocessing.

            # model_with_buckets takes the same arguments as described above the forward_only branch;
            # with per_example_loss left False, sequence_loss is used and a single averaged loss
            # is returned for each bucket.
            self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,  # encoder inputs
                self.decoder_inputs,  # decoder inputs
                targets,  # ground-truth values, only used when building the loss
                self.decoder_weights,  # decoder weights
                buckets,  # buckets
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function  # loss function
            )
            # Add a loss summary for each bucket
            for b_idx in range(len(buckets)):
                bucket_loss_scalar = tf.summary.scalar('loss_{}'.format(b_idx),
                                                       self.losses[b_idx])
                self.bucket_to_summary_list[b_idx].append(bucket_loss_scalar)

        if not forward_only:  # gradients and parameter updates are only needed during training
            print("Building the optimizer...")
            # Collect all trainable parameters
            params = tf.trainable_variables()

            # Define the optimizer
            opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.gradient_norms = []
            self.updates = []
            # Iterate over each bucket's outputs and loss
            for output, loss in zip(self.outputs, self.losses):
                # Compute the gradients of this bucket's loss w.r.t. the parameter list
                gradients = tf.gradients(loss, params)
                # Clip the gradients to the given maximum global norm (max_gradient_norm)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)

                # Record the global norm and the parameter-update op
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params)))

        # Saver object for model checkpointing
        self.saver = tf.train.Saver(tf.global_variables(),
                                    write_version=tf.train.SaverDef.V2)
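
The bucketing comments above can be made concrete with a tiny stand-alone sketch (pure Python, no TensorFlow; the bucket sizes and PAD_ID value are illustrative assumptions):

PAD_ID = 0
buckets = [(5, 10), (10, 20), (20, 30)]

def pick_bucket_and_pad(source_ids, target_ids):
    # Choose the smallest bucket that fits both sequences, then pad up to its lengths.
    for bucket_id, (src_len, tgt_len) in enumerate(buckets):
        if len(source_ids) <= src_len and len(target_ids) <= tgt_len:
            padded_src = source_ids + [PAD_ID] * (src_len - len(source_ids))
            padded_tgt = target_ids + [PAD_ID] * (tgt_len - len(target_ids))
            return bucket_id, padded_src, padded_tgt
    raise ValueError("sequence longer than the largest bucket")

# A 3-token source / 3-token target pair lands in bucket 0 and is padded to lengths (5, 10).
print(pick_bucket_and_pad([4, 7, 9], [12, 15, 3]))
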
Example #8
    def __init__(self, use_lstm=False, num_samples=512, forward_only=False):
        self.source_vocab_size = config.vocabulary_size
        self.target_vocab_size = config.vocabulary_size
        self.buckets = config.BUCKETS
        self.batch_size = config.FLAGS.batch_size
        self.learning_rate = tf.Variable(float(config.FLAGS.learning_rate),
                                         trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * config.FLAGS.learning_rate_decay_factor)
        self.lsmt_size = config.FLAGS.lstm_size
        self.num_layers = config.FLAGS.num_layers
        self.dropout = config.FLAGS.dropout
        self.max_gradient_norm = config.FLAGS.max_gradient_norm
        self.global_step = tf.Variable(0, trainable=False)
        self.model_dir = config.model_dir

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None

        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w = tf.get_variable('proj_w',
                                [self.lsmt_size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable('proj_b', [self.target_vocab_size])
            output_projection = (w, b)

            def sampled_loss(labels, logits):
                labels = tf.reshape(
                    labels,
                    [-1, 1])  # Add one dimension (nb of true classes, here 1)

                # We need to compute the sampled_softmax_loss using 32bit floats to
                # avoid numerical instabilities.
                localWt = tf.cast(w_t, tf.float32)
                localB = tf.cast(b, tf.float32)
                localInputs = tf.cast(logits, tf.float32)

                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        localWt,  # Should have shape [num_classes, dim]
                        localB,
                        labels,
                        localInputs,
                        num_samples,  # The number of classes to randomly sample per batch
                        self.target_vocab_size),  # The number of classes
                    tf.float32)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_call = rnn.GRUCell(self.lsmt_size)
        if use_lstm:
            single_call = rnn.BasicLSTMCell(self.lsmt_size)

        if not forward_only:
            single_call = rnn.DropoutWrapper(single_call,
                                             input_keep_prob=1.0,
                                             output_keep_prob=self.dropout)

        cell = single_call
        if self.num_layers > 1:
            cell = rnn.MultiRNNCell([single_call] * self.num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            import copy
            temp_cell = copy.deepcopy(cell)
            return legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                temp_cell,
                num_encoder_symbols=self.source_vocab_size,
                num_decoder_symbols=self.target_vocab_size,
                embedding_size=self.lsmt_size,
                output_projection=output_projection,
                feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in range(self.buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))

        for i in range(self.buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in range(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)

            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in range(len(self.buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            params = tf.trainable_variables()
            for b in range(len(self.buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, self.max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables(), max_to_keep=3)
        self.mergedSummaries = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(config.graph_dir)
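
A hedged usage sketch for the summary writer set up above; the scalar summary ops themselves are assumed to be created elsewhere in the surrounding project, and sess, feed and step are assumed names:

# merge_all() returns None when no summary ops exist in the graph, so guard before running it.
if model.mergedSummaries is not None:
    summary_str = sess.run(model.mergedSummaries, feed_dict=feed)
    model.writer.add_summary(summary_str, global_step=step)
    model.writer.flush()
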
Example #9
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False):
        """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if 0 < num_samples < self.target_vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable(name='projection_w',
                                    shape=[size, self.target_vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable(name='projection_b',
                                    shape=[self.target_vocab_size])
            output_projection = (w, b)

            def sampled_loss(logits, labels):
                with tf.device("/cpu:0"):
                    # labels.shape = [batch_size], need to transform to [batch_size, num_true]
                    # where `num_true=1` (the number of target classes per training example).
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        weights=w_t,
                        biases=b,
                        labels=labels,
                        inputs=logits,
                        num_sampled=num_samples,
                        num_classes=self.target_vocab_size,
                        num_true=1)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = tf.nn.rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        # This is the generator of seq2seq models for different buckets.
        def seq2seq_f(encoder_inputs, decoder_inputs, feed_previous):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs=encoder_inputs,
                decoder_inputs=decoder_inputs,
                cell=cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=feed_previous)

        # Placeholders for all inputs: encoder, decoder, weights (to suppress loss for cells that receive the padding).
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                seq2seq=lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                seq2seq=lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
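
Once the outputs have been projected (the forward_only branch above), decoding is usually a greedy argmax over each step's logits. A hedged sketch in the spirit of the original tutorial; output_logits, EOS_ID and rev_vocab are assumed names:

import numpy as np

# output_logits: one [batch_size, target_vocab_size] array per decoder step,
# obtained by running model.outputs[bucket_id] for a batch of size 1.
token_ids = [int(np.argmax(logit, axis=1)) for logit in output_logits]
# Stop at the first end-of-sequence symbol, then map ids back to words.
if EOS_ID in token_ids:
    token_ids = token_ids[:token_ids.index(EOS_ID)]
print(" ".join(rev_vocab[i] for i in token_ids))
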
Example #10
    def build(self):
        """
        Build the model
        :return:
        """
        cprint("[*] Building model (G)", color="yellow")
        cell = tf.contrib.rnn.GRUCell(self.cfg.hidden_size)
        if self.cfg.num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell([cell] * self.cfg.num_layers)

        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_rnn_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=self.vocab_size_encoder,
                num_decoder_symbols=self.vocab_size_decoder,
                output_projection=self.output_projection,
                embedding_size=self.cfg.embedding_size,
                feed_previous=do_decode)

        with tf.variable_scope("seq2seq") as _:
            model_infos = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                self.targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_f(x, y, tf.logical_not(self.is_training)),
                softmax_loss_function=self.softmax_loss_function)

            self.outputs = model_infos[0][0]
            self.losses = model_infos[1][0]

        # Optimization :
        train_vars = tf.trainable_variables()

        # TODO: try Adam optimizer
        opt = tf.train.GradientDescentOptimizer(self.cfg.learning_rate)
        grads = tf.gradients(self.losses, train_vars)
        grads, _ = tf.clip_by_global_norm(grads, self.max_gradient_norm)
        grad_variances, fisher, sticky_weights = [], [], []
        update_grad_variances, update_fisher, replace_fisher, update_sticky_weights, restore_sticky_weights = \
            [], [], [], [], []
        ewc_losses = []
        for i, (g, v) in enumerate(zip(grads, train_vars)):
            print(g, v)
            with tf.variable_scope("grad_variance"):
                grad_variances.append(
                    tf.get_variable("gv_{}".format(v.name.replace(":", "_")),
                                    v.get_shape().as_list(),
                                    dtype=tf.float32,
                                    trainable=False,
                                    initializer=tf.zeros_initializer()))
                fisher.append(
                    tf.get_variable("fisher_{}".format(v.name.replace(
                        ":", "_")),
                                    v.get_shape().as_list(),
                                    dtype=tf.float32,
                                    trainable=False,
                                    initializer=tf.zeros_initializer()))
            with tf.variable_scope("sticky_weights"):
                sticky_weights.append(
                    tf.get_variable("sticky_{}".format(v.name.replace(
                        ":", "_")),
                                    v.get_shape().as_list(),
                                    dtype=tf.float32,
                                    trainable=False,
                                    initializer=tf.zeros_initializer()))
            update_grad_variances.append(
                tf.assign(
                    grad_variances[i], self.beta * grad_variances[i] +
                    (1 - self.beta) * g * g * self.batch_size))
            update_fisher.append(
                tf.assign(fisher[i], fisher[i] + grad_variances[i]))
            replace_fisher.append(tf.assign(fisher[i], grad_variances[i]))
            update_sticky_weights.append(tf.assign(sticky_weights[i], v))
            restore_sticky_weights.append(tf.assign(v, sticky_weights[i]))
            ewc_losses.append(
                tf.reduce_sum(tf.square(v - sticky_weights[i]) * fisher[i]))

        ewc_loss = self.losses + self.ewc_loss_coef * .5 * tf.add_n(ewc_losses)
        grads_ewc = tf.gradients(ewc_loss, train_vars)
        grads_ewc, _ = tf.clip_by_global_norm(grads_ewc,
                                              self.max_gradient_norm)

        self.sticky_weights = sticky_weights
        self.grad_variances = grad_variances

        with tf.control_dependencies(update_grad_variances):
            self.update_grad_variances = tf.no_op('update_grad_variances')

        with tf.control_dependencies(update_grad_variances):
            self.updates = tf.cond(
                tf.equal(self.ewc_loss_coef, tf.constant(0.)),
                lambda: opt.apply_gradients(zip(grads, train_vars),
                                            global_step=self.global_step),
                lambda: opt.apply_gradients(zip(grads_ewc, train_vars),
                                            global_step=self.global_step))

        with tf.control_dependencies(update_fisher):
            self.update_fisher = tf.no_op('update_fisher')
        with tf.control_dependencies(replace_fisher):
            self.replace_fisher = tf.no_op('replace_fisher')
        with tf.control_dependencies(update_sticky_weights):
            self.update_sticky_weights = tf.no_op('update_sticky_weights')
        with tf.control_dependencies(restore_sticky_weights):
            self.restore_sticky_weights = tf.no_op('restore_sticky_weights')
        cprint("[!] Model built", color="green")