Example 1
    def config(self, optimizer, decay, lr_v=None, momentum=None, clipping=False, max_gradient_norm=5.0):

        self.decay = decay
        print('Training preparation...')

        print('Defining loss...')
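        # One loss tensor is built per bucket: CRF loss when a transition
        # matrix is trained (self.crf > 0), otherwise sparse cross-entropy.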
        loss = []
        if self.crf > 0:
            loss_function = losses.crf_loss
            for i in range(len(self.input_v)):
                bucket_loss = losses.loss_wrapper(self.output[i], self.output_[i], loss_function,
                                                  transitions=self.transition_char, nums_tags=self.nums_tags,
                                                  batch_size=self.real_batches[i])
                loss.append(bucket_loss)
        else:
            loss_function = losses.sparse_cross_entropy
            for output, output_ in zip(self.output, self.output_):
                bucket_loss = losses.loss_wrapper(output, output_, loss_function)
                loss.append(bucket_loss)

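        # The learning rate is a placeholder so the caller can feed a
        # decayed value at each training step (see the `decay` argument).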
        l_rate = tf.placeholder(tf.float32, [], name='learning_rate_holder')
        self.l_rate = l_rate

        if optimizer == 'sgd':
            if momentum is None:
                optimizer = tf.train.GradientDescentOptimizer(learning_rate=l_rate)
            else:
                optimizer = tf.train.MomentumOptimizer(learning_rate=l_rate, momentum=momentum)
        elif optimizer == 'adagrad':
            assert lr_v is not None
            optimizer = tf.train.AdagradOptimizer(learning_rate=l_rate)
        elif optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        else:
            raise ValueError('Unknown optimizer: %s' % optimizer)

        self.train_step = []

        print('Computing gradients...')

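        # One training op is created per bucket. With clipping enabled, the
        # gradients are rescaled so their global norm stays within
        # max_gradient_norm.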
        for idx, l in enumerate(loss):
            t2 = time()
            if clipping:
                gradients = tf.gradients(l, self.params)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
                train_step = optimizer.apply_gradients(zip(clipped_gradients, self.params))
            else:
                train_step = optimizer.minimize(l)
            print('Bucket %d, %f seconds' % (idx + 1, time() - t2))
            self.train_step.append(train_step)
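
A minimal usage sketch for this variant, assuming a hypothetical Model class that exposes the attributes used above (train_step, l_rate) and TF 1.x graph mode; the constructor and feed values are illustrative only:

    import tensorflow as tf

    model = Model()  # hypothetical constructor, not part of the example
    model.config(optimizer='sgd', decay=0.05, lr_v=0.1, momentum=0.9,
                 clipping=True)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        feeds = {model.l_rate: 0.1}  # the caller decays this value over time
        # feeds would also need the bucket's input/target placeholders.
        sess.run(model.train_step[0], feed_dict=feeds)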
Example 2
    def config(self,
               optimizer,
               decay,
               lr_v=None,
               momentum=None,
               clipping=True,
               max_gradient_norm=5.0):

        self.decay = decay
        print('Training preparation...')

        print('Defining loss...')
        loss = []
        if self.crf > 0:
            loss_function = losses.crf_loss
            for i in range(len(self.input_v1)):
                bucket_loss = []
                # Split the bucket's batch across the available GPUs and
                # compute one CRF loss per tower.
                for j in range(self.num_gpus):
                    with tf.device('/gpu:{}'.format(j)):
                        start = j * self.real_batches[i]
                        end = (j + 1) * self.real_batches[i]
                        output_ = self.output_[i][0][start:end]
                        gpu_loss = losses.loss_wrapper(
                            [self.output[i][0][j]], [output_],
                            loss_function,
                            transitions=[self.transition_char],
                            nums_tags=[self.nums_tags],
                            batch_size=self.real_batches[i])
                        bucket_loss.append(gpu_loss)
                loss.append(bucket_loss)
        # The non-CRF (sparse cross-entropy) path of the single-GPU variant
        # is not supported here.

        l_rate = tf.placeholder(tf.float32, [], name='learning_rate_holder')
        self.l_rate = l_rate

        if optimizer == 'sgd':
            if momentum is None:
                optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=l_rate)
            else:
                optimizer = tf.train.MomentumOptimizer(learning_rate=l_rate,
                                                       momentum=momentum)
        elif optimizer == 'adagrad':
            assert lr_v is not None
            optimizer = tf.train.AdagradOptimizer(learning_rate=l_rate)
        elif optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        else:
            raise ValueError('Unknown optimizer: %s' % optimizer)

        self.train_step = []

        print('Computing gradients...')

        for idx, l in enumerate(loss):
            t2 = time()
            # Collect one gradient list per GPU tower.
            tower_grads = []
            for i in range(self.num_gpus):
                with tf.device('/gpu:{}'.format(i)):
                    tower_grads.append(tf.gradients(l[i], self.params))
            # Average the per-tower gradients, optionally clip their global
            # norm, and apply a single shared update.
            grads = self.average_gradients(tower_grads)
            if clipping:
                grads, norm = tf.clip_by_global_norm(grads, max_gradient_norm)
            train_step = optimizer.apply_gradients(
                list(zip(grads, self.params)))
            print('Bucket %d, %f seconds' % (idx + 1, time() - t2))
            self.train_step.append(train_step)
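
self.average_gradients is referenced above but its implementation is not shown. A minimal sketch of what it is assumed to do, given that tower_grads holds one gradient list per GPU, each aligned with self.params (None gradients and IndexedSlices are not handled):

    def average_gradients(self, tower_grads):
        # Transpose [gpu][param] -> [param][gpu], then average each
        # parameter's gradient across the towers.
        averaged = []
        for per_param_grads in zip(*tower_grads):
            grads = [tf.expand_dims(g, 0) for g in per_param_grads]
            averaged.append(tf.reduce_mean(tf.concat(grads, 0), 0))
        return averaged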
Example 3
    def config(self, optimizer, decay, lr_v=None, momentum=None, clipping=False, max_gradient_norm=5.0):

        """

        :param optimizer: 优化函数,Adagrad
        :param decay: 学习率衰减率,0.05
        :param lr_v:  学习率,0.1
        :param momentum:
        :param clipping: 是否运用梯度裁剪(给梯度设置最大阈值)
        :param max_gradient_norm:
        """
        self.decay = decay
        print('Training preparation...')

        print('Defining loss...')
        if self.crf > 0:
            loss_function = losses.crf_loss

            for i in range(len(self.input_v)):
                # Compute the CRF tagging loss and the language-model loss
                # for bucket i from its outputs and ground truth.
                tagging_loss, lm_loss = losses.loss_wrapper(self.output[i], self.output_[i], self.lm_predictions[i],
                                                            self.lm_groundtruthes[i], loss_function,
                                                            transitions=self.transition_char, nums_tags=self.nums_tags,
                                                            batch_size=self.real_batches[i])
                tagging_loss_summary = tf.summary.scalar('tagging loss %s' % i, tf.reduce_mean(tagging_loss))
                lm_loss_summary = tf.summary.scalar('language model loss %s' % i, tf.reduce_mean(lm_loss))
                self.losses.append(tagging_loss + lm_loss)
                self.summaries.append([tagging_loss_summary, lm_loss_summary])

        else:
            loss_function = losses.sparse_cross_entropy
            for output, output_ in zip(self.output, self.output_):
                bucket_loss = losses.loss_wrapper(output, output_, loss_function)
                self.losses.append(bucket_loss)

        l_rate = tf.placeholder(tf.float32, [], name='learning_rate_holder')
        self.l_rate = l_rate

        if optimizer == 'sgd':
            if momentum is None:
                optimizer = tf.train.GradientDescentOptimizer(learning_rate=l_rate)
            else:
                optimizer = tf.train.MomentumOptimizer(learning_rate=l_rate, momentum=momentum)
        elif optimizer == 'adagrad':
            assert lr_v is not None
            optimizer = tf.train.AdagradOptimizer(learning_rate=l_rate)
        elif optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        else:
            raise ValueError('Unknown optimizer: %s' % optimizer)

        self.train_steps = []

        print('Computing gradients...')

        for idx, l in enumerate(self.losses):
            t2 = time()
            if clipping:
                gradients = tf.gradients(l, self.params)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
                train_step = optimizer.apply_gradients(zip(clipped_gradients, self.params))
            else:
                train_step = optimizer.minimize(l)
            print('Bucket %d, %f seconds' % (idx + 1, time() - t2))
            self.train_steps.append(train_step)

        self.merged_summary = tf.summary.merge_all()
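
A hypothetical training-loop fragment showing how the summaries recorded above might be consumed (the log directory, bucket_id, step counter, sess, and feeds are assumptions, not part of the example):

    writer = tf.summary.FileWriter('logs', sess.graph)
    # Merging per bucket avoids merge_all(), which would require feeding
    # every bucket's placeholders at once.
    summary_op = tf.summary.merge(model.summaries[bucket_id])
    _, summary = sess.run([model.train_steps[bucket_id], summary_op],
                          feed_dict=feeds)
    writer.add_summary(summary, step)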
Example 4
    def config(self,
               optimizer,
               decay,
               lr_v=None,
               momentum=None,
               clipping=False,
               max_gradient_norm=5.0):
        """

        :param optimizer: 优化函数,Adagrad
        :param decay: 学习率衰减率,0.05
        :param lr_v:  学习率,0.1
        :param momentum:
        :param clipping: 是否运用梯度裁剪(给梯度设置最大阈值)
        :param max_gradient_norm:
        """
        self.decay = decay
        print('Training preparation...')

        print('Defining loss...')

        if self.crf > 0:
            for i in range(len(self.input_v)):
                # Compute the CRF tagging loss for bucket i from its outputs
                # and ground truth.
                tagging_loss = losses.loss_wrapper(
                    self.output[i],
                    self.output_[i],
                    losses.crf_loss,
                    transitions=self.transition_char,
                    nums_tags=self.nums_tags,
                    batch_size=self.real_batches[i])
                tagging_loss_summary = tf.summary.scalar(
                    'tagging loss %s' % i, tf.reduce_mean(tagging_loss))

                loss = tagging_loss
                loss_summary = [tagging_loss_summary]

                if self.co_train:
                    lm_loss = []
                    masks = tf.reshape(tf.cast(tf.sign(self.output_[i]),
                                               dtype=tf.float32),
                                       shape=[-1, self.buckets_char[i]])
                    for lm_fw_y, lm_fw_y_, lm_bw_y, lm_bw_y_ in zip(
                            self.lm_fw_predictions[i],
                            self.lm_fw_groundtruthes[i],
                            self.lm_bw_predictions[i],
                            self.lm_bw_groundtruthes[i]):
                        lm_fw_loss = tf.contrib.seq2seq.sequence_loss(
                            lm_fw_y, lm_fw_y_, masks)
                        lm_bw_loss = tf.contrib.seq2seq.sequence_loss(
                            lm_bw_y, lm_bw_y_, masks)
                        # lm_fw_loss = tf.reduce_sum(losses.sparse_cross_entropy(lm_fw_y, lm_fw_y_) * masks)
                        # lm_bw_loss = tf.reduce_sum(losses.sparse_cross_entropy(lm_bw_y, lm_bw_y_) * masks)
                        lm_loss.append(lm_fw_loss + lm_bw_loss)
                    lm_loss = tf.stack(lm_loss)
                    lm_loss_summary = tf.summary.scalar(
                        'language model loss %s' % i, tf.reduce_mean(lm_loss))

                    loss += self.lambda0 * lm_loss
                    loss_summary.append(lm_loss_summary)
                if self.char_freq_loss:
                    freq_loss = []
                    masks = tf.cast(tf.sign(self.output_[i]), dtype=tf.float32)
                    for freq_y, freq_y_ in zip(
                            self.char_freq_predictions[i],
                            self.char_freq_groundtruthes[i]):
                        freq_loss.append(
                            tf.losses.mean_squared_error(
                                freq_y_,
                                tf.reshape(freq_y, tf.shape(freq_y_)),
                                weights=tf.reshape(masks, tf.shape(freq_y_))))
                    freq_loss = tf.stack(freq_loss)
                    freq_loss_summary = tf.summary.scalar(
                        'char freq loss %s' % i, tf.reduce_mean(freq_loss))
                    loss += self.lambda1 * freq_loss
                    loss_summary.append(freq_loss_summary)
                self.losses.append(loss)
                self.summaries.append(loss_summary)

        else:
            # todo
            loss_function = losses.sparse_cross_entropy
            for output, output_ in zip(self.output, self.output_):
                bucket_loss = losses.loss_wrapper(output, output_,
                                                  loss_function)
                self.losses.append(bucket_loss)

        l_rate = tf.placeholder(tf.float32, [], name='learning_rate_holder')
        self.l_rate = l_rate

        if optimizer == 'sgd':
            if momentum is None:
                optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=l_rate)
            else:
                optimizer = tf.train.MomentumOptimizer(learning_rate=l_rate,
                                                       momentum=momentum)
        elif optimizer == 'adagrad':
            assert lr_v is not None
            optimizer = tf.train.AdagradOptimizer(learning_rate=l_rate)
        elif optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        else:
            raise ValueError('Unknown optimizer: %s' % optimizer)

        self.train_steps = []

        print('Computing gradients...')

        for idx, l in enumerate(self.losses):
            t2 = time()
            if clipping:
                gradients = tf.gradients(l, self.params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                train_step = optimizer.apply_gradients(
                    zip(clipped_gradients, self.params))
            else:
                train_step = optimizer.minimize(l)
            print('Bucket %d, %f seconds' % (idx + 1, time() - t2))
            self.train_steps.append(train_step)
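
For reference, a standalone sketch of the tf.contrib.seq2seq.sequence_loss call used for the language-model term above; the shapes and values are illustrative assumptions:

    import tensorflow as tf

    batch, steps, vocab = 2, 5, 100                     # illustrative sizes
    logits = tf.random_normal([batch, steps, vocab])    # LM predictions
    targets = tf.zeros([batch, steps], dtype=tf.int32)  # next-token ids
    weights = tf.ones([batch, steps])                   # 1.0 = token, 0.0 = pad
    lm_loss = tf.contrib.seq2seq.sequence_loss(logits, targets, weights)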