def config(self, optimizer, decay, lr_v=None, momentum=None,
           clipping=False, max_gradient_norm=5.0):
    self.decay = decay
    print 'Training preparation...'

    print 'Defining loss...'
    loss = []
    if self.crf > 0:
        # CRF loss: one loss op per bucket, sharing the transition matrix.
        loss_function = losses.crf_loss
        for i in range(len(self.input_v)):
            bucket_loss = losses.loss_wrapper(
                self.output[i], self.output_[i], loss_function,
                transitions=self.transition_char,
                nums_tags=self.nums_tags,
                batch_size=self.real_batches[i])
            loss.append(bucket_loss)
    else:
        # Per-token softmax cross-entropy when no CRF layer is used.
        loss_function = losses.sparse_cross_entropy
        for output, output_ in zip(self.output, self.output_):
            bucket_loss = losses.loss_wrapper(output, output_, loss_function)
            loss.append(bucket_loss)

    # The learning rate is fed at run time so it can be decayed between epochs.
    l_rate = tf.placeholder(tf.float32, [], name='learning_rate_holder')
    self.l_rate = l_rate

    if optimizer == 'sgd':
        if momentum is None:
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=l_rate)
        else:
            optimizer = tf.train.MomentumOptimizer(learning_rate=l_rate,
                                                   momentum=momentum)
    elif optimizer == 'adagrad':
        assert lr_v is not None
        optimizer = tf.train.AdagradOptimizer(learning_rate=l_rate)
    elif optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
    else:
        raise Exception('optimiser error')

    self.train_step = []
    print 'Computing gradients...'
    for idx, l in enumerate(loss):
        t2 = time()
        if clipping:
            # Clip by global norm to keep gradients below max_gradient_norm.
            gradients = tf.gradients(l, self.params)
            clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                             max_gradient_norm)
            train_step = optimizer.apply_gradients(zip(clipped_gradients,
                                                       self.params))
        else:
            train_step = optimizer.minimize(l)
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t2)
        self.train_step.append(train_step)
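# --- Hedged usage sketch (not part of the repo) ------------------------------
# How config() is typically driven: the decayed learning rate is fed through
# the l_rate placeholder and the per-bucket train op is selected by bucket id.
# `model`, `sess`, `batches`, and the inverse-time decay schedule are
# assumptions for illustration; only decay, l_rate, and train_step come from
# the method above.
def run_epoch_sketch(model, sess, batches, lr_v, epoch):
    lr = lr_v / (1.0 + model.decay * epoch)  # assumed decay schedule
    for bucket_id, feeds in batches:  # feeds: placeholder -> numpy batch
        feeds[model.l_rate] = lr
        sess.run(model.train_step[bucket_id], feed_dict=feeds)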
def config(self, optimizer, decay, lr_v=None, momentum=None,
           clipping=True, max_gradient_norm=5.0):
    self.decay = decay
    print 'Training preparation...'

    print 'Defining loss...'
    loss = []
    if self.crf > 0:
        loss_function = losses.crf_loss
        for i in range(len(self.input_v1)):
            bucket_loss = []
            for j in range(self.num_gpus):
                # Each GPU scores its own slice of the bucket's batch.
                with tf.device('/gpu:{}'.format(j)):
                    output_ = self.output_[i][0][j * self.real_batches[i]:
                                                 (j + 1) * self.real_batches[i]]
                    gpu_loss = losses.loss_wrapper(
                        [self.output[i][0][j]], [output_], loss_function,
                        transitions=[self.transition_char],
                        nums_tags=[self.nums_tags],
                        batch_size=self.real_batches[i])
                    bucket_loss.append(gpu_loss)
            loss.append(bucket_loss)
    # The non-CRF (sparse cross-entropy) path is not implemented in this
    # multi-GPU variant.

    # The learning rate is fed at run time so it can be decayed between epochs.
    l_rate = tf.placeholder(tf.float32, [], name='learning_rate_holder')
    self.l_rate = l_rate

    if optimizer == 'sgd':
        if momentum is None:
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=l_rate)
        else:
            optimizer = tf.train.MomentumOptimizer(learning_rate=l_rate,
                                                   momentum=momentum)
    elif optimizer == 'adagrad':
        assert lr_v is not None
        optimizer = tf.train.AdagradOptimizer(learning_rate=l_rate)
    elif optimizer == 'adam':
        # Pass the placeholder here as well, for consistency with the other
        # optimizers (the original variant silently fell back to Adam's
        # default learning rate).
        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
    else:
        raise Exception('optimiser error')

    self.train_step = []
    print 'Computing gradients...'
    for idx, l in enumerate(loss):
        t2 = time()
        tower_grads = []
        for i in range(self.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                if clipping:
                    gradients = tf.gradients(l[i], self.params)
                    tower_grads.append(gradients)
        # Average the per-tower gradients, clip by global norm, then apply
        # once. This variant assumes clipping=True; the unclipped minimize()
        # path is not implemented here.
        grads = self.average_gradients(tower_grads)
        clipped_gradients, norm = tf.clip_by_global_norm(grads,
                                                         max_gradient_norm)
        train_step = optimizer.apply_gradients(
            list(zip(clipped_gradients, self.params)))
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t2)
        self.train_step.append(train_step)
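# --- Hedged sketch of self.average_gradients (not shown in this section) -----
# A minimal implementation, assuming the conventional tower-averaging scheme
# from the TensorFlow multi-GPU tutorials. tower_grads is a list with one
# entry per GPU, each a list of gradients aligned with self.params; dense
# gradients are assumed (tf.concat would reject IndexedSlices from embedding
# lookups without extra handling).
def average_gradients(self, tower_grads):
    average_grads = []
    # Regroup so each tuple holds one parameter's gradient from every tower.
    for grads_per_param in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g in grads_per_param]
        # Averaging over the tower axis reproduces the single-GPU gradient.
        average_grads.append(tf.reduce_mean(tf.concat(grads, 0), 0))
    return average_grads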
def config(self, optimizer, decay, lr_v=None, momentum=None,
           clipping=False, max_gradient_norm=5.0):
    """
    :param optimizer: optimization algorithm, e.g. Adagrad
    :param decay: learning-rate decay rate, e.g. 0.05
    :param lr_v: initial learning rate, e.g. 0.1
    :param momentum: momentum for SGD (optional)
    :param clipping: whether to clip gradients by a maximum global norm
    :param max_gradient_norm: norm threshold used when clipping
    """
    self.decay = decay
    print 'Training preparation...'

    print 'Defining loss...'
    if self.crf > 0:
        loss_function = losses.crf_loss
        for i in range(len(self.input_v)):
            # Compute the CRF tagging loss and the auxiliary language-model
            # loss for bucket i from its outputs and ground truth.
            tagging_loss, lm_loss = losses.loss_wrapper(
                self.output[i], self.output_[i],
                self.lm_predictions[i], self.lm_groundtruthes[i],
                loss_function,
                transitions=self.transition_char,
                nums_tags=self.nums_tags,
                batch_size=self.real_batches[i])
            # Summary names use underscores: spaces are illegal in TF summary
            # names and would be rewritten with a warning.
            tagging_loss_summary = tf.summary.scalar(
                'tagging_loss_%s' % i, tf.reduce_mean(tagging_loss))
            lm_loss_summary = tf.summary.scalar(
                'language_model_loss_%s' % i, tf.reduce_mean(lm_loss))
            # Joint objective: tagging loss plus the language-model loss.
            self.losses.append(tagging_loss + lm_loss)
            self.summaries.append([tagging_loss_summary, lm_loss_summary])
    else:
        loss_function = losses.sparse_cross_entropy
        for output, output_ in zip(self.output, self.output_):
            bucket_loss = losses.loss_wrapper(output, output_, loss_function)
            self.losses.append(bucket_loss)

    # The learning rate is fed at run time so it can be decayed between epochs.
    l_rate = tf.placeholder(tf.float32, [], name='learning_rate_holder')
    self.l_rate = l_rate

    if optimizer == 'sgd':
        if momentum is None:
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=l_rate)
        else:
            optimizer = tf.train.MomentumOptimizer(learning_rate=l_rate,
                                                   momentum=momentum)
    elif optimizer == 'adagrad':
        assert lr_v is not None
        optimizer = tf.train.AdagradOptimizer(learning_rate=l_rate)
    elif optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
    else:
        raise Exception('optimiser error')

    self.train_steps = []
    print 'Computing gradients...'
    for idx, l in enumerate(self.losses):
        t2 = time()
        if clipping:
            # Clip by global norm to keep gradients below max_gradient_norm.
            gradients = tf.gradients(l, self.params)
            clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                             max_gradient_norm)
            train_step = optimizer.apply_gradients(zip(clipped_gradients,
                                                       self.params))
        else:
            train_step = optimizer.minimize(l)
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t2)
        self.train_steps.append(train_step)
    self.merged_summary = tf.summary.merge_all()
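# --- Hedged sketch (not part of the repo): consuming the summaries -----------
# tf.summary.merge_all() merges every summary in the graph, so running
# self.merged_summary would require feeding the placeholders of all buckets
# at once; merging only the current bucket's summaries avoids that. `model`,
# `sess`, and `batches` are assumptions for illustration.
merge_ops = [tf.summary.merge(s) for s in model.summaries]
writer = tf.summary.FileWriter('logs', sess.graph)
for step, (bucket_id, feeds) in enumerate(batches):
    _, summary = sess.run([model.train_steps[bucket_id], merge_ops[bucket_id]],
                          feed_dict=feeds)
    writer.add_summary(summary, step)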
def config(self, optimizer, decay, lr_v=None, momentum=None,
           clipping=False, max_gradient_norm=5.0):
    """
    :param optimizer: optimization algorithm, e.g. Adagrad
    :param decay: learning-rate decay rate, e.g. 0.05
    :param lr_v: initial learning rate, e.g. 0.1
    :param momentum: momentum for SGD (optional)
    :param clipping: whether to clip gradients by a maximum global norm
    :param max_gradient_norm: norm threshold used when clipping
    """
    self.decay = decay
    print 'Training preparation...'

    print 'Defining loss...'
    if self.crf > 0:
        for i in range(len(self.input_v)):
            # CRF tagging loss for bucket i from its outputs and ground truth.
            tagging_loss = losses.loss_wrapper(
                self.output[i], self.output_[i], losses.crf_loss,
                transitions=self.transition_char,
                nums_tags=self.nums_tags,
                batch_size=self.real_batches[i])
            tagging_loss_summary = tf.summary.scalar(
                'tagging_loss_%s' % i, tf.reduce_mean(tagging_loss))
            loss = tagging_loss
            loss_summary = [tagging_loss_summary]

            if self.co_train:
                # Auxiliary bidirectional language-model loss. Label id 0 is
                # padding, so tf.sign yields a 1/0 mask that zeroes out
                # padded positions.
                lm_loss = []
                masks = tf.reshape(
                    tf.cast(tf.sign(self.output_[i]), dtype=tf.float32),
                    shape=[-1, self.buckets_char[i]])
                for lm_fw_y, lm_fw_y_, lm_bw_y, lm_bw_y_ in zip(
                        self.lm_fw_predictions[i], self.lm_fw_groundtruthes[i],
                        self.lm_bw_predictions[i], self.lm_bw_groundtruthes[i]):
                    lm_fw_loss = tf.contrib.seq2seq.sequence_loss(
                        lm_fw_y, lm_fw_y_, masks)
                    lm_bw_loss = tf.contrib.seq2seq.sequence_loss(
                        lm_bw_y, lm_bw_y_, masks)
                    lm_loss.append(lm_fw_loss + lm_bw_loss)
                lm_loss = tf.stack(lm_loss)
                lm_loss_summary = tf.summary.scalar(
                    'language_model_loss_%s' % i, tf.reduce_mean(lm_loss))
                loss += self.lambda0 * lm_loss
                loss_summary.append(lm_loss_summary)

            if self.char_freq_loss:
                # Auxiliary character-frequency regression loss: MSE weighted
                # by the same padding mask.
                freq_loss = []
                masks = tf.cast(tf.sign(self.output_[i]), dtype=tf.float32)
                for freq_y, freq_y_ in zip(self.char_freq_predictions[i],
                                           self.char_freq_groundtruthes[i]):
                    freq_loss.append(tf.losses.mean_squared_error(
                        freq_y_, tf.reshape(freq_y, tf.shape(freq_y_)),
                        weights=tf.reshape(masks, tf.shape(freq_y_))))
                freq_loss = tf.stack(freq_loss)
                freq_loss_summary = tf.summary.scalar(
                    'char_freq_loss_%s' % i, tf.reduce_mean(freq_loss))
                loss += self.lambda1 * freq_loss
                loss_summary.append(freq_loss_summary)

            self.losses.append(loss)
            self.summaries.append(loss_summary)
    else:
        # todo
        loss_function = losses.sparse_cross_entropy
        for output, output_ in zip(self.output, self.output_):
            bucket_loss = losses.loss_wrapper(output, output_, loss_function)
            self.losses.append(bucket_loss)

    # The learning rate is fed at run time so it can be decayed between epochs.
    l_rate = tf.placeholder(tf.float32, [], name='learning_rate_holder')
    self.l_rate = l_rate

    if optimizer == 'sgd':
        if momentum is None:
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=l_rate)
        else:
            optimizer = tf.train.MomentumOptimizer(learning_rate=l_rate,
                                                   momentum=momentum)
    elif optimizer == 'adagrad':
        assert lr_v is not None
        optimizer = tf.train.AdagradOptimizer(learning_rate=l_rate)
    elif optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
    else:
        raise Exception('optimiser error')

    self.train_steps = []
    print 'Computing gradients...'
    for idx, l in enumerate(self.losses):
        t2 = time()
        if clipping:
            # Clip by global norm to keep gradients below max_gradient_norm.
            gradients = tf.gradients(l, self.params)
            clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                             max_gradient_norm)
            train_step = optimizer.apply_gradients(zip(clipped_gradients,
                                                       self.params))
        else:
            train_step = optimizer.minimize(l)
        print 'Bucket %d, %f seconds' % (idx + 1, time() - t2)
        self.train_steps.append(train_step)
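# --- Self-contained illustration of the masking used in the co_train branch --
# A padding mask built with tf.sign weights tf.contrib.seq2seq.sequence_loss
# so padded time steps contribute nothing. Toy shapes only; TF 1.x with
# contrib available. Label id 0 is assumed to be the padding symbol, matching
# the tf.sign(self.output_[i]) trick above.
import numpy as np
import tensorflow as tf

batch, steps, vocab = 2, 4, 5
logits = tf.constant(np.random.randn(batch, steps, vocab), dtype=tf.float32)
targets = tf.constant([[3, 1, 2, 0], [4, 2, 0, 0]], dtype=tf.int32)
masks = tf.cast(tf.sign(targets), tf.float32)  # 1.0 on tokens, 0.0 on padding
demo_loss = tf.contrib.seq2seq.sequence_loss(logits, targets, masks)
with tf.Session() as sess:
    print sess.run(demo_loss)  # scalar mean NLL over unpadded positions only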