# Module-level dependencies assumed to be imported elsewhere in this file:
#   import tensorflow as tf
#   from tqdm import tqdm
#   import config
#   import cnn_bilstm_load_data
#   sim_compute  # precision/recall/F1 helper; its source module is not shown here

def evaluate(self, data_sentences, data_chars, data_tags, data_labels,
             ignore_label=None, batch_size=64, simple_compute=True):
    """Evaluate the model on a labeled dataset.

    Args:
        data_sentences, data_chars, data_tags, data_labels: np.array
        ignore_label: int, index of the negative label, or None
        simple_compute: bool, whether to skip the detailed per-class metrics table

    Returns:
        p, r, f1
    """
    pre_labels = []
    # Number of batches, rounding up so the last partial batch is included.
    nb_dev = (len(data_labels) + batch_size - 1) // batch_size
    for i in tqdm(range(nb_dev)):
        sentences_feed = data_sentences[i * batch_size:(i + 1) * batch_size]
        tags_feed = data_tags[i * batch_size:(i + 1) * batch_size]
        labels_feed = data_labels[i * batch_size:(i + 1) * batch_size]
        feed_dict = {
            self.input_sentence_ph: sentences_feed,
            self.input_tag_ph: tags_feed,
            self.label_ph: labels_feed,
            # Disable all dropout at evaluation time.
            self.keep_prob_ph: 1.0,
            self.word_keep_prob_ph: 1.0,
            self.tag_keep_prob_ph: 1.0,
        }
        if config.use_chars:
            char_ids = data_chars[i * batch_size:(i + 1) * batch_size]
            char_feed, word_lengths_feed = cnn_bilstm_load_data.pad_sequences(
                char_ids, config.MAX_LEN, pad_tok=0, nlevels=2)
            feed_dict[self.input_char_ph] = char_feed
            feed_dict[self.word_lengths] = word_lengths_feed
        pre_temp = self.sess.run(self.pre_op, feed_dict=feed_dict)
        pre_labels += list(pre_temp)
    # Truncate the gold labels to the number of predictions actually produced.
    right_labels = data_labels[:len(pre_labels)]
    pre, rec, f = sim_compute(pre_labels, right_labels, ignore_label=ignore_label)
    return pre, rec, f
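# `evaluate` delegates its metric computation to `sim_compute`, whose
# implementation is not shown in this section. The sketch below is one
# plausible reading, assuming micro-averaged P/R/F1 with `ignore_label`
# treated as the negative class; the body is an assumption, not the
# project's actual code.
def sim_compute(pre_labels, right_labels, ignore_label=None):
    """Micro-averaged precision/recall/F1, treating `ignore_label` as negative."""
    tp = fp = fn = 0
    for pred, gold in zip(pre_labels, right_labels):
        if pred == ignore_label and gold == ignore_label:
            continue  # true negatives do not enter micro P/R/F1
        if pred == gold:
            tp += 1
        else:
            if pred != ignore_label:
                fp += 1  # predicted a positive class incorrectly
            if gold != ignore_label:
                fn += 1  # missed a positive gold label
    p = tp / float(tp + fp) if tp + fp else 0.0
    r = tp / float(tp + fn) if tp + fn else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f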
def predict(self, data_sentences, data_chars, data_tags, batch_size=50):
    """Predict labels for unlabeled data.

    Args:
        data_sentences, data_chars, data_tags: np.array
        batch_size: int

    Returns:
        pre_labels: list of predicted label indices
        pre_proba: list of per-class probability vectors
    """
    pre_labels = []
    pre_proba = []
    # Number of batches, rounding up so the last partial batch is included.
    nb_test = (len(data_sentences) + batch_size - 1) // batch_size
    for i in range(nb_test):
        sentences_feed = data_sentences[i * batch_size:(i + 1) * batch_size]
        tags_feed = data_tags[i * batch_size:(i + 1) * batch_size]
        feed_dict = {
            self.input_sentence_ph: sentences_feed,
            self.input_tag_ph: tags_feed,
            # Disable all dropout at inference time.
            self.keep_prob_ph: 1.0,
            self.word_keep_prob_ph: 1.0,
            self.tag_keep_prob_ph: 1.0,
        }
        if config.use_chars:
            char_ids = data_chars[i * batch_size:(i + 1) * batch_size]
            char_feed, word_lengths_feed = cnn_bilstm_load_data.pad_sequences(
                char_ids, config.MAX_LEN, pad_tok=0, nlevels=2)
            feed_dict[self.input_char_ph] = char_feed
            feed_dict[self.word_lengths] = word_lengths_feed
        pre_proba_tmp, pre_temp = self.sess.run(
            [self.proba_op, self.pre_op], feed_dict=feed_dict)
        pre_labels += list(pre_temp)
        pre_proba += list(pre_proba_tmp)
    return pre_labels, pre_proba
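# A possible calling pattern for `predict`, writing one CSV row per example.
# `model`, `label_voc_rev`, and the *_test arrays are hypothetical placeholder
# names, not defined in this module.
pre_labels, pre_proba = model.predict(sentences_test, chars_test, tags_test,
                                      batch_size=50)
for num, (label_id, proba) in enumerate(zip(pre_labels, pre_proba)):
    # Map the predicted index back to its label string and report the
    # model's confidence (the probability of the argmax class).
    print('%d,%s,%.4f' % (num + 1, label_voc_rev[label_id], max(proba)))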
def fit_all(self, sentences_train, chars_train, tags_train, labels_train,
            batch_size=64, nb_epoch=40, keep_prob=1.0,
            word_keep_prob=1.0, tag_keep_prob=1.0, seed=137):
    """Fit the model on the full training set (no held-out dev split).

    Args:
        sentences_train, chars_train, tags_train, labels_train: training data
        batch_size: int, batch size
        nb_epoch: int, number of training epochs
        keep_prob: float in [0, 1], dropout keep probability before the fully-connected layer
        word_keep_prob: float in [0, 1], dropout keep probability on the word-embedding layer
        tag_keep_prob: float in [0, 1], dropout keep probability on the tag-embedding layer

    Returns:
        nb_epoch_scores: list of [loss, p_train, f_train], one entry per epoch
    """
    self.saver = tf.train.Saver()
    self.nb_epoch_scores = []  # stores [loss, p, f] for each of the nb_epoch epochs
    n_total = len(labels_train)
    # Number of batches, rounding up so the last partial batch is included.
    nb_train = (n_total + batch_size - 1) // batch_size
    for step in range(nb_epoch):
        print('Epoch %d / %d:' % (step + 1, nb_epoch))
        total_loss = 0.
        for i in tqdm(range(nb_train)):
            if (i + 1) * batch_size >= n_total:
                # Last batch: wrap around to the start of the data so every
                # batch fed to the graph has exactly `batch_size` examples.
                # (List concatenation: the training inputs are Python lists.)
                sentences_feed = sentences_train[i * batch_size:] + sentences_train[0:(i + 1) * batch_size - n_total]
                tags_feed = tags_train[i * batch_size:] + tags_train[0:(i + 1) * batch_size - n_total]
                labels_feed = labels_train[i * batch_size:] + labels_train[0:(i + 1) * batch_size - n_total]
                char_ids = chars_train[i * batch_size:] + chars_train[0:(i + 1) * batch_size - n_total]
            else:
                sentences_feed = sentences_train[i * batch_size:(i + 1) * batch_size]
                tags_feed = tags_train[i * batch_size:(i + 1) * batch_size]
                labels_feed = labels_train[i * batch_size:(i + 1) * batch_size]
                char_ids = chars_train[i * batch_size:(i + 1) * batch_size]
            feed_dict = {
                self.input_sentence_ph: sentences_feed,
                self.input_tag_ph: tags_feed,
                self.label_ph: labels_feed,
                self.keep_prob_ph: keep_prob,
                self.word_keep_prob_ph: word_keep_prob,
                self.tag_keep_prob_ph: tag_keep_prob,
            }
            if config.use_chars:
                char_feed, word_lengths_feed = cnn_bilstm_load_data.pad_sequences(
                    char_ids, config.MAX_LEN, pad_tok=0, nlevels=2)
                feed_dict[self.input_char_ph] = char_feed
                feed_dict[self.word_lengths] = word_lengths_feed
            _, loss_value = self.sess.run(
                [self.train_op, self.loss], feed_dict=feed_dict)
            total_loss += loss_value
        total_loss /= float(nb_train)
        # Evaluate performance on the training set after each epoch.
        p_train, r_train, f_train = self.evaluate(
            sentences_train, chars_train, tags_train, labels_train,
            batch_size=batch_size)
        print('\tloss=%f, p_train=%f, r_train=%f, f_train=%f' %
              (total_loss, p_train, r_train, f_train))
        self.nb_epoch_scores.append([total_loss, p_train, f_train])
    self.saver.save(self.sess, config.TRAIN_ALL_MODEL)
    print('model saved to %s' % config.TRAIN_ALL_MODEL)
    return self.nb_epoch_scores
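# Since `fit_all` returns one [loss, p_train, f_train] triple per epoch, a
# caller can pick the best epoch by training-set F1. A minimal driver sketch;
# `CNNBiLSTM` and the pre-loaded training arrays are assumptions about the
# surrounding project, not code from this file.
model = CNNBiLSTM()
scores = model.fit_all(sentences_train, chars_train, tags_train, labels_train,
                       batch_size=64, nb_epoch=40,
                       keep_prob=0.5, word_keep_prob=0.8, tag_keep_prob=0.8)
# scores[e] == [total_loss, p_train, f_train] for epoch e.
best_epoch = max(range(len(scores)), key=lambda e: scores[e][2])
print('best epoch: %d (f_train=%f)' % (best_epoch + 1, scores[best_epoch][2]))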