def run_evaluate(self, sess, test, tags):
    """Evaluate token accuracy and chunk-level F1 on a labelled dataset.

    Args:
        sess: tensorflow session
        test: dataset that yields tuple of sentences, tags
        tags: {tag: index} dictionary

    Returns:
        (accuracy, f1) tuple
    """
    token_matches = []
    n_correct, n_gold, n_pred = 0., 0., 0.
    for words, labels in minibatches(test, self.config.batch_size):
        labels_pred, seq_lens = self.predict_batch(sess, words)
        for gold, pred, length in zip(labels, labels_pred, seq_lens):
            # Trim padding before scoring.
            gold, pred = gold[:length], pred[:length]
            token_matches.extend(g == p for g, p in zip(gold, pred))
            gold_chunks = set(get_chunks(gold, tags, self.config.DEFAULT))
            pred_chunks = set(get_chunks(pred, tags, self.config.DEFAULT))
            n_correct += len(gold_chunks & pred_chunks)
            n_pred += len(pred_chunks)
            n_gold += len(gold_chunks)
    # Guarded by n_correct > 0 so an empty/all-wrong run yields 0, not a
    # division error.
    precision = n_correct / n_pred if n_correct > 0 else 0
    recall = n_correct / n_gold if n_correct > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if n_correct > 0 else 0
    return np.mean(token_matches), f1
def run_epoch(self, sess, train, dev, tags, epoch):
    """Perform one complete pass over the train set and evaluate on dev.

    Args:
        sess: tensorflow session
        train: dataset that yields tuple of sentences, tags
        dev: dataset
        tags: {tag: index} dictionary
        epoch: (int) number of the epoch

    Returns:
        (dev accuracy, dev f1) tuple
    """
    batch_size = self.config.batch_size
    # Ceiling division: count the final partial batch too.
    nbatches = (len(train) + batch_size - 1) // batch_size
    prog = Progbar(target=nbatches)
    for i, (words, labels) in enumerate(minibatches(train, batch_size)):
        fd, _ = self.get_feed_dict(words, labels, self.config.LR,
                                   self.config.dropout)
        _, train_loss, summary = sess.run(
            [self.train_op, self.loss, self.merged], feed_dict=fd)
        prog.update(i + 1, [("train loss", train_loss)])
        # Log to tensorboard every 10 batches.
        if i % 10 == 0:
            self.file_writer.add_summary(summary, epoch * nbatches + i)
    acc, f1 = self.run_evaluate(sess, dev, tags)
    self.logger.info(
        "- dev acc {:04.2f} - f1 {:04.2f}".format(100 * acc, 100 * f1))
    return acc, f1
def train(model, training_data, optimizer):
    """Run one training pass over *training_data*, updating *model* per batch.

    Args:
        model: a model exposing train()/zero_grad()/neg_log_likelihood().
        training_data: sequence of (sentence, tags) pairs.
        optimizer: torch optimizer stepping the model's parameters.
    """
    model.train()
    num_batches = math.ceil(len(training_data) / args.batch_size)
    bar = utils.Progbar(target=num_batches)
    train_loss = 0.0
    train_total_instances = 0
    for batch_id, batch in enumerate(
            utils.minibatches(training_data, args.batch_size)):
        model.zero_grad()
        for sentence, tags in batch:
            sentence_in = processor.tensor(sentence)
            targets = processor.tensor(tags)
            loss = model.neg_log_likelihood(sentence_in, targets)
            loss.backward()
            # BUGFIX: accumulate the Python float, not the loss tensor —
            # summing tensors keeps every batch's autograd graph alive and
            # leaks memory over the epoch.
            train_loss += loss.item()
            train_total_instances += 1
        # One optimizer step per batch, over the accumulated gradients.
        optimizer.step()
        bar.update(batch_id + 1,
                   exact=[("train loss",
                           train_loss / train_total_instances)])
    if args.save_checkpoint:
        save_model(model)
def evaluate(model, eval_data, dataset_name):
    """Score *model* on *eval_data* and log per-character accuracy.

    Args:
        model: model returning (score, tag sequence) for a sentence tensor.
        eval_data: sequence of (sentence, tags) pairs.
        dataset_name: label used in the accuracy log line.
    """
    model.eval()
    num_batches = math.ceil(len(eval_data) / args.batch_size)
    bar = utils.Progbar(target=num_batches)
    total_score = 0.0
    n_instances = 0
    n_chars = 0
    n_correct = 0
    with torch.no_grad():
        for batch_id, batch in enumerate(
                utils.minibatches(eval_data, args.batch_size)):
            for sentence, tags in batch:
                score, tag_out = model(processor.tensor(sentence))
                if len(tag_out) != len(tags):
                    raise IndexError(
                        'Size of output tag sequence differs from that of reference.'
                    )
                length = len(tags)
                # NOTE(review): positions 0 and length-1 are excluded from
                # the accuracy count — presumably boundary tags; confirm.
                correct = sum(
                    1 for i in range(1, length - 1) if tag_out[i] == tags[i])
                total_score += score
                n_instances += 1
                n_chars += length
                n_correct += correct
            bar.update(batch_id + 1,
                       exact=[("eval score", total_score / n_instances)])
    logger.info('{} dataset accuracy: {}'.format(
        dataset_name, n_correct / n_chars))
def run(self, data): predicts = [] bar = ProgressBar(max_value=len(data) // 1024 + 1) for batch_data in bar(utils.minibatches(data, 1024, False)): predict = self.model.eval_step(self.sess, batch_data) predicts.extend(predict) print 'The model is finished!' return predicts
def test(): x, y = utils.read_file(is_train=True, label_list=['人类作者', '自动摘要']) x = utils.process(x) x = utils.truncation(x) word2id, id2word, tag2id, id2tag = utils.build_vocab(x, y, min_df=10) x = utils.build_x_ids(x, word2id) y = utils.build_y_ids(y, tag2id) data = zip(x, y) train_data, dev_data = train_test_split(data, test_size=10000, random_state=24) vocab_size = len(word2id) emb_dim = 100 num_classes = len(tag2id) print "训练集数据大小:%d 验证集数据大小:%d" % (len(train_data), len(dev_data)) print "vocab_size:%d num_classes:%d" % (vocab_size, num_classes) print FLAGS.model_name model_dir = os.path.join('temp', 'nn') if not os.path.exists(model_dir): os.mkdir(model_dir) with tf.Session() as sess: model = getattr(models, FLAGS.model_name)(vocab_size, emb_dim, num_classes) saver = tf.train.Saver(tf.global_variables()) model_file = os.path.join('temp', 'nn', FLAGS.model_file) saver.restore(sess, model_file) print "Restore model from %s" % model_file dev_loss = [] labels = [] predicts = [] bar = ProgressBar(max_value=len(dev_data) // FLAGS.batch_size + 1) for batch_data in bar( utils.minibatches(dev_data, FLAGS.batch_size, True, shuffle=False)): loss, predict = model.dev_step(sess, batch_data) dev_loss.append(loss) labels.extend(batch_data[1]) predicts.extend(predict) dev_loss = np.mean(dev_loss) dev_f1 = utils.score_all(labels, predicts, tag2id) utils.error_print(predicts, labels, id2tag, zip(*dev_data)[0], id2word) print "loss:%.3f f1:%.3f" % (dev_loss, dev_f1)
def run(self, sess, train, dev, epoch):
    """One training pass over *train*, then evaluation on *dev*.

    Args:
        sess: tensorflow session
        train: training dataset yielding (word_ids, labels) tuples
        dev: evaluation dataset
        epoch: (int) current epoch number (unused here; kept for callers)

    Returns:
        (dev accuracy, dev F0.5) tuple
    """
    # Removed an unused `nbatches` local: it was computed with true
    # division `/` (a float under Python 3) and never read afterwards.
    for i, (word_ids, labels) in enumerate(
            minibatches(train, self.config.batch_size)):
        feed, sequence_lengths = self.get_feed(
            word_ids=word_ids,
            batch_id=i * self.config.batch_size,
            labels=labels,
            lr=self.config.lr,
            dropout=self.config.dropout,
            training=True)
        _, train_loss = sess.run([self.train_, self.loss], feed_dict=feed)
        self.logger.info("Train loss: %f" % train_loss)
    acc, f05 = self.performance_eval(sess, dev, is_dev=True)
    self.logger.info("dev accuracy: %f, f05: %f" % (acc, f05))
    return acc, f05
def run_infer(self, sess, test, tags):
    """Run inference on *test*, writing restored predictions to a file.

    Args:
        sess: tensorflow session
        test: dataset that yields tuple of sentences, tags
        tags: {tag: index} dictionary

    Returns:
        (accuracy, f1) tuple — token accuracy and chunk-level F1
    """
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    # BUGFIX: use a context manager so the output file is closed even if
    # prediction raises; the original leaked the handle on error.
    with open(self.config.infer_filename, 'w',
              encoding="utf-8-sig") as infer_res:
        for words, labels in minibatches(test, self.config.batch_size):
            # Deep copy taken before prediction — presumably predict_batch
            # mutates its input; TODO confirm.
            words_copy = copy.deepcopy(words)
            labels_pred, sequence_lengths = self.predict_batch(sess, words)
            if self.config.chars:
                # Each entry is a (char_ids, word_ids) pair; keep word ids.
                _, words_res = zip(*words_copy)
            else:
                words_res = words_copy
            for word_res, lab, lab_pred, length in zip(
                    words_res, labels, labels_pred, sequence_lengths):
                lab = lab[:length]
                lab_pred = lab_pred[:length]
                infer_res.write(self.idx_restore(word_res, lab, lab_pred))
                accs += [a == b for (a, b) in zip(lab, lab_pred)]
                lab_chunks = set(get_chunks(lab, tags, self.config.DEFAULT))
                lab_pred_chunks = set(
                    get_chunks(lab_pred, tags, self.config.DEFAULT))
                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    acc = np.mean(accs)
    return acc, f1
def performance_eval(self, sess, test, is_dev=True):
    """Compute token accuracy and F0.5 for the configured main label.

    Args:
        sess: tensorflow session
        test: dataset yielding (word_ids, labels) tuples
        is_dev: forwarded to predict_batch (dev vs. test mode)

    Returns:
        (accuracy, f0.5) tuple
    """
    accs, all_labels, all_labels_pred = [], [], []
    main_predicted_count, main_total_count, main_correct_count = 0., 0., 0.
    for i, (word_ids, labels) in enumerate(
            minibatches(test, self.config.batch_size)):
        labels_pred, sequence_lengths = self.predict_batch(
            sess, word_ids, i * self.config.batch_size,
            is_dev=is_dev, is_training=False)
        all_labels_pred.append(labels_pred)
        all_labels.append(labels)
        for lab, lab_pred, length in zip(labels, labels_pred,
                                         sequence_lengths):
            lab = lab[:length]
            lab_pred = lab_pred[:length]
            # FIX: the original used Python-2-only tuple-parameter lambdas
            # (`lambda (a, b): ...`, removed by PEP 3113). Comprehensions
            # below are behaviorally identical and work on Python 2 and 3.
            accs += [a == b for a, b in zip(lab, lab_pred)]
            main_predicted_count += sum(
                a == self.config.main_label for a in lab_pred)
            main_total_count += sum(
                a == self.config.main_label for a in lab)
            main_correct_count += sum(
                (a == self.config.main_label) * (b == self.config.main_label)
                for a, b in zip(lab_pred, lab))
    acc = np.mean(accs)
    p = (float(main_correct_count) / float(main_predicted_count)) if (
        main_predicted_count > 0) else 0.0
    r = (float(main_correct_count) /
         float(main_total_count)) if (main_total_count > 0) else 0.0
    # Removed an unused local `f` (plain F1) — only F0.5 is returned.
    f05 = ((1 + 0.5 * 0.5) * p * r /
           ((0.5 * 0.5 * p) + r)) if (p + r > 0.0) else 0.0
    return acc, f05
def cost(self, X, minibatch_size=20):
    """Estimate the mean model cost over *X*, minibatch by minibatch.

    Args:
        X: data matrix, one sample per row.
        minibatch_size: rows per minibatch.

    Returns:
        [cost0, cost1] during phase 0, a scalar cost otherwise.
    """
    if self.phase == 0:
        val = [0, 0]
    else:
        val = 0
    data_size = X.shape[0]
    for Xb in utils.minibatches(minibatch_size, X, shuffle_f=False):
        # Fresh Gaussian noise per minibatch for the stochastic estimate.
        eb = np.asarray(np.random.randn(Xb.shape[0], self.n_hidden),
                        dtype=theano.config.floatX)
        zb = np.asarray(np.random.randn(Xb.shape[0], self.n_hidden),
                        dtype=theano.config.floatX)
        if self.phase == 0:
            c = self.early_cost_func(Xb, eb, zb)
            # BUGFIX: the second component previously weighted by
            # Xb.shape[1] (the feature count); both components must be
            # weighted by the minibatch's row count, Xb.shape[0].
            val[0] += c[0] * float(Xb.shape[0]) \
                / float(data_size)
            val[1] += c[1] * float(Xb.shape[0]) \
                / float(data_size)
        else:
            val += self.final_cost_func(
                Xb, eb, zb
            ) * float(Xb.shape[0]) \
                / float(data_size)
    return val
def cost(self, X, minibatch_size=20):
    """Return the minibatch-averaged cost of the model on *X*.

    Args:
        X: data matrix with one sample per row.
        minibatch_size: number of rows per minibatch.

    Returns:
        A two-element list [c0, c1] in phase 0, otherwise a scalar.
    """
    val = [0, 0] if self.phase == 0 else 0
    data_size = X.shape[0]
    for Xb in utils.minibatches(minibatch_size, X, shuffle_f=False):
        # Per-minibatch Gaussian noise samples.
        eb = np.asarray(np.random.randn(Xb.shape[0], self.n_hidden),
                        dtype=theano.config.floatX)
        zb = np.asarray(np.random.randn(Xb.shape[0], self.n_hidden),
                        dtype=theano.config.floatX)
        # Weight every partial cost by this minibatch's share of the data.
        weight = float(Xb.shape[0]) / float(data_size)
        if self.phase == 0:
            c = self.early_cost_func(Xb, eb, zb)
            # BUGFIX: val[1] previously used Xb.shape[1] (feature count)
            # as its weight — both terms must use the row count.
            val[0] += c[0] * weight
            val[1] += c[1] * weight
        else:
            val += self.final_cost_func(Xb, eb, zb) * weight
    return val
# Epoch-based training loop over `training_instances`, evaluated per epoch.
logger.info("Number dev instances: {}".format(len(dev_instances)))
training_total_tokens = 0  # initialized here; not read within this chunk
best_f1 = 0.
for epoch in range(int(options.num_epochs)):
    logger.info("Epoch {} out of {}".format(epoch + 1, options.num_epochs))
    # Reshuffle the training set at the start of every epoch.
    random.shuffle(training_instances)
    train_loss = 0.0
    train_total_instance = 0  # size of trained instances
    if options.dropout > 0:
        model.set_dropout(options.dropout)
    # Ceiling division: number of minibatches in this epoch.
    nbatches = (len(training_instances) + options.batch_size -
                1) // options.batch_size
    bar = utils.Progbar(target=nbatches)
    for batch_id, batch in enumerate(
            utils.minibatches(training_instances, options.batch_size)):
        for idx, instance in enumerate(batch):
            if len(instance.sentence) == 0:
                continue  # skip empty sentences entirely
            train_total_instance += 1
            # NOTE(review): neg_log_loss presumably builds a lazy
            # (DyNet-style) loss expression; scalar_value() forces the
            # forward pass — confirm against the model implementation.
            loss_expr = model.neg_log_loss(instance.sentence, instance.tags)
            # Forward pass
            loss = loss_expr.scalar_value()
            # Do backward pass
            loss_expr.backward()
            # Bail if loss is NaN
            if math.isnan(loss):
                assert False, "NaN occured"
            train_loss += loss
            # NOTE(review): this chunk appears truncated here — the
            # optimizer update, dev evaluation and best_f1 tracking
            # continue outside this view.
def train(): x, y = utils.load_data(True, True) word2id, id2word, tag2id, id2tag = utils.build_vocab(x, y, min_df=20) x = utils.build_x_ids(x, word2id) y = utils.build_y_ids(y, tag2id) data = zip(x, y) train_data, dev_data = train_test_split(data, test_size=10000, random_state=24) #pre_embeddings=utils.load_embeddings(word2id) vocab_size = len(word2id) emb_dim = 100 num_classes = len(tag2id) print "训练集数据大小:%d 验证集数据大小:%d" % (len(train_data), len(dev_data)) print "vocab_size:%d num_classes:%d" % (vocab_size, num_classes) print FLAGS.model_name model_dir = os.path.join('temp', 'nn') if not os.path.exists(model_dir): os.mkdir(model_dir) with tf.Session() as sess: model = getattr(models, FLAGS.model_name)(vocab_size, emb_dim, num_classes) saver = tf.train.Saver(tf.global_variables()) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) # sess.run(model.embeddings.assign(pre_embeddings)) print "Train start!" best_dev_f1 = 0 best_dev_epoch = 0 no_improve = 0 for epoch in range(FLAGS.max_epoch): bar = ProgressBar(max_value=len(train_data) // FLAGS.batch_size + 1) train_loss = [] labels = [] predicts = [] for batch_data in bar( utils.minibatches(train_data, FLAGS.batch_size, True)): loss, predict = model.train_step(sess, batch_data) train_loss.append(loss) labels.extend(batch_data[1]) predicts.extend(predict) train_loss = np.mean(train_loss) train_f1 = utils.score_all(labels, predicts, tag2id) print "Train epoch %d finished. loss:%.3f f1:%.3f" % ( epoch, train_loss, train_f1) dev_loss = [] labels = [] predicts = [] bar = ProgressBar(max_value=len(train_data) // FLAGS.batch_size + 1) for batch_data in bar( utils.minibatches(train_data, FLAGS.batch_size, True)): loss, predict = model.dev_step(sess, batch_data) dev_loss.append(loss) labels.extend(batch_data[1]) predicts.extend(predict) dev_loss = np.mean(dev_loss) dev_f1 = utils.score_all(labels, predicts, tag2id) print "Train epoch %d finished. 
loss:%.3f f1:%.3f" % ( epoch, dev_loss, dev_f1) dev_loss = [] labels = [] predicts = [] for batch_data in utils.minibatches(dev_data, FLAGS.batch_size, True): loss, predict = model.dev_step(sess, batch_data) dev_loss.append(loss) labels.extend(batch_data[1]) predicts.extend(predict) dev_loss = np.mean(dev_loss) dev_f1 = utils.score_all(labels, predicts, tag2id) print "Dev epoch %d finished. loss:%.3f f1:%.3f" % ( epoch, dev_loss, dev_f1) if dev_f1 > best_dev_f1: best_dev_f1 = dev_f1 best_dev_epoch = epoch no_improve = 0 saver.save(sess, os.path.join(model_dir, FLAGS.model_file)) print '保存模型!' else: no_improve += 1 if no_improve >= 5: print "停止训练!" break print print "Best epoch %d best f1: %.3f" % (best_dev_epoch, best_dev_f1)
# Epoch-based training loop; tracks the best dev F1 across epochs.
best_f1 = 0.
for epoch in range(int(options.num_epochs)):
    logger.info("Epoch {} out of {}".format(epoch + 1, options.num_epochs))
    # Reshuffle the training set each epoch.
    random.shuffle(training_instances)
    train_loss = 0.0
    train_total_instance = 0  # size of trained instances
    if options.dropout > 0:
        model.set_dropout(options.dropout)
    # Ceiling division: minibatches per epoch.
    nbatches = (len(training_instances) + options.batch_size -
                1) // options.batch_size
    bar = utils.Progbar(target=nbatches)
    for batch_id, batch in enumerate(
            utils.minibatches(training_instances, options.batch_size)):
        for idx, instance in enumerate(batch):
            if len(instance.sentence) == 0:
                continue  # skip empty sentences
            train_total_instance += 1
            # NOTE(review): presumably a lazy loss expression whose
            # scalar_value() triggers the forward pass — confirm.
            loss_expr = model.neg_log_loss(instance.sentence, instance.tags)
            # Forward pass
            loss = loss_expr.scalar_value()
            # Do backward pass
            loss_expr.backward()
            # Bail if loss is NaN
            if math.isnan(loss):
                assert False, "NaN occured"
            # NOTE(review): chunk truncated here — loss accumulation and
            # the rest of the epoch continue outside this view.
def train(): source_data, target_data, test_data, word2id = utils.load_data() embeddings = utils.load_embeddings(word2id) random.seed(1) random.shuffle(target_data) cv_losses = [] for k in range(1, 11): train_data, dev_data = utils.train_dev_split(target_data, k) model_file = FLAGS.model_file + str(k) print model_file print "训练集1数据大小:%d" % len(source_data) print "训练集2数据大小:%d" % len(train_data) print "验证集数据大小:%d" % len(dev_data) print "embedding大小:(%d,%d)" % (embeddings.shape[0], embeddings.shape[1]) model_dir = '../model' graph = tf.Graph() sess = tf.Session(graph=graph) with graph.as_default(): model = getattr(models, FLAGS.model_name)(embeddings) saver = tf.train.Saver(tf.global_variables()) if FLAGS.restore == 1: saver.restore(sess, os.path.join(model_dir, FLAGS.model_file)) print "Restore from pre-trained model" else: sess.run(tf.global_variables_initializer()) print "Train start!" best_loss = 1e6 best_epoch = 0 not_improved = 0 for epoch in range(FLAGS.max_epoch): print epoch, "================================================" train_loss = [] ground_trues = [] predicts = [] for batch_data in utils.minibatches2(source_data, train_data, FLAGS.batch_size, ratio=1, mode='train'): loss, predict = model.train_step(sess, batch_data[:3], batch_data[3]) train_loss.extend(loss) predicts.extend(predict) ground_trues.extend(batch_data[2]) train_loss = utils.loss(ground_trues, train_loss) p, r, f1 = utils.score(ground_trues, predicts) print "%d-fold Train epoch %d finished. loss:%.4f p:%.4f r:%.4f f1:%.4f" % ( k, epoch, train_loss, p, r, f1) valid_loss = [] ground_trues = [] predicts = [] for batch_data in utils.minibatches(dev_data, FLAGS.batch_size, mode='dev'): loss, predict = model.valid_step(sess, batch_data, 2) valid_loss.extend(loss) predicts.extend(predict) ground_trues.extend(batch_data[2]) valid_loss = utils.loss(ground_trues, valid_loss) p, r, f1 = utils.score(ground_trues, predicts) print "%d-fold,Valid epoch %d finished. 
loss:%.4f p:%.4f r:%.4f f1:%.4f" % ( k, epoch, valid_loss, p, r, f1) if valid_loss < best_loss: best_loss = valid_loss best_epoch = epoch not_improved = 0 print "save model!" saver.save(sess, os.path.join(model_dir, model_file)) else: not_improved += 1 if not_improved > 4: print "停止训练!" break print print "Best epoch %d best loss %.4f" % (best_epoch, best_loss) print "#########################################################" cv_losses.append(best_loss) print "final cv loss: %.4f" % (sum(cv_losses) / len(cv_losses))
def run(self, test_data):
    """Run batched inference over *test_data* and return the flat prediction list."""
    collected = []
    for batch in utils.minibatches(test_data, 128, mode='test'):
        collected.extend(self.model.infer_step(self.sess, batch, 2))
    return collected
data_train, labels_train, _, _, _, _ = load_mnist_data( ) # the data is 55k samples widgets = [ 'Training: ', Percentage(), ' ', AnimatedMarker(markers='←↖↑↗→↘↓↙'), ' ', ETA() ] pbar = ProgressBar(widgets=widgets, maxval=n_epochs * data_train.shape[0] // 32).start() i = 0 with tf.Session() as sess: sess.run(tf.initialize_all_variables()) for epoch in range(n_epochs): batches = minibatches(data_train, labels_train, batch_size=32) for data, _ in batches: pbar.update(i) i += 1 random_vectors = urand_vector(shape=(32, 100)) sess.run(train_gen, feed_dict={ inputs: random_vectors, real_images: np.expand_dims(data, 3) }) sess.run(train_dist, feed_dict={ inputs: random_vectors, real_images: np.expand_dims(data, 3) }) plt.imshow(conv3_out.eval()[0, :, :, 0])
def run_evaluate(self, sess, test, tags):
    """Evaluate the two-stage (IOB boundary + mention type) tagger.

    Args:
        sess: tensorflow session
        test: dataset that yields tuple of sentences, tags
        tags: {tag: index} dictionary

    Returns:
        (accuracy, f1) tuple; also writes gold/predicted tag pairs to
        the "output" file.
    """
    accs = []
    correct_preds, total_correct, total_preds = 0., 0., 0.
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    # BUGFIX: context manager — the original never closed output_file.
    with codecs.open("output", 'w', 'UTF-8') as output_file:
        for words, labels, iob_gold, mention_type_gold, mentions_gold, \
                word_features, char_features in minibatches(
                    test, self.config.batch_size):
            iob_labels_pred, sequence_lengths = self.predict_iob_batch(
                sess, words, word_features, char_features)
            # BUGFIX: iterate the actual batch length — the final minibatch
            # may be smaller than config.batch_size; the original indexed
            # sequence_lengths past its end in that case.
            batch_len = len(sequence_lengths)
            mentions = []
            mention_sizes = []
            empty_count = 0
            for i in range(batch_len):
                length = sequence_lengths[i]
                mention = find_mentions(iob_labels_pred[i][:length])
                mentions.append(mention)
                mention_sizes.append(len(mention))
                if len(mention) == 0:
                    empty_count += 1
            # Run the type classifier only if at least one sentence in the
            # batch produced a candidate mention.
            if empty_count != batch_len:
                mentions_pred, _ = self.predict_type_batch(
                    sess, words, word_features, char_features, mentions)
            else:
                mentions_pred = [[]] * batch_len
            for lab, iob_pred, length, mention, mention_pred, mention_size \
                    in zip(labels, iob_labels_pred, sequence_lengths,
                           mentions, mentions_pred, mention_sizes):
                lab = lab[:length]
                iob_pred = iob_pred[:length]
                mention_pred = mention_pred[:mention_size]
                # Merge boundary and type predictions into full labels.
                lab_pred = find_labels(iob_pred, mention_pred, tags,
                                       self.id2type)
                accs += [a == b for (a, b) in zip(lab, lab_pred)]
                lab_chunks = set(get_chunks(lab, tags))
                lab_pred_chunks = set(get_chunks(lab_pred, tags))
                correct_preds += len(lab_chunks & lab_pred_chunks)
                total_preds += len(lab_pred_chunks)
                total_correct += len(lab_chunks)
                # One "gold pred" line per token, blank line per sentence.
                output_string = ""
                for b, c in zip(lab, lab_pred):
                    output_string += ' '.join(
                        [idx_to_tag[b], idx_to_tag[c]]) + '\n'
                output_file.write(output_string + '\n')
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    acc = np.mean(accs)
    return acc, f1
def run_epoch(self, sess, train, dev, tags, epoch): """ Performs one complete pass over the train set and evaluate on dev Args: sess: tensorflow session train: dataset that yields tuple of sentences, tags dev: dataset tags: {tag: index} dictionary epoch: (int) number of the epoch """ nbatches = (len(train) + self.config.batch_size - 1) // self.config.batch_size total_loss = 0.0 count = 0 for i, (words, labels, iob, mention_type, mentions, word_features, char_features) in enumerate(minibatches(train, self.config.batch_size)): if len(mentions[0]) == 0: fd, _, _ = self.get_feed_dict(words, word_features, char_features, self.config.lr, self.config.dropout, iob) logits, _, train_loss= sess.run([self.boundry_logits, self.train_op_boundry, self.loss_a], feed_dict=fd) else: fd, _, _ = self.get_feed_dict(words, word_features, char_features, self.config.lr, self.config.dropout, iob, mention_type, mentions) logits, _, a, b, train_loss= sess.run([self.boundry_logits, self.train_op, self.loss_a, self.loss_b, self.loss], feed_dict=fd) total_loss += train_loss count += 1 print total_loss/count acc, f1 = self.run_evaluate(sess, dev, tags) self.logger.info("- dev acc {:04.2f} - f1 {:04.2f}".format(100*acc, 100*f1)) return acc, f1
validation_acc = [] ############################################################################ # Train the net # ############################################################################ widgets = [ 'Training: ', Percentage(), ' ', AnimatedMarker(markers='←↖↑↗→↘↓↙'), ' ', ETA() ] pbar = ProgressBar(widgets=widgets, maxval=n_epochs).start() d_train, l_train, d_test, l_test, d_val, l_val = load_svhn_data( normalize=True) batches = minibatches(d_train, l_train, batch_size=batch_size) training_step_accuracy = [] val_accuracy = [] save_file = "./exercise5.ckpt" plt.ion() plt.gca().set_ylim([0, 1]) plt.gca().set_xlim([0, n_epochs / 30]) with tf.Session() as sess: saver = tf.train.Saver() if os.path.exists(save_file): saver.restore(sess, save_file) else: sess.run(tf.initialize_all_variables()) for i in range(n_epochs): pbar.update(i)