def evaluate(self, z_data, z_labels, test_data=False):
    """Evaluate the model on a labelled dataset.

    Args:
        z_data: Input samples; batched with ``batch_iter`` using
            ``self.model_config.batch_size``.
        z_labels: Integer class labels aligned with ``z_data``.
        test_data: When true, additionally return a classification report
            and a confusion matrix.

    Returns:
        ``(accuracy, mean_loss)``, or ``(accuracy, mean_loss, report,
        confusion)`` when ``test_data`` is true. ``mean_loss`` is the sum of
        the per-batch cross-entropy losses divided by the number of batches.
    """
    self.model.eval()
    loss_total = 0
    batch_count = 0
    # Per-batch results are collected in lists and joined once at the end;
    # growing an array with np.append re-copies all earlier data every batch.
    label_chunks = []
    predict_chunks = []
    with torch.no_grad():
        for data, labels in batch_iter(z_data, z_labels,
                                       self.model_config.batch_size):
            batch_input = torch.LongTensor(data).to(self.device)
            outputs = self.model(batch_input)
            # Build the label tensor once: the CPU copy feeds the metrics,
            # the device copy feeds the loss.
            batch_label = torch.LongTensor(labels)
            loss_total += F.cross_entropy(outputs,
                                          batch_label.to(self.device))
            label_chunks.append(batch_label.numpy())
            # Index of the highest score along the class axis.
            predict_chunks.append(torch.max(outputs, 1)[1].cpu().numpy())
            batch_count += 1
    # Seeding with an empty int array keeps the result well-defined (and
    # integer typed) even when no batch was produced.
    empty = np.array([], dtype=int)
    labels_all = np.concatenate([empty] + label_chunks)
    predict_all = np.concatenate([empty] + predict_chunks)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test_data:
        report = metrics.classification_report(labels_all, predict_all)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / batch_count, report, confusion
    return acc, loss_total / batch_count
def train(self, sess, x, y, seq_len, batch_size, epoch_size):
    """Run ``epoch_size`` training epochs and checkpoint the session.

    Args:
        sess: Session used to run ``self.train_op``.
        x, y, seq_len: Training inputs, labels and sequence lengths, batched
            together by ``batch_iter`` (shuffled).
        batch_size: Number of samples per batch.
        epoch_size: Number of epochs to run.
    """
    lr = 1e-3
    fetches = [self.train_op, self.cost, self.acc_cnt]
    for epoch in range(epoch_size):
        epoch_no = epoch + 1
        cost_sum = 0.0
        correct = 0
        step = 0
        batches = batch_iter(x, y, seq_len, batch_size, shuffle=True)
        for step, (batch_x, batch_y, batch_len) in enumerate(batches, 1):
            feed = {
                self.in_x: batch_x,
                self.in_y: batch_y,
                self.in_len: batch_len,
                self.learning_rate: lr,
            }
            _, cost_val, acc_cnt = sess.run(fetches, feed_dict=feed)
            correct += acc_cnt
            cost_sum += cost_val
            if not step % 30:
                print('batch_%d cost_val: %0.5f' % (step, cost_val))
        print('Epoch:', '%02d' % epoch_no,
              'cost_avg =', '%0.5f' % (cost_sum / step),
              'acc: %0.5f' % (correct / (0.0 + len(x))))
        # The rate is divided by 10, 20 and 30 after epochs 1, 2 and 3 and
        # then held constant.
        if epoch_no < 4:
            lr /= (10 * epoch + 10)
        # NOTE(review): assumes one checkpoint per epoch - confirm.
        self.saver.save(sess, config.save_dir + '/rcnn_saver.ckpt',
                        global_step=epoch_no)
def predict(self, sess, x, y, seq_len):
    """Restore the latest checkpoint and return class probabilities.

    Args:
        sess: Session the checkpoint is restored into.
        x, y, seq_len: Inputs, labels and sequence lengths, consumed in
            batches of 100 via ``batch_iter``.

    Returns:
        ``np.array`` built from the list of per-batch ``self.y_prob``
        outputs (one entry per batch).

    Raises:
        RuntimeError: If no checkpoint is found under ``config.save_dir``.
    """
    ckpt = tf.train.get_checkpoint_state(config.save_dir)
    if not (ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)):
        raise RuntimeError('no rcnn model ...')
    self.saver.restore(sess, ckpt.model_checkpoint_path)

    per_batch = [
        sess.run(self.y_prob,
                 feed_dict={
                     self.in_x: batch_x,
                     self.in_y: batch_y,
                     self.in_len: batch_len,
                 })
        for batch_x, batch_y, batch_len in batch_iter(x, y, seq_len, 100)
    ]
    return np.array(per_batch)
def test(self, sess, x, y, seq_len):
    """Restore the latest checkpoint and print accuracy over ``x``.

    Args:
        sess: Session the checkpoint is restored into.
        x, y, seq_len: Inputs, labels and sequence lengths, consumed in
            batches of 100 via ``batch_iter``.

    Raises:
        RuntimeError: If no checkpoint is found under ``config.save_dir``.
    """
    ckpt = tf.train.get_checkpoint_state(config.save_dir)
    if not (ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)):
        raise RuntimeError('no rcnn model ...')
    self.saver.restore(sess, ckpt.model_checkpoint_path)

    # self.acc_cnt is summed over batches and divided by the sample count.
    correct = sum(
        sess.run(self.acc_cnt,
                 feed_dict={
                     self.in_x: batch_x,
                     self.in_y: batch_y,
                     self.in_len: batch_len,
                 })
        for batch_x, batch_y, batch_len in batch_iter(x, y, seq_len, 100))
    accuracy = correct / (0.0 + len(x))
    print('test acc: %0.5f' % accuracy)
def evaluate(sess, x_, y_):
    """Return the sample-weighted mean loss and accuracy over a dataset.

    The data is processed in batches of 128 with a keep probability of 1.0
    passed to ``feed_data``. Each batch's loss and accuracy are weighted by
    the batch length so a short final batch does not skew the averages.

    Args:
        sess: Session used to run ``model.loss`` and ``model.acc``.
        x_: Input samples.
        y_: Labels aligned with ``x_``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``.
    """
    sample_count = len(x_)
    loss_sum = 0.0
    acc_sum = 0.0
    for x_batch, y_batch in batch_iter(x_, y_, 128):
        weight = len(x_batch)
        batch_loss, batch_acc = sess.run(
            [model.loss, model.acc],
            feed_dict=feed_data(x_batch, y_batch, 1.0))
        loss_sum += batch_loss * weight
        acc_sum += batch_acc * weight
    return loss_sum / sample_count, acc_sum / sample_count
def eval(model, x1, x2, y, batch_size, verbose=False):
    """Evaluate a sentence-pair model with binary cross-entropy.

    Args:
        model: Module called as ``model(x1_batch, x2_batch)`` and returning
            ``(probabilities, predicted_classes)``.
        x1, x2: Parallel sequences of token-id lists for the two inputs.
        y: Binary labels aligned with ``x1``/``x2``.
        batch_size: Number of pairs per batch.
        verbose: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of correct predictions over ``len(x1)``.
    """
    model.eval()
    loss_func = nn.BCELoss()
    use_cuda = torch.cuda.is_available()
    corrects = 0
    running_losses = []
    # No gradients are needed for evaluation; skipping autograd bookkeeping
    # saves memory and time without changing the results.
    with torch.no_grad():
        for batch in data.batch_iter(list(zip(x1, x2, y)), batch_size):
            x1_batch, x2_batch, y_batch = zip(*batch)
            x1_batch = torch.LongTensor(x1_batch)
            x2_batch = torch.LongTensor(x2_batch)
            y_batch = torch.FloatTensor(y_batch)
            if use_cuda:
                x1_batch = x1_batch.cuda()
                x2_batch = x2_batch.cuda()
                y_batch = y_batch.cuda()
            preds, y_preds = model(x1_batch, x2_batch)
            running_losses.append(loss_func(preds, y_batch).item())
            y_truth = y_batch.byte()
            y_preds = torch.squeeze(y_preds, -1)
            # Accumulate as a Python number rather than a (device) tensor.
            corrects += y_preds.eq(y_truth).sum().item()
    avg_loss = sum(running_losses) / len(running_losses)
    accuracy = float(corrects) / float(len(x1))
    return avg_loss, accuracy
def train():
    """Train the module-level ``model`` with periodic validation.

    Writes TensorBoard summaries under ``./tensorboard/<date_time>``, saves
    the model to ``save_path`` whenever validation accuracy improves, and
    stops early once more than ``require_improvement`` batches have passed
    without an improvement.
    """
    print("Configuring TensorBoard and Saver...")
    tensorboard_dir = "./tensorboard"
    tensorboard_dir = os.path.join(tensorboard_dir, date_time)
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    print("Loading training and validation data...")
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word2id, cat2id, config.seq_length)
    x_val, y_val = process_file(val_dir, word2id, cat2id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage: ", time_dif)
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)
    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # number of optimisation steps taken so far
    best_acc_val = 0.0  # best validation accuracy seen so far
    last_improved = 0  # value of total_batch at the last improvement
    require_improvement = 1000  # early-stopping patience, in batches
    flag = False  # set when early stopping triggers, to leave both loops
    for epoch in range(config.num_epochs):
        print('EPOCH', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)
            if total_batch % config.save_per_batch == 0:
                # Record the merged loss/accuracy summaries for TensorBoard.
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)
            if total_batch % config.print_per_batch == 0:
                # Report training metrics with keep_prob set to 1.0.
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)
                if acc_val > best_acc_val:
                    # Keep only the best model seen so far.
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                    + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(
                    msg.format(total_batch, loss_train, acc_train, loss_val, acc_val,
                               time_dif, improved_str))
            # Restore the training keep_prob (it may have been set to 1.0
            # above) before running the optimisation step.
            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)
            total_batch += 1
            if total_batch - last_improved > require_improvement:
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    session.close()
valid_x, valid_y = build_dataset("valid", word_dict, article_max_len, summary_max_len) valid_x_len = list(map(lambda x: len([y for y in x if y != 0]), valid_x)) with tf.Session() as sess: print("Loading saved model...") model = Summodel(reversed_dict, article_max_len, summary_max_len, args, Forward_only=True) saver = tf.train.Saver(tf.global_variables()) ckpt = tf.train.get_checkpoint_state("./saved_model/") saver.restore(sess, ckpt.model_checkpoint_path) batches = batch_iter(valid_x, valid_y, args.batch_size, 1) print("Writing summaries to 'train/result.txt'...") for batch_x, batch_y in batches: batch_x_len = list( map(lambda x: len([y for y in x if y != 0]), batch_x)) valid_feed_dict = { model.batch_size: len(batch_x), model.X: batch_x, model.X_len: batch_x_len, } prediction = sess.run(model.prediction, feed_dict=valid_feed_dict) prediction_output = list( map(lambda x: [reversed_dict[y] for y in x if y > 0],
def train(model, x1_train, x2_train, y_train, x1_val, x2_val, y_val, x1_test,
          x2_test, y_test, args):
    """Train a sentence-pair model with Adam and binary cross-entropy.

    After every epoch the model is evaluated on the train, validation and
    test splits; the metrics are printed through ``_write_elapsed_time`` and
    logged with ``log_value``. A snapshot is saved every
    ``args.checkpoint_interval`` epochs.

    Args:
        model: Module called as ``model(x1_batch, x2_batch)``; the first
            element of its return value is passed to ``nn.BCELoss``.
        x1_train, x2_train, y_train: Training pairs and labels.
        x1_val, x2_val, y_val: Validation pairs and labels.
        x1_test, x2_test, y_test: Test pairs and labels.
        args: Options object providing ``lr``, ``l2``, ``batch_size``,
            ``epochs``, ``checkpoint_interval`` and ``save_dir``.

    Returns:
        Always ``0``.
    """
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.l2)
    loss_func = nn.BCELoss()
    batch_size = int(args.batch_size)
    configure("./logs/quora", flush_secs=5)
    use_cuda = torch.cuda.is_available()

    for epoch in range(args.epochs):
        model.train()
        start_t = time.time()
        batches = data.batch_iter(list(zip(x1_train, x2_train, y_train)),
                                  batch_size)
        for batch in batches:
            x1_batch, x2_batch, y_batch = zip(*batch)
            x1_batch = torch.LongTensor(x1_batch)
            x2_batch = torch.LongTensor(x2_batch)
            y_batch = torch.FloatTensor(y_batch)
            if use_cuda:
                x1_batch = x1_batch.cuda()
                x2_batch = x2_batch.cuda()
                y_batch = y_batch.cuda()
            # Only the probabilities are needed for the loss.
            preds, _ = model(x1_batch, x2_batch)

            # Gradient descent step.
            optimizer.zero_grad()
            loss = loss_func(preds, y_batch)
            loss.backward()
            optimizer.step()

        # eval() switches the model to eval mode; model.train() at the top
        # of the next epoch switches it back.
        loss_train, acc_train = eval(model, x1_train, x2_train, y_train,
                                     batch_size)
        loss_val, acc_val = eval(model, x1_val, x2_val, y_val, batch_size)
        loss_test, acc_test = eval(model, x1_test, x2_test, y_test,
                                   batch_size)

        tmp_str = "Epoch: {}, loss_train: {:.4f}, acc_train: {:.4f}, "
        tmp_str += "loss_val: {:.4f}, acc_val: {:.4f}, "
        tmp_str += "loss_test: {:.4f}, acc_test: {:.4f}"
        result_str = tmp_str.format(epoch + 1, loss_train, acc_train,
                                    loss_val, acc_val, loss_test, acc_test)
        _write_elapsed_time(start_t=start_t, msg=result_str)

        # Log scalar values.
        log_info = {
            'loss_train': loss_train,
            'acc_train': acc_train,
            'loss_dev': loss_val,
            'acc_dev': acc_val,
            'loss_test': loss_test,
            'acc_test': acc_test,
        }
        for tag, value in log_info.items():
            log_value(tag, value, epoch + 1)

        if (epoch + 1) % args.checkpoint_interval == 0:
            model_name = model.__class__.__name__
            save(model, save_dir=args.save_dir, save_prefix="snapshot",
                 model_name=model_name, steps=epoch + 1)
    return 0
# Restore a saved TensorFlow model and run it over x_test in batches.
# NOTE(review): `graph`, `checkpoint_file`, `x_test`, `y_test` and
# `BATCH_SIZE` are defined outside this span - confirm their setup.
config = get_tfconfig()
sess = tf.Session(config=config)
with sess.as_default():
    # Load the saved meta graph and restore variables
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(sess, checkpoint_file)
    # Get the placeholders from the graph by name
    x = graph.get_operation_by_name("x").outputs[0]
    keep_prob = graph.get_operation_by_name("keep_prob").outputs[0]
    # Tensors we want to evaluate
    predictions = graph.get_operation_by_name("output/predict").outputs[0]
    # Generate batches for one epoch
    batches = batch_iter(list(x_test), BATCH_SIZE, 1, shuffle=False)
    # Collect the predictions here
    all_predictions = []
    for x_test_batch in batches:
        # keep_prob is fed as 1.0 for inference.
        batch_predictions = sess.run(predictions, {x: x_test_batch, keep_prob: 1.0})
        # Re-concatenating on every batch copies all earlier predictions
        # each time.
        all_predictions = np.concatenate([all_predictions, batch_predictions])
# Print accuracy if y_test is defined
if y_test is not None:
    # Element-wise comparison; assumes y_test holds class indices aligned
    # with all_predictions - TODO confirm.
    correct_predictions = float(sum(all_predictions == y_test))
    print("Total number of test examples: {}".format(len(y_test)))
    print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))
# Save the evaluation to a csv
def train_and_dev(self, sess, x, y, seq_len, batch_size, test_x, test_y,
                  test_seq_len, epoch_size, data=None):
    """Train with auxiliary inputs, evaluating on the dev set every epoch.

    After each epoch the dev accuracy is computed; when it beats the best
    value so far a checkpoint is saved, dev predictions are written through
    ``self.outputs`` and ``evaluation.Evaluation`` is run on them.

    Args:
        sess: Session used for training and evaluation.
        x, y, seq_len: Training inputs, labels and sequence lengths.
        batch_size: Training batch size (evaluation uses batches of 200).
        test_x, test_y, test_seq_len: Dev inputs, labels and lengths.
        epoch_size: Number of epochs to run.
        data: Object providing the ``train_aux_*`` and ``dev_aux_*``
            attributes. Despite the ``None`` default it is required: its
            attributes are read unconditionally.

    Any exception raised inside the training loop is reported (with its
    traceback) and swallowed, so the method then returns normally.
    """
    import traceback

    learning_rate = 1e-3
    train_aux_xs = data.train_aux_xs
    train_aux_xs_len = data.train_aux_xs_len
    train_aux_len = data.train_aux_len
    test_aux_xs = data.dev_aux_xs
    test_aux_xs_len = data.dev_aux_xs_len
    test_aux_len = data.dev_aux_len
    best_acc = 0.0
    try:
        for epoch in range(epoch_size):
            total_cost = 0.0
            total_batch = 0
            total_acc_num = 0
            for (batch_x, batch_y, batch_len, batch_xs, batch_xs_len,
                 batch_aux_len) in batch_iter(
                     x, y, seq_len, batch_size, train_aux_xs,
                     train_aux_xs_len, train_aux_len, shuffle=True):
                total_batch += 1
                _, cost_val, acc_cnt = sess.run(
                    [self.train_op, self.cost, self.acc_cnt],
                    feed_dict={
                        self.in_x: batch_x,
                        self.in_y: batch_y,
                        self.in_len: batch_len,
                        self.aux_xs: batch_xs,
                        self.aux_xs_len: batch_xs_len,
                        self.aux_len: batch_aux_len,
                        self.learning_rate: learning_rate,
                        self.dropout_rate: 0.0
                    })
                total_acc_num += acc_cnt
                total_cost += cost_val
                if total_batch % 30 == 0:
                    print('batch_%d cost_val: %0.5f' % (total_batch, cost_val))
            print('Epoch:', '%02d' % (epoch + 1),
                  'cost_avg =', '%0.5f' % (total_cost / total_batch),
                  'acc: %0.5f' % (total_acc_num / (0.0 + len(x))))
            # Divide the learning rate by 10 after the 2nd and 4th epochs.
            if epoch < 4 and epoch % 2 == 1:
                learning_rate /= 10.
                print('drop learning rate, Epoch:{} - {}'.format(
                    epoch + 1, learning_rate))

            # Dev-set evaluation for this epoch.
            total_acc_num_test = 0
            for (batch_test_x, batch_test_y, batch_len, batch_xs,
                 batch_xs_len, batch_aux_len) in batch_iter(
                     test_x, test_y, test_seq_len, 200, test_aux_xs,
                     test_aux_xs_len, test_aux_len):
                acc_cnt_test = sess.run(self.acc_cnt,
                                        feed_dict={
                                            self.in_x: batch_test_x,
                                            self.in_y: batch_test_y,
                                            self.in_len: batch_len,
                                            self.aux_xs: batch_xs,
                                            self.aux_xs_len: batch_xs_len,
                                            self.aux_len: batch_aux_len,
                                            self.dropout_rate: 0.0
                                        })
                total_acc_num_test += acc_cnt_test
            cur_acc = total_acc_num_test / (0.0 + len(test_x))
            print('test acc: {:.5f}'.format(cur_acc))
            # NOTE(review): assumes checkpointing and the prediction dump
            # happen only when dev accuracy improves - confirm.
            if best_acc < cur_acc:
                best_acc = cur_acc
                self.saver.save(sess,
                                config.save_dir + '/rcnn_saver.ckpt',
                                global_step=epoch + 1)
                probs, preds = self.predict(sess, data, False)
                self.outputs(preds)
                evaluation.Evaluation(
                    config.dev_file,
                    config.dev_predict_file.format(self.params.model_name))
    except Exception:
        # Best-effort behaviour is kept (the error is not re-raised), but
        # the actual exception and traceback are printed instead of the
        # bare ``Exception`` class object.
        print(traceback.format_exc())
print("Building dictionary...") word_dict, reversed_dict, art_max_len, sum_max_len = build_dict('train') print("Loading training dataset...") train_x, train_y = build_dataset("train", word_dict, art_max_len, sum_max_len) with tf.Session() as sess: model = Summodel(reversed_dict, art_max_len, sum_max_len, args, Forward_only=False) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(tf.global_variables()) batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs) num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1 print("Number of batches per epoch :", num_batches_per_epoch) print("We are running!") for batch_x, batch_y in batches: batch_x_len = list( map(lambda x: len([y for y in x if y != 0]), batch_x)) batch_decoder_input = list( map(lambda x: [word_dict["<s>"]] + list(x), batch_y)) batch_decoder_len = list( map(lambda x: len([y for y in x if y != 0]), batch_decoder_input)) batch_decoder_output = list( map(lambda x: list(x) + [word_dict["</s>"]], batch_y)) batch_decoder_input = list( map(
def train(self):
    """Train the model with Adam, validating every 100 batches.

    The weights are saved whenever the validation loss improves. Training
    stops early once more than ``model_config.require_improvement`` batches
    have passed since the last improvement. ``self.test()`` is called at the
    end in either case.
    """
    cfg = self.model_config
    # Load the training and validation data.
    x_data, x_labels = self.load_data.load_data('train', cfg.sequence_length)
    z_data, z_labels = self.load_data.load_data('val', cfg.sequence_length)

    self.model.train()
    optimizer = torch.optim.Adam(self.model.parameters(), lr=cfg.lr)

    step = 0
    best_val_loss = float("inf")
    last_improve_step = 0
    stop_early = False
    for epoch in range(cfg.num_epochs):
        print("Epoch [{}/{}]".format(epoch + 1, cfg.num_epochs))
        for train_data, labels in batch_iter(x_data, x_labels,
                                             cfg.batch_size):
            batch_input = torch.LongTensor(train_data).to(self.device)
            outputs = self.model(batch_input)
            self.model.zero_grad()
            batch_label = torch.LongTensor(labels).to(self.device)
            loss = F.cross_entropy(outputs, batch_label)
            loss.backward()
            optimizer.step()
            step += 1

            if step % 100 == 0:
                # Training accuracy of the last batch in this group of 100.
                truth = torch.LongTensor(labels).data
                guess = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(truth, guess)
                val_acc, val_loss = self.evaluate(z_data, z_labels)
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    torch.save(self.model.state_dict(),
                               cfg.model_save_path)
                    marker = " *"
                    last_improve_step = step
                else:
                    marker = ""
                template = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}' + marker
                logger.info(
                    template.format(step, loss.item(), train_acc, val_loss,
                                    val_acc))
                self.writer.add_scalar("loss/train", loss.item(), step)
                self.writer.add_scalar("loss/dev", val_loss, step)
                self.writer.add_scalar("acc/train", train_acc, step)
                self.writer.add_scalar("acc/dev", val_acc, step)
                # Switch back to training mode after the evaluate() call.
                self.model.train()

            if step - last_improve_step > cfg.require_improvement:
                logger.info(
                    "No improvement for {} batches, I quit!!!".format(
                        cfg.require_improvement))
                stop_early = True
                break
        if stop_early:
            break
    self.test()
def train_and_test(sess, model, x_train, y_train, x_test, y_test,
                   learning_rate, batch_size, num_epochs, dropout_keep_prob,
                   out_dir, evaluate_every=100, checkpoint_every=100,
                   num_checkpoints=5):
    """Train ``model`` with Adam, periodically testing and checkpointing.

    Args:
        sess: TensorFlow session used for all graph executions.
        model: Object exposing ``loss``, ``accuracy``, ``predictions`` and
            the ``input_x``/``input_y``/``train_flag``/``dropout_keep_prob``
            placeholders.
        x_train, y_train: Training inputs and labels.
        x_test, y_test: Test inputs and labels (``x_test`` must provide
            ``.shape``).
        learning_rate: Learning rate for the Adam optimizer.
        batch_size: Batch size used for both training and testing.
        num_epochs: Number of epochs passed to ``data.batch_iter``.
        dropout_keep_prob: Keep probability fed during training steps.
        out_dir: Directory for summaries and checkpoints.
        evaluate_every: Run the test evaluation every this many steps.
        checkpoint_every: Save a checkpoint every this many steps.
        num_checkpoints: Maximum number of checkpoints kept by the saver.

    Returns:
        The highest test accuracy observed during training.
    """
    print("Writing to {}\n".format(out_dir))
    # Define training procedure
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(model.loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
    # Keep track of gradient values and sparsity
    grad_summaries = []
    for g, v in grads_and_vars:
        if g is not None:
            grad_hist_summary = tf.summary.histogram(
                "{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.summary.scalar(
                "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.summary.merge(grad_summaries)
    # Summaries for loss and accuracy
    loss_summary = tf.summary.scalar("loss", model.loss)
    acc_summary = tf.summary.scalar("accuracy", model.accuracy)
    # Train Summaries
    train_summary_op = tf.summary.merge(
        [loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
    # Test Summary Writer
    test_summary_dir = os.path.join(out_dir, "summaries", "test")
    test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph)
    # Checkpoint directory & saver
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints,
                           save_relative_paths=True)
    # Initialize all variables
    sess.run(tf.global_variables_initializer())

    def train_step(x_batch, y_batch):
        """Run one optimisation step, print its metrics and log summaries."""
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.train_flag: True,
            model.dropout_keep_prob: dropout_keep_prob
        }
        _, step, summaries, loss, accuracy = sess.run([
            train_op, global_step, train_summary_op, model.loss, model.accuracy
        ], feed_dict)
        time_str = datetime.datetime.now().isoformat()
        print("{}: Step {}, Loss {:g}, Accuracy {:g}".format(
            time_str, step, loss, accuracy))
        train_summary_writer.add_summary(summaries, step)

    def test_step(x_test, y_test, writer=None):
        """Evaluate the model on a test set in fixed-size batches.

        Returns the accuracy over all of ``x_test``; loss and accuracy are
        also written to ``writer`` when one is given.
        """
        # TODO: Hacky workaround to test model due to OOM errors / fixed batch size.
        step = 0
        size = x_test.shape[0]
        losses = 0
        predictions = np.empty(size)
        for begin in range(0, size, batch_size):
            end = begin + batch_size
            end = min([end, size])
            # Every batch is zero-padded to exactly batch_size rows; only
            # the first (end - begin) predictions are kept below.
            x_batch = np.zeros((batch_size, x_test.shape[1]))
            x_batch[:end - begin] = x_test[begin:end]
            y_batch = np.zeros(batch_size)
            y_batch[:end - begin] = y_test[begin:end]
            feed_dict = {
                model.input_x: x_batch,
                model.input_y: y_batch,
                model.train_flag: False
            }
            step, batch_pred, batch_loss = sess.run(
                [global_step, model.predictions, model.loss], feed_dict)
            predictions[begin:end] = batch_pred[:end - begin]
            # NOTE(review): batch_loss is computed over the padded batch, so
            # padding rows in the final batch presumably contribute to the
            # reported loss - confirm.
            losses += batch_loss
        accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
        loss = losses * batch_size / size
        time_str = datetime.datetime.now().isoformat()
        cur_epoch = step * batch_size / len(x_train)
        print(
            "{}: Step {}, Epoch {:.2f} / {}, Loss {:g}, Accuracy {:g}".format(
                time_str, step, cur_epoch, num_epochs, loss, accuracy))
        summary = tf.Summary()
        summary.value.add(tag="loss_1", simple_value=loss)
        summary.value.add(tag="accuracy_1", simple_value=accuracy)
        if writer:
            writer.add_summary(summary, step)
        return accuracy

    # Generate batches
    batches = data.batch_iter(list(zip(x_train, y_train)), batch_size, num_epochs)
    # Maximum test accuracy
    max_accuracy = 0.0
    # Training loop
    for batch in batches:
        x_batch, y_batch = zip(*batch)
        train_step(x_batch, y_batch)
        current_step = tf.train.global_step(sess, global_step)
        if current_step % evaluate_every == 0:
            print("\nEvaluation:")
            accuracy = test_step(x_test, y_test, writer=test_summary_writer)
            if accuracy > max_accuracy:
                max_accuracy = accuracy
            print("Max. Test Accuracy: {:g}".format(max_accuracy))
            print("")
        if current_step % checkpoint_every == 0:
            path = saver.save(sess, checkpoint_prefix, global_step=current_step)
            print("Saved model checkpoint to {}\n".format(path))
    return max_accuracy
def predict(self, sess, data, restore=True, test=False):
    """Predict class probabilities and labels for the dev or test split.

    Args:
        sess: Session used to run the model.
        data: Object exposing ``dev_*`` attributes (and ``test_*``
            attributes when ``test`` is true) for inputs, labels, sequence
            lengths and auxiliary inputs.
        restore: When true, restore the latest checkpoint from
            ``config.save_dir`` first.
        test: When exactly ``True``, use the test split instead of dev.

    Returns:
        Tuple ``(probability, predicts)``: an array reshaped to
        ``(-1, self.class_num)`` and a flat array of ``self.y_p`` outputs.

    Raises:
        RuntimeError: If ``restore`` is true and no checkpoint is found.
    """
    if restore:
        ckpt = tf.train.get_checkpoint_state(config.save_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            self.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            raise RuntimeError('no rcnn model ...')

    test_x = data.dev_x
    test_y = data.dev_y
    test_seq_len = data.dev_seq_len
    test_aux_xs = data.dev_aux_xs
    test_aux_xs_len = data.dev_aux_xs_len
    test_aux_len = data.dev_aux_len
    if test is True:
        test_x = data.test_x
        test_y = data.test_y
        test_seq_len = data.test_seq_len
        test_aux_xs = data.test_aux_xs
        test_aux_xs_len = data.test_aux_xs_len
        test_aux_len = data.test_aux_len

    probability = []
    predicts = []
    for (batch_test_x, batch_test_y, batch_len, batch_xs, batch_xs_len,
         batch_aux_len) in batch_iter(test_x, test_y, test_seq_len, 200,
                                      test_aux_xs, test_aux_xs_len,
                                      test_aux_len):
        proba, pred = sess.run(
            [self.y_prob, self.y_p],
            feed_dict={
                self.in_x: batch_test_x,
                self.in_y: batch_test_y,
                self.in_len: batch_len,
                self.aux_xs: batch_xs,
                self.aux_xs_len: batch_xs_len,
                self.aux_len: batch_aux_len,
                self.dropout_rate: 0.0
            })
        # Normalise each batch to its final layout right away so batches of
        # different sizes (e.g. a short last batch) can be joined;
        # np.array() over a ragged list of batches cannot be reshaped.
        probability.append(np.reshape(proba, (-1, self.class_num)))
        predicts.append(np.reshape(pred, -1))

    if not probability:
        # No batches: same empty shapes the stacked-and-reshaped form gives.
        return np.empty((0, self.class_num)), np.empty((0,))
    return np.concatenate(probability, axis=0), np.concatenate(predicts)
def coexpression_with_KO(genes, outloc, best_model, X_mRNA_test,
                         X_promoter_test, Y_test):
    """Simulate random regulator knock-outs and record gene co-expression.

    Input
        genes: genes for testing coexpression
        outloc: a full path to a directory of the best model
        best_model: name of the best model
        X_mRNA_test: mRNA annotation data
        X_promoter_test: promoter annotation data
        Y_test: transcriptome data

    Output
        {outloc}/{best_model}/regulator_KO/coexpression.txt.gz: the first
        and second columns are Spearman's correlation between the genes and
        the corresponding p-value. The 3rd column lists the indexes of the
        knocked-out RNA regulators and the 4th column those of the
        knocked-out DNA regulators, separated by commas. Regulator indexes
        correspond to the ones in {outloc}/feature_norm_stats.txt.

    The first row written is the unmutated baseline (empty index columns),
    followed by 10000 random knock-out simulations.
    """
    import layer_utils
    import metrics
    from keras.models import load_model
    import data
    import copy
    import subprocess
    from scipy.stats import spearmanr
    import numpy as np
    import os

    out_dir = outloc + best_model + '/regulator_KO/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    outfile_name = out_dir + 'coexpression.txt'

    # Load best model
    model = load_model(outloc + best_model + '_model.h5',
                       custom_objects={
                           'pcor': metrics.pcor,
                           'GlobalSumPooling1D': layer_utils.GlobalSumPooling1D
                       })

    # Batch data
    batch_size = 128
    _, test_batches = data.batch_iter(
        X_mRNA_test.query("Name in @genes").values[:, 1],
        X_promoter_test.query("Name in @genes").values[:, 1],
        Y_test.query("Name in @genes").values[:, 1:],
        batch_size,
        shuffle=False)
    X = test_batches.next()

    # Regulator columns with any signal; [1:] drops the first such index.
    none_zero_indx0 = np.where(
        np.sum(np.sum(X[0][0], axis=1), axis=0) > 0)[0][1:]
    none_zero_indx1 = np.where(
        np.sum(np.sum(X[0][1], axis=1), axis=0) > 0)[0][1:]

    def result_line(inputs, indx0, indx1):
        # Predict expression, correlate the first two genes, format one row.
        Y_predicted = model.predict(inputs)
        res = spearmanr(Y_predicted[0, 0:-1], Y_predicted[1, 0:-1])
        return '\t'.join([
            str(res.correlation),
            str(res.pvalue),
            ','.join(map(str, indx0)),
            ','.join(map(str, indx1)),
        ]) + '\n'

    np.random.seed(seed=1234)
    # The file is opened once (append mode, as before) instead of once per
    # simulation, and is closed before gzip runs.
    with open(outfile_name, 'a') as f_handle:
        # None mutated result: uniform(0, 1) < -0.5 is never true, so the
        # selections are empty while the random draws are still consumed.
        X_copy = copy.deepcopy(X)
        indx0 = none_zero_indx0[
            np.random.uniform(0, 1, len(none_zero_indx0)) < -0.5]
        indx1 = none_zero_indx1[
            np.random.uniform(0, 1, len(none_zero_indx1)) < -0.5]
        f_handle.write(result_line(X_copy[0], indx0, indx1))

        # Simulate random removal of regulators
        for _ in range(10000):
            X_copy = copy.deepcopy(X)
            indx0 = none_zero_indx0[
                np.random.uniform(0, 1, len(none_zero_indx0)) > 0.5]
            indx1 = none_zero_indx1[
                np.random.uniform(0, 1, len(none_zero_indx1)) > 0.5]
            # Remove binding of regulators (first gene in the batch only)
            X_copy[0][0][0, :, indx0] = 0
            X_copy[0][1][0, :, indx1] = 0
            f_handle.write(result_line(X_copy[0], indx0, indx1))

    # An argument list keeps paths containing spaces or shell
    # metacharacters intact; as before, gzip failures are not raised.
    subprocess.call(["gzip", outfile_name])
def coexpression_with_binding_site_removal(genes, outloc, best_model,
                                           X_mRNA_test, X_promoter_test,
                                           Y_test):
    """Simulate random binding-site removal and record gene co-expression.

    Input:
        genes: genes for testing coexpression
        outloc: a full path to a directory of the best model
        best_model: name of the best model
        X_mRNA_test: mRNA annotation data
        X_promoter_test: promoter annotation data
        Y_test: transcriptome data

    Output:
        {outloc}/{best_model}/binding_site_removal/coexpression.txt.gz: the
        1st column is the gene whose binding sites were removed
        ("no_mutation" for the baseline row). The 2nd and 3rd columns are
        Spearman's correlation between the genes and the corresponding
        p-value. The 4th column lists the indexes of the RNA intervals where
        binding sites were removed and the 5th column those of the promoter
        intervals, separated by commas.
    """
    import layer_utils
    import metrics
    from keras.models import load_model
    import data
    import copy
    import subprocess
    from scipy.stats import spearmanr
    import numpy as np
    import os

    out_dir = outloc + best_model + '/binding_site_removal/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    outfile_name = out_dir + 'coexpression.txt'

    # Load best model
    model = load_model(outloc + best_model + '_model.h5',
                       custom_objects={
                           'pcor': metrics.pcor,
                           'GlobalSumPooling1D': layer_utils.GlobalSumPooling1D
                       })

    # Batch data
    batch_size = 128
    _, test_batches = data.batch_iter(
        X_mRNA_test.query("Name in @genes").values[:, 1],
        X_promoter_test.query("Name in @genes").values[:, 1],
        Y_test.query("Name in @genes").values[:, 1:],
        batch_size,
        shuffle=False)
    X = test_batches.next()

    # Gene names do not change between simulations, so the DataFrame is
    # queried once instead of on every loop iteration.
    gene_names = X_mRNA_test.query("Name in @genes").values[:, 0]

    def result_line(label, inputs, indx0, indx1):
        # Predict expression, correlate the first two genes, format one row.
        Y_predicted = model.predict(inputs)
        res = spearmanr(Y_predicted[0, 0:-1], Y_predicted[1, 0:-1])
        return (label + '\t' + str(res.correlation) + '\t' + str(res.pvalue)
                + '\t' + ','.join(map(str, np.where(indx0)[0]))
                + '\t' + ','.join(map(str, np.where(indx1)[0])) + '\n')

    np.random.seed(seed=1234)
    # The file is opened once (append mode, as before) instead of once per
    # simulation, and is closed before gzip runs.
    with open(outfile_name, 'a') as f_handle:
        # None mutated result: uniform(0, 1) < -0.5 is never true, so both
        # masks are all-False while the random draws are still consumed.
        X_copy = copy.deepcopy(X)
        indx0 = np.random.uniform(0, 1, X[0][0].shape[1]) < -0.5
        indx1 = np.random.uniform(0, 1, X[0][1].shape[1]) < -0.5
        f_handle.write(result_line("no_mutation", X_copy[0], indx0, indx1))

        # Simulate random removal of binding sites
        for _ in range(10000):
            for tr_id in range(2):
                gene_name = gene_names[tr_id]
                X_copy = copy.deepcopy(X)
                indx0 = np.random.uniform(0, 1, X[0][0].shape[1]) > 0.5
                indx1 = np.random.uniform(0, 1, X[0][1].shape[1]) > 0.5
                # Remove binding sites in the selected intervals; feature
                # column 0 is left untouched.
                X_copy[0][0][tr_id, indx0, 1:] = 0
                X_copy[0][1][tr_id, indx1, 1:] = 0
                f_handle.write(
                    result_line(gene_name, X_copy[0], indx0, indx1))

    # An argument list keeps paths containing spaces or shell
    # metacharacters intact; as before, gzip failures are not raised.
    subprocess.call(["gzip", outfile_name])
def test_prediction(outloc, best_model, X_mRNA_test, X_promoter_test, Y_test):
    """Predict expression for the test set and save it next to the actuals.

    Inputs
        outloc: a full path to a directory of the best model
        best_model: name of the best model
        X_mRNA_test: mRNA annotation data
        X_promoter_test: promoter annotation data
        Y_test: transcriptome data

    Outputs (all under {outloc}{best_model}/test_data/)
        prediction.txt.gz: predicted gene expression data
        actual.txt.gz: actual gene expression data
        geneid.txt.gz: genes in testing data
    """
    import layer_utils
    import metrics
    from keras.models import load_model
    import data
    import subprocess
    import numpy as np
    import os

    # Create exactly the directory the files are written to. Previously the
    # existence check and makedirs() used differently-built paths.
    out_dir = outloc + best_model + '/test_data/'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Load best model
    model = load_model(outloc + best_model + '_model.h5',
                       custom_objects={
                           'pcor': metrics.pcor,
                           'GlobalSumPooling1D': layer_utils.GlobalSumPooling1D
                       })

    # Batching testing data
    batch_size = 128
    test_steps, test_batches = data.batch_iter(X_mRNA_test.values[:, 1],
                                               X_promoter_test.values[:, 1],
                                               Y_test.values[:, 1:],
                                               batch_size,
                                               shuffle=False)

    # Making prediction
    pred = []
    actu = []
    for _ in range(test_steps):
        batch = test_batches.next()
        # batch[0] holds the model inputs, batch[1] the matching targets.
        pred.append(model.predict(batch[0]))
        actu.append(np.vstack(batch[1]))
    pred = np.vstack(pred)
    actu = np.vstack(actu)

    # Save actual and predicted gene expression
    actual_path = out_dir + 'actual.txt'
    prediction_path = out_dir + 'prediction.txt'
    geneid_path = out_dir + 'geneid.txt'
    np.savetxt(actual_path, actu, delimiter='\t')
    np.savetxt(prediction_path, pred, delimiter='\t')
    X_mRNA_test['Name'].to_csv(geneid_path,
                               header=False,
                               index=False,
                               sep='\t')

    # gzip text files. An argument list keeps paths containing spaces or
    # shell metacharacters intact; as before, failures are not raised.
    for path in (actual_path, prediction_path, geneid_path):
        subprocess.call(["gzip", path])
summary.value.add(tag="accuracy_1", simple_value=accuracy) if writer: writer.add_summary(summary, step) return accuracy # Convert sparse matrices to arrays # TODO: Is there a workaround for this? Doesn't seem memory efficient. # TODO: https://github.com/tensorflow/tensorflow/issues/342#issuecomment-160354041 # TODO: https://github.com/tensorflow/tensorflow/issues/342#issuecomment-273463729 # TODO: https://stackoverflow.com/questions/37001686/using-sparsetensor-as-a-trainable-variable x_train = np.squeeze([x_i.toarray() for x_i in x_train]) x_test = np.squeeze([x_i.toarray() for x_i in x_test]) # Generate batches batches = data.batch_iter(list(zip(x_train, y_train)), batch_size, num_epochs) # Maximum test accuracy max_accuracy = 0 # Training loop for batch in batches: x_batch, y_batch = zip(*batch) train_step(x_batch, y_batch) current_step = tf.train.global_step(sess, global_step) if current_step % evaluate_every == 0: print("\nEvaluation:") accuracy = test_step(x_test, y_test, writer=test_summary_writer) if accuracy > max_accuracy:
def train(x_train, y_train, vocab_processor, x_dev, y_dev, embedding_matrix):
    """Train a ``CNN`` model and periodically evaluate it on the dev set.

    Builds the model in a fresh graph and session, optimises ``cnn.loss``
    with Adam (learning rate 0.001), and writes summaries, checkpoints and
    the vocabulary under ``runs/<unix timestamp>/`` relative to the current
    directory.

    Args:
        x_train: training inputs; ``x_train.shape[1]`` is the first argument
            passed to ``CNN``.
        y_train: training labels; ``y_train.shape[1]`` is the second argument
            passed to ``CNN``.
        vocab_processor: object providing ``vocabulary_`` and ``save(path)``.
        x_dev: dev inputs; the whole set is fed in one run per evaluation.
        y_dev: dev labels.
        embedding_matrix: forwarded unchanged as the last argument of ``CNN``.

    Returns:
        None. Progress is printed to stdout.

    NOTE(review): the nesting of this function was reconstructed from a
    copy that had lost its indentation; confirm the placement of the
    early-stopping check at the end of the training loop.
    """
    config = get_tfconfig()
    with tf.Graph().as_default():
        with tf.Session(config=config) as sess:
            with sess.as_default():
                cnn = CNN(x_train.shape[1], y_train.shape[1], len(vocab_processor.vocabulary_), EMBEDDING_DIM, list(map(int, FILTER_SIZES.split(","))), NUM_FILTERS, L2_REG_LAMBDA, embedding_matrix)
                # gstep is incremented by apply_gradients on every training step.
                gstep = tf.Variable(0, name="gstep", trainable=False)
                optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
                grads_vars = optimizer.compute_gradients(cnn.loss)
                train_op = optimizer.apply_gradients(grads_vars, global_step=gstep)
                # output directory for models
                time_stamp = str(int(time.time()))
                out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", time_stamp))
                print("writing to {}".format(out_dir))
                # gradients summaries
                grad_summaries = []
                # Dev accuracies, appended by dev_step and read by the
                # early-stopping check in the training loop.
                accuracy_list = []
                for g, v in grads_vars:
                    if g is not None:
                        grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                        grad_summaries.append(grad_hist_summary)
                        ## sparsity - (original author's note: "what was this one for again?")
                        ## Records tf.nn.zero_fraction(g) of each gradient as a scalar summary.
                        sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                        grad_summaries.append(sparsity_summary)
                grad_summaries_merged = tf.summary.merge(grad_summaries)
                ## loss / accuracy summaries
                loss_summary = tf.summary.scalar("loss", cnn.loss)
                acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
                ## train summaries include the gradient summaries; dev summaries do not
                train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
                train_summary_dir = os.path.join(out_dir, "summaries", "train")
                train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
                dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
                dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
                dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
                ## checkpointing
                checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
                checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=NUM_CHECKPOINTS)
                vocab_processor.save(os.path.join(out_dir, "vocab"))
                sess.run(tf.global_variables_initializer())

                def train_step(x_bat, y_bat):
                    # Runs one optimisation step on a batch, prints loss and
                    # accuracy, and logs the train summaries.
                    feed_dict = {cnn.x : x_bat, cnn.y : y_bat, cnn.keep_prob : KEEP_PROB}
                    _,step, summaries, loss, acc = sess.run([train_op, gstep, train_summary_op, cnn.loss, cnn.accuracy], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}. loss {}, acc {}".format(time_str, step, loss, acc))
                    train_summary_writer.add_summary(summaries, step)

                def dev_step(x_bat, y_bat, writer=None):
                    # Evaluates without running train_op; keep_prob is fed as
                    # 1.0 (presumably disabling dropout - confirm in CNN).
                    feed_dict = {cnn.x : x_bat, cnn.y : y_bat, cnn.keep_prob : 1.0}
                    step, summaries, loss, acc = sess.run([gstep, dev_summary_op, cnn.loss, cnn.accuracy], feed_dict)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}. loss {}, acc {}".format(time_str, step, loss, acc))
                    ## Until more than two accuracies are recorded the
                    ## reference value is -1000, so "update" is always printed.
                    max_acc = max(accuracy_list) if len(accuracy_list) > 2 else -1000
                    if max_acc < acc:
                        print("update {} -> {}, length {}".format(max_acc, acc, len(accuracy_list)))
                    # NOTE(review): append assumed to be unconditional; if it
                    # were inside the `if` above, the early-stop check below
                    # could never fire - confirm.
                    accuracy_list.append(acc)
                    if writer:
                        writer.add_summary(summaries, step)

                data = list(zip(x_train, y_train))
                batches = batch_iter(data, BATCH_SIZE, NUM_EPOCHS)
                for batch in batches:
                    x_bat, y_bat = zip(*batch)
                    # feed_dict is only referenced by the disabled debug
                    # snippet in the string literal below; train_step builds
                    # its own feed dictionary.
                    feed_dict = {cnn.x : x_bat, cnn.y : y_bat, cnn.keep_prob : KEEP_PROB}
                    """ embedding_weight = sess.run([cnn.embedding_weight], feed_dict) print np.shape(embedding_weight) sys.exit(0) """
                    train_step(x_bat, y_bat)
                    cur_step = tf.train.global_step(sess, gstep)
                    if cur_step % EVAL_EVERY == 0:
                        print("\nEvaluation : ")
                        dev_step(x_dev, y_dev, writer=dev_summary_writer)
                        print("")
                    if cur_step % CHECKPOINT_EVERY == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=cur_step)
                        print("Saved model checkpoint to {}\n".format(path))
                    # Early stopping: once more than six dev accuracies exist,
                    # mm counts down once for each of the last five that is
                    # strictly below the best accuracy seen; mm == 0 means
                    # none of the last five matched the best.
                    # NOTE(review): the `break` is assumed to leave the
                    # training loop (not the inner counting loop) - confirm.
                    if len(accuracy_list) > 6:
                        max_acc = max(accuracy_list)
                        mm = 5
                        for idx, iacc in enumerate(accuracy_list[-5:]):
                            if max_acc > iacc:
                                mm-=1
                        if mm == 0:
                            break
def train_and_dev(self, sess, x, y, seq_len, batch_size, test_x, test_y, test_seq_len, epoch_size, data=None):
    """Train for ``epoch_size`` epochs and print dev accuracy after each one.

    Args:
        sess: session used to run ``self.train_op``, ``self.cost`` and
            ``self.acc_cnt``.
        x, y, seq_len: training inputs, labels and sequence lengths, handed
            to ``batch_iter``.
        batch_size: training batch size.
        test_x, test_y, test_seq_len: dev inputs, labels and sequence
            lengths.
        epoch_size: number of epochs to run.
        data: object carrying the auxiliary inputs; must provide
            ``train_aux_xs``, ``train_aux_xs_len``, ``train_aux_len``,
            ``dev_aux_xs``, ``dev_aux_xs_len`` and ``dev_aux_len``.
            Required despite the ``None`` default kept for signature
            compatibility.

    Raises:
        ValueError: if ``data`` is None.

    The learning rate starts at 1e-3 and is divided by 10 after the 2nd and
    4th epochs. ``self.dropout_rate`` is fed as 0.0 for both training and
    dev batches.

    NOTE(review): dev evaluation is assumed to run once per epoch; the
    original nesting was not recoverable - confirm.
    """
    if data is None:
        # Fail early with a clear message instead of an AttributeError on
        # the first attribute access below.
        raise ValueError('train_and_dev requires `data` with the auxiliary train/dev inputs')

    learning_rate = 1e-3
    dev_batch_size = 200

    train_aux_xs = data.train_aux_xs
    train_aux_xs_len = data.train_aux_xs_len
    train_aux_len = data.train_aux_len
    test_aux_xs = data.dev_aux_xs
    test_aux_xs_len = data.dev_aux_xs_len
    test_aux_len = data.dev_aux_len

    def build_feed(b_x, b_y, b_len, b_xs, b_xs_len, b_aux_len):
        # Placeholders shared by the training and dev runs.
        return {
            self.in_x: b_x,
            self.in_y: b_y,
            self.in_len: b_len,
            self.aux_xs: b_xs,
            self.aux_xs_len: b_xs_len,
            self.aux_len: b_aux_len,
            self.dropout_rate: 0.0,
        }

    for epoch in range(epoch_size):
        total_cost = 0.0
        total_batch = 0
        total_acc_num = 0
        train_batches = batch_iter(x, y, seq_len, batch_size, train_aux_xs, train_aux_xs_len, train_aux_len, shuffle=True)
        for batch in train_batches:
            total_batch += 1
            feed = build_feed(*batch)
            feed[self.learning_rate] = learning_rate
            _, cost_val, acc_cnt = sess.run([self.train_op, self.cost, self.acc_cnt], feed_dict=feed)
            total_acc_num += acc_cnt
            total_cost += cost_val
            if total_batch % 30 == 0:
                print('batch_%d cost_val: %0.5f' % (total_batch, cost_val))
        print('Epoch:', '%02d' % (epoch + 1), 'cost_avg =', '%0.5f' % (total_cost / total_batch), 'acc: %0.5f' % (total_acc_num / (0.0 + len(x))))

        # Decay after epochs 2 and 4 (zero-based epoch 1 and 3).
        if epoch < 4 and epoch % 2 == 1:
            learning_rate /= 10.
            print('drop learning rate, Epoch:{} - {}'.format(epoch + 1, learning_rate))
        # self.saver.save(sess, config.save_dir+'/rcnn_saver.ckpt', global_step=epoch+1)

        # Dev accuracy for this epoch.
        total_acc_num_test = 0
        dev_batches = batch_iter(test_x, test_y, test_seq_len, dev_batch_size, test_aux_xs, test_aux_xs_len, test_aux_len)
        for batch in dev_batches:
            acc_cnt_test = sess.run(self.acc_cnt, feed_dict=build_feed(*batch))
            total_acc_num_test += acc_cnt_test
        print('test acc: %0.5f' % (total_acc_num_test / (0.0 + len(test_x))))
time_str = datetime.datetime.now().isoformat() print('{}:step:{} , loss:{} , acc:{}'.format(time_str,step,loss,accuracy)) def dev_step(x_batch, y_batch): feed_dict = { cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0 } step, summeries, loss, accuracy = sess.run( [ global_step, train_summary_op, cnn.losses, cnn.accuracy],feed_dict=feed_dict ) time_str = datetime.datetime.now().isoformat() print('{}: step:{} , loss:{} , acc:{}'.format(time_str, step, loss, accuracy)) batchs = data.batch_iter(list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) for batch in batchs: x_batch, y_batch = zip(*batch) train_step(x_batch, y_batch) current_step = tf.train.global_step(sess,global_step) if current_step % FLAGS.evaluate_every ==0: print('\n evaluate_every') dev_step(x_dev,y_dev) if current_step % FLAGS.checkpoint_every == 0: path = saver.save(sess,'./',global_step=current_step) print("Saved model checkpoint to {}\n".format(path))