def __init__(self, decoder, l2_regularization):
    self.decoder = decoder

    self.copy_target_plc = [tf.placeholder(tf.int64, shape=[None]) for _ in decoder.copynet_logits]
    self.copy_w_plc = [tf.placeholder(tf.float32, shape=[None]) for _ in decoder.copynet_logits]

    copy_costs_in_time = [tf.nn.sparse_softmax_cross_entropy_with_logits(l, t) * w
                          for w, l, t in zip(self.copy_w_plc, decoder.copynet_logits, self.copy_target_plc)]

    copy_cost = sum([tf.reduce_sum(c) for c in copy_costs_in_time])
    tf.scalar_summary('train_copy_cost', copy_cost, collections=["summary_train"])
    tf.scalar_summary('val_copy_cost', copy_cost, collections=["summary_val"])

    with tf.variable_scope("l2_regularization"):
        l2_value = sum([tf.reduce_sum(v ** 2) for v in tf.trainable_variables()])
        if l2_regularization > 0:
            l2_cost = l2_regularization * l2_value
        else:
            l2_cost = 0.0

        tf.scalar_summary('train_l2_cost', l2_value, collections=["summary_train"])

    optimizer = tf.train.AdamOptimizer(1e-4)
    gradients = optimizer.compute_gradients(decoder.cost + copy_cost + l2_cost)
    # for (g, v) in gradients:
    #     if g is not None:
    #         tf.histogram_summary('gr_' + v.name, g, collections=["summary_gradients"])

    self.optimize_op = optimizer.apply_gradients(gradients, global_step=decoder.learning_step)
    # self.summary_gradients = tf.merge_summary(tf.get_collection("summary_gradients"))
    self.summary_train = tf.merge_summary(tf.get_collection("summary_train"))
    self.summary_val = tf.merge_summary(tf.get_collection("summary_val"))
def train(self, eval_on_test=False):
    """Train model and save it to file.

    Train model with given hidden layers. Training data is created by
    prepare_training_data(), which must be called before this function.
    """
    tf.reset_default_graph()
    with tf.Session() as sess:
        feature_data = tf.placeholder("float", [None, self.num_predictors])
        labels = tf.placeholder("float", [None, self.num_classes])

        layers = [self.num_predictors] + self.hidden_layers + [self.num_classes]
        model = self.inference(feature_data, layers)
        cost, cost_summary_op = self.loss(model, labels)
        training_op = self.training(cost, learning_rate=0.0001)

        correct_prediction = tf.equal(tf.argmax(model, 1), tf.argmax(labels, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

        # Merge all variable summaries and save the results to log file
        # summary_op = tf.merge_all_summaries()
        accuracy_op_train = tf.scalar_summary("Accuracy on Train", accuracy)
        summary_op_train = tf.merge_summary([cost_summary_op, accuracy_op_train])
        if eval_on_test:
            accuracy_op_test = tf.scalar_summary("Accuracy on Test", accuracy)
            summary_op_test = tf.merge_summary([accuracy_op_test])

        summary_writer = tf.train.SummaryWriter(self.log_dir + self.model_name, sess.graph)

        train_dict = {
            feature_data: self.training_predictors_tf.values,
            labels: self.training_classes_tf.values.reshape(len(self.training_classes_tf.values), self.num_classes)}
        if eval_on_test:
            test_dict = {
                feature_data: self.test_predictors_tf.values,
                labels: self.test_classes_tf.values.reshape(len(self.test_classes_tf.values), self.num_classes)}

        init = tf.initialize_all_variables()
        sess.run(init)

        for i in range(1, self.max_iteration):
            sess.run(training_op, feed_dict=train_dict)

            # Write summary to log
            if i % 100 == 0:
                summary_str = sess.run(summary_op_train, feed_dict=train_dict)
                summary_writer.add_summary(summary_str, i)
                if eval_on_test:
                    summary_str = sess.run(summary_op_test, feed_dict=test_dict)
                    summary_writer.add_summary(summary_str, i)
                summary_writer.flush()

            # Print current accuracy to console
            if i % 5000 == 0:
                print(i, sess.run(accuracy, feed_dict=train_dict))

        # Save trained parameters
        saver = tf.train.Saver()
        saver.save(sess, self.model_filename)
def summary(self):
    # Keep track of gradient values and sparsity (optional)
    grad_summaries = []
    for grad, var in self.grads_and_vars:
        if grad is not None:
            grad_hist_summary = tf.histogram_summary(var.op.name + '/gradients/hist', grad)
            sparsity_summary = tf.scalar_summary(var.op.name + '/gradients/sparsity', tf.nn.zero_fraction(grad))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    grad_summaries_merged = tf.merge_summary(grad_summaries)

    # Output directory for models and summaries
    timestamp = str(int(time.time()))
    print("Writing to %s\n" % config.out_dir)

    # Summaries for loss and accuracy
    loss_summary = tf.scalar_summary("loss", self.loss)
    acc_summary = tf.scalar_summary("accuracy", self.accuracy)

    # Train Summaries
    self.train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
    train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
    self.train_summary_writer = tf.train.SummaryWriter(train_summary_dir, self.sess.graph_def)

    # Dev summaries
    self.val_summary_op = tf.merge_summary([loss_summary, acc_summary])
    val_summary_dir = os.path.join(config.out_dir, "summaries", "val")
    self.val_summary_writer = tf.train.SummaryWriter(val_summary_dir, self.sess.graph_def)
def create_summaries(self):
    tf.scalar_summary("eval_cost", self.eval_cost,
                      collections=[EVAL_SUMMARIES_COLLECTION])
    tf.scalar_summary("eval_accuracy", self.eval_accuracy,
                      collections=[EVAL_SUMMARIES_COLLECTION])
    self.summaries = tf.merge_summary(
        tf.get_collection(tf.GraphKeys.SUMMARIES))
    self.eval_summaries = tf.merge_summary(tf.get_collection(
        EVAL_SUMMARIES_COLLECTION))
def define_summaries(self):
    '''Helper function for init_opt'''
    all_sum = {'g': [], 'd': [], 'hist': []}
    for k, v in self.log_vars:
        if k.startswith('g'):
            all_sum['g'].append(tf.scalar_summary(k, v))
        elif k.startswith('d'):
            all_sum['d'].append(tf.scalar_summary(k, v))
        elif k.startswith('hist'):
            all_sum['hist'].append(tf.histogram_summary(k, v))

    self.g_sum = tf.merge_summary(all_sum['g'])
    self.d_sum = tf.merge_summary(all_sum['d'])
    self.hist_sum = tf.merge_summary(all_sum['hist'])
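# Note (added): the examples in this collection use the pre-1.0 summary ops (tf.scalar_summary,
# tf.histogram_summary, tf.merge_summary, tf.merge_all_summaries, tf.train.SummaryWriter). In
# TF >= 1.0 these were renamed to tf.summary.scalar, tf.summary.histogram, tf.summary.merge,
# tf.summary.merge_all and tf.summary.FileWriter. A minimal sketch of the same group-by-prefix
# pattern as define_summaries above, written against the renamed TF 1.x API and assuming a
# log_vars list of (name, tensor) pairs:
import tensorflow as tf

def define_summaries_v1(log_vars):
    all_sum = {'g': [], 'd': [], 'hist': []}
    for k, v in log_vars:
        if k.startswith('g'):
            all_sum['g'].append(tf.summary.scalar(k, v))
        elif k.startswith('d'):
            all_sum['d'].append(tf.summary.scalar(k, v))
        elif k.startswith('hist'):
            all_sum['hist'].append(tf.summary.histogram(k, v))
    # tf.summary.merge replaces tf.merge_summary; each merge still yields one string tensor
    return (tf.summary.merge(all_sum['g']),
            tf.summary.merge(all_sum['d']),
            tf.summary.merge(all_sum['hist']))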
def __init__(self, args, test):
    self.test = test
    self.reward = 0
    self.step_count = 0
    self.loss = 0.0
    self.loss_count = 0
    self.games = 0
    self.q_values = 0.0
    self.q_count = 0
    self.current_score = 0
    self.max_score = -1000000000
    self.min_score = 1000000000
    self.recording_frequency = args.recording_frequency

    with tf.device('/cpu:0'):
        self.spg = tf.placeholder(tf.float32, shape=[], name="score_per_game")
        self.mean_q = tf.placeholder(tf.float32, shape=[])
        self.total_gp = tf.placeholder(tf.float32, shape=[])
        self.max_r = tf.placeholder(tf.float32, shape=[])
        self.min_r = tf.placeholder(tf.float32, shape=[])
        self.time = tf.placeholder(tf.float32, shape=[])

        self.spg_summ = tf.scalar_summary('score_per_game', self.spg)
        self.q_summ = tf.scalar_summary('q_values', self.mean_q)
        self.gp_summ = tf.scalar_summary('steps_per_game', self.total_gp)
        self.max_summ = tf.scalar_summary('maximum_score', self.max_r)
        self.min_summ = tf.scalar_summary('minimum_score', self.min_r)
        self.time_summ = tf.scalar_summary('steps_per_second', self.time)

        if not test:
            self.mean_l = tf.placeholder(tf.float32, shape=[], name='loss')
            self.l_summ = tf.scalar_summary('loss', self.mean_l)
            self.summary_op = tf.merge_summary(
                [self.spg_summ, self.q_summ, self.gp_summ, self.l_summ,
                 self.max_summ, self.min_summ, self.time_summ])
            self.path = (args.save_path + '/records/' + args.game + '/' +
                         args.agent_type + '/' + args.agent_name + '/train')
        else:
            self.summary_op = tf.merge_summary(
                [self.spg_summ, self.q_summ, self.gp_summ,
                 self.max_summ, self.min_summ, self.time_summ])
            self.path = (args.save_path + '/records/' + args.game + '/' +
                         args.agent_type + '/' + args.agent_name + '/test')

        # self.summary_op = tf.merge_all_summaries()

    self.sess = tf.Session()
    self.summary_writer = tf.train.SummaryWriter(self.path)
    self.start_time = time.time()
def get_stat():
    fields = ['loss', 'acc']

    stat = {}
    for phase in data._PHASES:
        if phase == data._TRAIN:
            iteration = sum([len(file) for file in files[data._TRAIN]]) / _BATCH_SIZE
        elif phase == data._VAL:
            iteration = sum([len(file) for file in files[data._VAL]]) / _BATCH_SIZE

        raw_averages = {field: (net[field], util.moving_average(net[field], iteration))
                        for field in fields}

        display = {}
        display.update({'%s_raw' % field: raw_averages[field][0] for field in fields})
        display.update({'%s_avg' % field: raw_averages[field][1] for field in fields})

        summaries = []
        summaries += [tf.scalar_summary('%s_%s_raw' % (data._NAME[phase], field),
                                        raw_averages[field][0]) for field in fields]
        summaries += [tf.scalar_summary('%s_%s_avg' % (data._NAME[phase], field),
                                        raw_averages[field][1]) for field in fields]
        summary = tf.merge_summary(summaries)

        stat[phase] = dict(
            iteration=iteration,
            display=display,
            summary=summary)

    return stat
def train(self, x_train, y_train, x_test, y_test, n_epoch=10):
    """Train the cnn."""
    self.session = tf.Session()
    with self.session.as_default():
        optimizer = tf.train.AdamOptimizer(1e-3)
        grad_vars = optimizer.compute_gradients(self.loss)
        train_op = optimizer.apply_gradients(grad_vars)

        # summaries
        acc_summary = tf.scalar_summary('accuracy', self.accuracy)
        loss_summary = tf.scalar_summary('loss', self.loss)
        summary_op = tf.merge_summary([acc_summary, loss_summary])
        summary_dir = os.path.join('cnn_logs', 'summaries')
        summary_writer = tf.train.SummaryWriter(summary_dir, self.session.graph)

        # Init session
        self.session.run(tf.initialize_all_variables())

        # Create the batch iterator
        batches = batch_iterator(list(zip(x_train, y_train)), 64, n_epoch)

        # Train loop
        i = 0
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            # train step
            feed_dict = {self.x: x_batch, self.y_: y_batch, self.keep_prob: 0.5}
            _, summaries, loss, accuracy = self.session.run(
                [train_op, summary_op, self.loss, self.accuracy], feed_dict)
            time = datetime.datetime.now().isoformat()
            i += 1
            print("%s : step %s || loss %s , acc %s" % (time, i, loss, accuracy))
            summary_writer.add_summary(summaries, i)

            # Evaluation on test set every 100 steps
            if i % 100 == 0:
                print("\nEvaluation on test-set")
                feed_dict = {self.x: x_test, self.y_: y_test, self.keep_prob: 1.0}
                # Do not run train_op here: evaluation must not update the weights on test data
                loss, accuracy = self.session.run([self.loss, self.accuracy], feed_dict)
                print("%s : step %s || loss %s , acc %s" % (time, i, loss, accuracy))
                print("")
def full_model(data):
    output_logits, queue_updates = predictor(data)
    output_logits = output_logits[:, :SIG_LEN-1, :]
    output_mean = tf.argmax(output_logits, dimension=2)

    targets = data[:, 1:]
    quantized_targets = quantizer(targets, QUANT_LOWER, QUANT_UPPER, QUANT_LEVELS)
    with tf.name_scope('error'):
        batch_error = tf.reduce_mean(
            tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(output_logits, quantized_targets),
                          reduction_indices=[1]))
        error_summary = tf.scalar_summary('training error', (running_error + batch_error) / (num_runs + 1.0))
    output_plot = crappy_plot(output_mean, QUANT_LEVELS)
    target_plot = crappy_plot(quantized_targets, QUANT_LEVELS)

    M = tf.reduce_max(output_logits)
    m = tf.reduce_min(output_logits)
    scaled_logits = (output_logits - m) / (M - m)
    # image = draw_on(tf.transpose(scaled_logits, perm=[0, 2, 1])[:, :, :, None], target_plot, [1.0, 0.0, 0.0])

    # Casting is to work around some stupid tf bug; shouldn't be necessary
    output_probs = tf.reshape(
        tf.cast(tf.nn.softmax(tf.reshape(tf.cast(output_logits, tf.float64), [-1, QUANT_LEVELS])), tf.float32),
        [-1, SIG_LEN-1, QUANT_LEVELS])
    image = draw_on(tf.transpose(output_probs, perm=[0, 2, 1])[:, :, :, None], target_plot, [1.0, 0.0, 0.0])

    # image = draw_on(1.0, target_plot, [1.0, 0.0, 0.0])   # The first 1.0 starts with a white canvas
    # image = draw_on(image, output_plot, [0.0, 0.0, 1.0])

    sample_summary = tf.image_summary('posterior_sample', image, 5)
    summaries = tf.merge_summary([error_summary, sample_summary])

    return output_mean, queue_updates, batch_error, batch_error, summaries  # + 0.1*weight_decay
def prepare_loss(self, entropy_beta):
    with tf.device(self._device), tf.name_scope(self.network_name):
        if self._continuous_mode:
            policy_loss, entropy, summaries = self._prepare_policy_loss_continuous(entropy_beta)
        else:
            policy_loss, entropy, summaries = self._prepare_policy_loss_discrete(entropy_beta)

        # R (input for value)
        self.r = tf.placeholder("float", [1], name="reward")

        # value loss (output)
        # (Learning rate for Critic is half of Actor's, so multiply by 0.5)
        value_loss = 0.5 * tf.nn.l2_loss(self.r - self.v)

        # gradients of policy and value are summed up
        self.total_loss = policy_loss + value_loss

        # todo: unclear if i really need these
        l = []
        l.extend(summaries)
        l += [tf.scalar_summary(["R"], self.r)]
        l += [tf.scalar_summary(["(R-V)"], self.td)]
        l += [tf.scalar_summary("V (loss eval)", tf.reduce_mean(self.v))]  # tf.reshape(self.v, (1,)))]
        l += [tf.scalar_summary(["V (r-td)"], self.r - self.td)]
        l += [tf.scalar_summary("entropy", tf.reduce_mean(entropy))]  # tf.reshape(entropy, (1,)))]
        l += [tf.scalar_summary("policy_loss", tf.reduce_mean(policy_loss))]  # tf.reshape(policy_loss, (1,)))]
        # TODO: HACK: when we do batch mode, will want a histogram and ditch the reshape, most likely?
        l += [tf.scalar_summary("value_loss", value_loss)]

        self.loss_summary_op = tf.merge_summary(l)
def testMergeSummary(self): with self.test_session() as sess: const = tf.constant(10.0) summ1 = tf.histogram_summary("h", const, name="histo") summ2 = tf.scalar_summary("c", const, name="summ") merge = tf.merge_summary([summ1, summ2]) value = sess.run(merge) self.assertEqual([], merge.get_shape()) self.assertProtoEquals(""" value { tag: "h" histo { min: 10.0 max: 10.0 num: 1.0 sum: 10.0 sum_squares: 100.0 bucket_limit: 9.93809490288 bucket_limit: 10.9319043932 bucket_limit: 1.79769313486e+308 bucket: 0.0 bucket: 1.0 bucket: 0.0 } } value { tag: "c" simple_value: 10.0 } """, self._AsSummary(value))
def testSummaries(self):
    with self.cached_session() as s:
        var = tf.Variable([1, 2, 3], dtype=tf.float32)
        s.run(tf.initialize_all_variables())
        x, y = np.meshgrid(np.linspace(-10, 10, 256), np.linspace(-10, 10, 256))
        image = np.sin(x**2 + y**2) / np.sqrt(x**2 + y**2) * .5 + .5
        image = image[None, :, :, None]

        # make a dummy sound
        freq = 440  # A = 440Hz
        sampling_frequency = 11000
        audio = np.sin(2 * np.pi * np.linspace(0, 1, sampling_frequency) * freq)
        audio = audio[None, :, None]
        test_dir = tempfile.mkdtemp()
        # test summaries
        writer = tf.train.SummaryWriter(test_dir)
        summaries = [
            tf.scalar_summary("scalar_var", var[0]),
            tf.scalar_summary("scalar_reduce_var", tf.reduce_sum(var)),
            tf.histogram_summary("var_histogram", var),
            tf.image_summary("sin_image", image),
            tf.audio_summary("sin_wave", audio, sampling_frequency),
        ]
        run_summaries = s.run(summaries)
        writer.add_summary(s.run(tf.merge_summary(inputs=run_summaries)))
        # This is redundant, but we want to be able to rewrite the command
        writer.add_summary(s.run(tf.merge_all_summaries()))
        writer.close()
        shutil.rmtree(test_dir)
def testCanBeCalledMultipleTimes(self):
    batch_size = 20
    val_input_batch = [tf.zeros([2, 3, 4])]
    lbl_input_batch = tf.ones([], dtype=tf.int32)
    probs = np.array([0, 1, 0, 0, 0])
    batches = tf.contrib.training.stratified_sample(
        val_input_batch, lbl_input_batch, probs, batch_size, init_probs=probs)
    batches += tf.contrib.training.stratified_sample(
        val_input_batch, lbl_input_batch, probs, batch_size, init_probs=probs)
    batches += tf.contrib.training.stratified_sample_unknown_dist(
        val_input_batch, lbl_input_batch, probs, batch_size)
    batches += tf.contrib.training.stratified_sample_unknown_dist(
        val_input_batch, lbl_input_batch, probs, batch_size)
    summary_op = tf.merge_summary(tf.get_collection(tf.GraphKeys.SUMMARIES))

    with self.test_session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        sess.run(batches + (summary_op,))

        coord.request_stop()
        coord.join(threads)
def __setup_ops(self):
    cross_entropy = -tf.reduce_sum(self.actual_class * tf.log(self.output))
    self.summary = tf.scalar_summary(self.label, cross_entropy)
    self.train_op = tf.train.AdamOptimizer(0.0001).minimize(cross_entropy)
    self.merge_summaries = tf.merge_summary([self.summary])
    correct_prediction = tf.equal(tf.argmax(self.output, 1), tf.argmax(self.actual_class, 1))
    self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
def setup_summaries(sess, env_id, args):
    ROOT_LOG_DIR = constants.LOG_FILE  # os.getcwd() + "/tf-log/"
    TODAY_LOG_DIR = ROOT_LOG_DIR + "/" + datetime.now().date().isoformat()
    LOG_DIR = TODAY_LOG_DIR + "/" + datetime.now().time().replace(second=0, microsecond=0).isoformat()[0:-3].replace(':', '.')
    LOG_DIR += " %s" % env_id  # env.spec.id  # args.gym_env
    LOG_DIR += " lr=%f" % args.initial_learning_rate
    LOG_DIR += " hs=%s" % args.hidden_sizes
    LOG_DIR += " lstms=%s " % args.lstm_sizes
    if len(args.tag) > 0:
        LOG_DIR += " -- %s" % args.tag

    score_input = tf.placeholder(tf.float32, name="score_input")
    score_input_avg = tf.placeholder(tf.float32, name="score_input_avg")
    score_smooth = tf.Variable(dtype=tf.float32, initial_value=0, name="score_avg")
    score_smooth_assign_op = tf.assign(score_smooth, score_input * 0.01 + score_smooth * 0.99)

    score_summary_op = [tf.merge_summary([
        tf.scalar_summary("score", score_input),
        tf.scalar_summary("score_avg", score_input_avg),
        tf.scalar_summary("score_smooth", score_smooth),
    ]), score_smooth_assign_op]

    from collections import deque
    moving_avg_scores = deque(maxlen=100)

    # summary_op = tf.merge_all_summaries()
    summary_writer = tf.train.SummaryWriter(LOG_DIR, sess.graph_def)
    print("logs written to: %s " % LOG_DIR)
    print("tensorboard --logdir=%s" % LOG_DIR)

    # v1
    def _record_score_fn(sess, summary_writer, score, global_t):
        # Append the new score once, then compute the running average over the window
        moving_avg_scores.append(score)
        score_avg = np.mean(moving_avg_scores)
        summary_str, _ = sess.run(score_summary_op, feed_dict={
            score_input: score,
            score_input_avg: score_avg
        })
        # print "record_score_fn:", summary_str
        summary_writer.add_summary(summary_str, global_t)

    return summary_writer, _record_score_fn
def build_eval_graph(self):
    # Keep track of the totals while running through the batch data
    self.total_loss = tf.Variable(0.0, trainable=False, collections=[])
    self.total_correct = tf.Variable(0.0, trainable=False, collections=[])
    self.example_count = tf.Variable(0.0, trainable=False, collections=[])

    # Calculate the means
    self.mean_loss = self.total_loss / self.example_count
    self.accuracy = self.total_correct / self.example_count

    # Operations to modify the stateful variables
    inc_total_loss = self.total_loss.assign_add(self.model.total_loss)
    inc_total_correct = self.total_correct.assign_add(
        tf.reduce_sum(tf.cast(self.model.correct_predictions, "float")))
    inc_example_count = self.example_count.assign_add(self.model.batch_size)

    # Operation to reset all the stateful vars. Should be called before starting a data set evaluation.
    with tf.control_dependencies(
            [self.total_loss.initializer, self.total_correct.initializer, self.example_count.initializer]):
        self.eval_reset = tf.no_op()

    # Operation to modify the stateful variables with data from one batch
    # Should be called for each batch in the evaluation set
    with tf.control_dependencies([inc_total_loss, inc_total_correct, inc_example_count]):
        self.eval_step = tf.no_op()

    # Summaries
    summary_mean_loss = tf.scalar_summary("mean_loss", self.mean_loss)
    summary_acc = tf.scalar_summary("accuracy", self.accuracy)
    self.summaries = tf.merge_summary([summary_mean_loss, summary_acc])
def evaluate(dataset_path): """Evaluate model on Dataset for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): train_dir = Path(FLAGS.checkpoint_dir) reference_shape = mio.import_pickle(train_dir / 'reference_shape.pkl') images, gt_truth, inits, _ = data_provider.batch_inputs( [dataset_path], reference_shape, batch_size=FLAGS.batch_size, is_training=False) mirrored_images, _, mirrored_inits, shapes = data_provider.batch_inputs( [dataset_path], reference_shape, batch_size=FLAGS.batch_size, is_training=False, mirror_image=True) print('Loading model...') # Build a Graph that computes the logits predictions from the # inference model. with tf.device(FLAGS.device): patch_shape = (FLAGS.patch_size, FLAGS.patch_size) pred, _, _ = mdm_model.model(images, inits, patch_shape=patch_shape) tf.get_variable_scope().reuse_variables() pred_mirrored, _, _ = mdm_model.model( mirrored_images, mirrored_inits, patch_shape=patch_shape) pred_images, = tf.py_func(utils.batch_draw_landmarks, [images, pred], [tf.float32]) gt_images, = tf.py_func(utils.batch_draw_landmarks, [images, gt_truth], [tf.float32]) summaries = [] summaries.append(tf.image_summary('images', tf.concat(2, [gt_images, pred_images]), max_images=5)) avg_pred = pred + tf.py_func(flip_predictions, (pred_mirrored, shapes), (tf.float32, ))[0] avg_pred /= 2. # Calculate predictions. norm_error = mdm_model.normalized_rmse(avg_pred, gt_truth) # Restore the moving average version of the learned variables for eval. variable_averages = tf.train.ExponentialMovingAverage( mdm_train.MOVING_AVERAGE_DECAY) variables_to_restore = variable_averages.variables_to_restore() saver = tf.train.Saver(variables_to_restore) # Build the summary operation based on the TF collection of Summaries. summary_op = tf.merge_summary(summaries) graph_def = tf.get_default_graph().as_graph_def() summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir, graph_def=graph_def) while True: _eval_once(saver, summary_writer, norm_error, summary_op) if FLAGS.run_once: break time.sleep(FLAGS.eval_interval_secs)
def main(graph_path, Model, stream, validstream, continue_training=False, start_model=None, start_ind=0, save_every=1): """Run a complete training session. Will load a saved model to continue training if provided. After every epoch the current model will be saved, and the tensorboard will graph new data. """ with tf.Graph().as_default(), tf.Session() as session: initializer = tf.random_uniform_initializer(-Config.init_scale, Config.init_scale) with tf.variable_scope("model", reuse=None, initializer=initializer): m = Model(config=Config) tf.initialize_all_variables().run() saver = tf.train.Saver(max_to_keep=Config.num_models) if continue_training: print("Continuing training from saved model ",start_model) saver.restore(session,start_model) writer = tf.train.SummaryWriter(graph_path, max_queue=3) last3 = [] learning_rate = Config.learning_rate session.run(tf.assign(m.lr, learning_rate)) tol = 0.001 for i in range(start_ind, start_ind+Config.num_epochs): print("EPOCH: %s"%i) print("learning_rate: %s"%learning_rate) epoch_cost, median_cost, max_cost = m.run_epoch(session, stream.get_sents(), True) print("Total cost for EPOCH: %s"%i) print(epoch_cost) print("Median cost: %s"%median_cost) print("Max cost: %s"%max_cost) accuracy = m.run_epoch(session, validstream.get_sents(), False) print("accuracy: %s"%accuracy) summ1 = tf.scalar_summary("epoch_cost", tf.constant(epoch_cost)) summ2 = tf.scalar_summary("median_cost", tf.constant(median_cost)) summ3 = tf.scalar_summary("max_cost", tf.constant(max_cost)) summ4 = tf.scalar_summary("learning_rate", tf.constant(learning_rate)) summ5 = tf.scalar_summary("accuracy", tf.constant(accuracy)) merge = tf.merge_summary([summ1, summ2, summ3, summ4, summ5]) writer.add_summary(merge.eval(), i) if i % save_every == 0: saver.save(session, model_dir + 'saved-lstm-model', global_step=i) if len(last3) == 3: h = max(last3) if last3[2] == h: learning_rate = learning_rate/2 session.run(tf.assign(m.lr, learning_rate)) elif last3[1] == h: if (last3[1] - last3[2])/last3[1] < tol: learning_rate = learning_rate/2 session.run(tf.assign(m.lr, learning_rate)) else: if (h - min(last3))/h < tol: learning_rate = learning_rate/2 session.run(tf.assign(m.lr, learning_rate)) last3 = last3[1:] + [median_cost] elif len(last3) < 3: last3 = last3 + [median_cost] else: raise Exception
def record_summary():
    w3_summary = tf.scalar_summary("weight 3", w3)
    w2_summary = tf.scalar_summary("weight 2", w2)
    w1_summary = tf.scalar_summary("weight 1", w1)
    w0_summary = tf.scalar_summary("weight 0", w0)
    loss_summary = tf.scalar_summary("loss", loss)
    m = tf.merge_summary([w3_summary, w2_summary, w1_summary, w0_summary, loss_summary])
    return m
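# Added usage sketch (not from the original source): a merged summary op evaluates to a single
# serialized Summary protobuf string, which is then handed to a writer. The variable names below
# are illustrative stand-ins, not the w0..w3/loss globals assumed by record_summary above.
import tensorflow as tf

w = tf.Variable(0.5, name="w")
loss = tf.square(w - 1.0)
merged = tf.merge_summary([tf.scalar_summary("weight 0", w),
                           tf.scalar_summary("loss", loss)])

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter("/tmp/record_summary_demo", sess.graph)
    summary_str = sess.run(merged)      # serialized Summary protobuf
    writer.add_summary(summary_str, 0)  # second argument is the global step
    writer.flush()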
def setup_validation_summary():
    acc = tf.placeholder(tf.float32)
    auc = tf.placeholder(tf.float32)
    valid_summaries = [
        tf.summary.scalar('validation/acc', acc),
        tf.summary.scalar('validation/auc', auc)
    ]
    return tf.merge_summary(valid_summaries), acc, auc
def __init__(self, num_outputs, reuse=False, trainable=True):
    self.num_outputs = num_outputs

    # Placeholders for our input
    # Our input are 4 RGB frames of shape 160, 160 each
    self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
    # The TD target value
    self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
    # Integer id of which action was selected
    self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

    # Normalize
    X = tf.to_float(self.states) / 255.0
    batch_size = tf.shape(self.states)[0]

    # Graph shared with Value Net
    with tf.variable_scope("shared", reuse=reuse):
        fc1 = build_shared_network(X, add_summaries=(not reuse))

    with tf.variable_scope("policy_net"):
        self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None)
        self.probs = tf.nn.softmax(self.logits) + 1e-8

        self.predictions = {
            "logits": self.logits,
            "probs": self.probs
        }

        # We add entropy to the loss to encourage exploration
        self.entropy = -tf.reduce_sum(self.probs * tf.log(self.probs), 1, name="entropy")
        self.entropy_mean = tf.reduce_mean(self.entropy, name="entropy_mean")

        # Get the predictions for the chosen actions only
        gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions
        self.picked_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices)

        self.losses = - (tf.log(self.picked_action_probs) * self.targets + 0.01 * self.entropy)
        self.loss = tf.reduce_sum(self.losses, name="loss")

        tf.scalar_summary(self.loss.op.name, self.loss)
        tf.scalar_summary(self.entropy_mean.op.name, self.entropy_mean)
        tf.histogram_summary(self.entropy.op.name, self.entropy)

        if trainable:
            # self.optimizer = tf.train.AdamOptimizer(1e-4)
            self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
            self.train_op = self.optimizer.apply_gradients(self.grads_and_vars,
                                                           global_step=tf.contrib.framework.get_global_step())

    # Merge summaries from this network and the shared network (but not the value net).
    # The original also built a "policy_net"/"shared" filter here, but it was immediately
    # overwritten by the scope-name filter below, so only the effective filter is kept.
    var_scope_name = tf.get_variable_scope().name
    summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
    summaries = [s for s in summary_ops if var_scope_name in s.name]
    self.summaries = tf.merge_summary(summaries)
def keep_tracking(grads_and_vars, cnn, sess): # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.histogram_summary( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.scalar_summary( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.merge_summary(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.pardir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.scalar_summary("loss", cnn.loss) acc_summary = tf.scalar_summary("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.merge_summary( [loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.train.SummaryWriter( train_summary_dir, sess.graph_def) # Dev summaries dev_summary_op = tf.merge_summary([loss_summary, acc_summary]) dev_summary_dir = os.path.join(out_dir, "summaries", "dev") dev_summary_writer = tf.train.SummaryWriter( dev_summary_dir, sess.graph_def) # Checkpoint directory. Tensorflow assumes this directory # already exists so we need to create it checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables()) return train_summary_op, train_summary_writer, dev_summary_op, \ dev_summary_dir, dev_summary_writer, checkpoint_prefix, saver
def merge_summary(summaries):
    """
    Merge several summaries into one.

    :param summaries: Iterable of summaries.
    :return: An object that could be fed to :method:`SummaryWriter.add_summary`
    """
    summaries = flatten_list(maybe_iterable_to_list(summaries))
    return tf.merge_summary(summaries)
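# Added usage sketch for the wrapper above (illustrative, not from the original source): because
# the input is flattened first via the snippet's own flatten_list/maybe_iterable_to_list helpers,
# nested lists of summary ops can be merged in a single call.
import tensorflow as tf

toy_loss = tf.constant(0.0, name="toy_loss")          # stand-in tensor for the example
s1 = tf.scalar_summary("toy_loss", toy_loss)
s2 = [tf.histogram_summary(v.op.name, v) for v in tf.trainable_variables()]
merged = merge_summary([s1, s2])   # [s1, [...]] is flattened before reaching tf.merge_summary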
def writeSummary(self):
    self.mergedSummary = tf.merge_summary([
        self.s_loss,
        self.s_recon,
        self.s_l1,
        self.s_l1_mean,
        self.h_input,
        self.h_recon,
        self.h_v1_w,
        self.h_v1_a,
        self.h_normVals,
        self.s_errorStd,
        self.s_s_nnz
    ])
    self.imageSummary = tf.merge_summary([
        self.i_w, self.i_orig, self.i_recon
    ])
    self.train_writer = tf.train.SummaryWriter(self.tfDir + "/train", self.sess.graph)
def add_summaries(self):
    """
    Adds summaries for the following variables to the graph and returns an
    operation to evaluate them.

    * loss (raw)
    * loss (moving average)
    """
    loss = tf.scalar_summary("loss (raw)", self.loss)
    return tf.merge_summary([loss])
def setup_validation_summary():
    loss = tf.placeholder(tf.float32)
    acc = tf.placeholder(tf.float32)
    auc = tf.placeholder(tf.float32)
    valid_summaries = [
        tf.summary.scalar("validation/loss", loss),
        tf.summary.scalar("validation/acc", acc),
        tf.summary.scalar("validation/auc", auc),
    ]
    return tf.merge_summary(valid_summaries), loss, acc, auc
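# Note (added): the two setup_validation_summary variants above mix the renamed tf.summary.scalar
# ops with the legacy tf.merge_summary. That combination works on TF 0.12, where both names
# coexist; from TF 1.0 on, the merge call is tf.summary.merge. A minimal sketch of the same
# function written entirely against the renamed API:
import tensorflow as tf

def setup_validation_summary_v1():
    loss = tf.placeholder(tf.float32, name="val_loss")
    acc = tf.placeholder(tf.float32, name="val_acc")
    auc = tf.placeholder(tf.float32, name="val_auc")
    valid_summaries = [
        tf.summary.scalar("validation/loss", loss),
        tf.summary.scalar("validation/acc", acc),
        tf.summary.scalar("validation/auc", auc),
    ]
    # tf.summary.merge is the TF >= 1.0 name for tf.merge_summary
    return tf.summary.merge(valid_summaries), loss, acc, auc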
def create_summaries(self):
    # Eval cost and accuracy
    self.eval_cost = tf.Variable(0.0, name='eval_cost', trainable=False)
    self.eval_accuracy = tf.Variable(0.0, name='eval_accuracy', trainable=False)
    tf.scalar_summary("eval_cost", self.eval_cost,
                      collections=[train.EVAL_SUMMARIES_COLLECTION])
    tf.scalar_summary("eval_accuracy", self.eval_accuracy,
                      collections=[train.EVAL_SUMMARIES_COLLECTION])

    # Images
    image = tf.reshape(self.model_vars['x'], (self.batcher.batch_size, 28, 28, 1))
    tf.image_summary("mnist", image, max_images=10)

    self.summaries = tf.merge_summary(
        tf.get_collection(tf.GraphKeys.SUMMARIES))
    self.eval_summaries = tf.merge_summary(tf.get_collection(
        train.EVAL_SUMMARIES_COLLECTION))
def visualization(self, n):
    fake_sum_train, superimage_train = \
        self.visualize_one_superimage(self.fake_images[:n * n],
                                      self.images[:n * n],
                                      n, "train")
    fake_sum_test, superimage_test = \
        self.visualize_one_superimage(self.fake_images[n * n:2 * n * n],
                                      self.images[n * n:2 * n * n],
                                      n, "test")
    self.superimages = tf.concat(0, [superimage_train, superimage_test])
    self.image_summary = tf.merge_summary([fake_sum_train, fake_sum_test])
def __init__(self, reuse=False, trainable=True):
    # Placeholders for our input
    # Our input are 4 RGB frames of shape 160, 160 each
    self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
    # The TD target value
    self.targets = tf.placeholder(shape=[None], dtype=tf.float32, name="y")

    X = tf.to_float(self.states) / 255.0
    batch_size = tf.shape(self.states)[0]

    # Graph shared with Value Net
    with tf.variable_scope("shared", reuse=reuse):
        fc1 = build_shared_network(X, add_summaries=(not reuse))

    with tf.variable_scope("value_net"):
        self.logits = tf.contrib.layers.fully_connected(
            inputs=fc1,
            num_outputs=1,
            activation_fn=None)
        self.logits = tf.squeeze(self.logits, squeeze_dims=[1], name="logits")

        self.losses = tf.squared_difference(self.logits, self.targets)
        self.loss = tf.reduce_sum(self.losses, name="loss")

        self.predictions = {
            "logits": self.logits
        }

        # Summaries
        prefix = tf.get_variable_scope().name
        tf.scalar_summary(self.loss.name, self.loss)
        tf.scalar_summary("{}/max_value".format(prefix), tf.reduce_max(self.logits))
        tf.scalar_summary("{}/min_value".format(prefix), tf.reduce_min(self.logits))
        tf.scalar_summary("{}/mean_value".format(prefix), tf.reduce_mean(self.logits))
        tf.scalar_summary("{}/reward_max".format(prefix), tf.reduce_max(self.targets))
        tf.scalar_summary("{}/reward_min".format(prefix), tf.reduce_min(self.targets))
        tf.scalar_summary("{}/reward_mean".format(prefix), tf.reduce_mean(self.targets))
        tf.histogram_summary("{}/reward_targets".format(prefix), self.targets)
        tf.histogram_summary("{}/values".format(prefix), self.logits)

        if trainable:
            # self.optimizer = tf.train.AdamOptimizer(1e-4)
            self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
            self.train_op = self.optimizer.apply_gradients(self.grads_and_vars,
                                                           global_step=tf.contrib.framework.get_global_step())

    # Keep only summaries created under this network's variable scope. The original also built a
    # "policy_net"/"shared" filter (copied from the policy network), but it was immediately
    # overwritten by the scope-name filter, so only the effective filter is kept.
    var_scope_name = tf.get_variable_scope().name
    summary_ops = tf.get_collection(tf.GraphKeys.SUMMARIES)
    summaries = [s for s in summary_ops if var_scope_name in s.name]
    self.summaries = tf.merge_summary(summaries)
def bucket_net(bucket_size, stacked_lstm): # ---------------------------------------------------------------------------------------------- # Placeholders train_tokens = list() train_labels = list() for i in range(bucket_size-1): train_tokens.append(tf.placeholder(tf.int64, shape=[None], name='x_'+str(i))) train_labels.append(tf.placeholder(tf.float32, shape=[None, vocabulary_size], name='x_'+str(i+1))) embedding_inputs = list() for i in range(len(train_tokens)): embedding_inputs.append(tf.nn.embedding_lookup(embeddings, train_tokens[i])) visual_outputs = tf.placeholder(tf.float32, shape=[None, FLAGS.visual_dim], name='visual-embedding') # ---------------------------------------------------------------------------------------------- # Unrolled LSTM loop. outputs, final_state = tf.nn.rnn(stacked_lstm, embedding_inputs, dtype=tf.float32) final_state = tf.concat(1,[_state.c for _state in final_state]) if FLAGS.lstm_stacked_layers > 1 else final_state.c tf.get_variable_scope().reuse_variables() logits = tf.matmul(tf.concat(0, outputs), state2text_weight) + state2text_bias deep_prediction = tf.matmul(final_state, state2vis_weight) + state2vis_bias visual_prediction = tf.nn.relu(wide_prediction+deep_prediction) if FLAGS.dowide else tf.nn.relu(deep_prediction) # ---------------------------------------------------------------------------------------------- # Losses lstm_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf.concat(0, train_labels))) visual_loss = tf.reduce_mean(tf.square(visual_prediction - visual_outputs)) loss = lstm_loss + visual_loss # ---------------------------------------------------------------------------------------------- # Tensorboard data: loss summaries if FLAGS.boarddata: lstm_loss_summary = tf.scalar_summary('loss/lstm_loss', lstm_loss) visual_loss_summary = tf.scalar_summary('loss/visual_loss', visual_loss) loss_summary = tf.scalar_summary('loss/loss', loss) summaries = tf.merge_summary([loss_summary, lstm_loss_summary, visual_loss_summary]) else: summaries = None #---------------------------------------------------------------------------------------------- # Optimizer. def optimizer(someloss): global_step = tf.Variable(0) optimizer = tf.train.AdamOptimizer(learning_rate=0.001) gradients, v = zip(*optimizer.compute_gradients(someloss)) gradients, _ = tf.clip_by_global_norm(gradients, 1.25) optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step) return optimizer return Model(bucket_size, train_tokens, train_labels, visual_outputs, visual_prediction, loss, lstm_loss, visual_loss, optimizer(loss), optimizer(lstm_loss), optimizer(visual_loss), summaries)
def define_model(self): ''' 定义我的的计算图谱 ''' def model(data_flow, train=True): ''' @data: original inputs @return: logits ''' # Define Convolutional Layers for i, (weights, biases, config) in enumerate(zip(self.conv_weights, self.conv_biases, self.conv_config)): with tf.name_scope(config['name'] + '_model'): with tf.name_scope('convolution'): # default 1,1,1,1 stride and SAME padding data_flow = tf.nn.conv2d(data_flow, filter=weights, strides=[1, 1, 1, 1], padding='SAME') data_flow = data_flow + biases if not train: self.visualize_filter_map(data_flow, how_many=config['out_depth'], display_size=32//(i//2+1), name=config['name']+'_conv') if config['activation'] == 'relu': data_flow = tf.nn.relu(data_flow) if not train: self.visualize_filter_map(data_flow, how_many=config['out_depth'], display_size=32//(i//2+1), name=config['name']+'_relu') else: raise Exception('Activation Func can only be Relu right now. You passed', config['activation']) if config['pooling']: data_flow = tf.nn.max_pool( data_flow, ksize=[1, self.pooling_scale, self.pooling_scale, 1], strides=[1, self.pooling_stride, self.pooling_stride, 1], padding='SAME') if not train: self.visualize_filter_map(data_flow, how_many=config['out_depth'], display_size=32//(i//2+1)//2, name=config['name']+'_pooling') # Define Fully Connected Layers for i, (weights, biases, config) in enumerate(zip(self.fc_weights, self.fc_biases, self.fc_config)): if i == 0: shape = data_flow.get_shape().as_list() data_flow = tf.reshape(data_flow, [shape[0], shape[1] * shape[2] * shape[3]]) with tf.name_scope(config['name'] + 'model'): ### Dropout if train and i == len(self.fc_weights) - 1: data_flow = tf.nn.dropout(data_flow, self.dropout_rate, seed=4926) ### data_flow = tf.matmul(data_flow, weights) + biases if config['activation'] == 'relu': data_flow = tf.nn.relu(data_flow) elif config['activation'] is None: pass else: raise Exception('Activation Func can only be Relu or None right now. You passed', config['activation']) return data_flow # Training computation. logits = model(self.tf_train_samples) with tf.name_scope('loss'): self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, self.tf_train_labels)) self.loss += self.apply_regularization(_lambda=5e-4) self.train_summaries.append(tf.scalar_summary('Loss', self.loss)) # learning rate decay global_step = tf.Variable(0) learning_rate = tf.train.exponential_decay( learning_rate=self.base_learning_rate, global_step=global_step*self.train_batch_size, decay_steps=100, decay_rate=self.decay_rate, staircase=True ) # Optimizer. with tf.name_scope('optimizer'): if(self.optimizeMethod=='gradient'): self.optimizer = tf.train \ .GradientDescentOptimizer(learning_rate) \ .minimize(self.loss) elif(self.optimizeMethod=='momentum'): self.optimizer = tf.train \ .MomentumOptimizer(learning_rate, 0.5) \ .minimize(self.loss) elif(self.optimizeMethod=='adam'): self.optimizer = tf.train \ .AdamOptimizer(learning_rate) \ .minimize(self.loss) # Predictions for the training, validation, and test data. 
with tf.name_scope('train'): self.train_prediction = tf.nn.softmax(logits, name='train_prediction') tf.add_to_collection("prediction", self.train_prediction) with tf.name_scope('test'): self.test_prediction = tf.nn.softmax(model(self.tf_test_samples, train=False), name='test_prediction') tf.add_to_collection("prediction", self.test_prediction) single_shape = (1, 32, 32, 1) single_input = tf.placeholder(tf.float32, shape=single_shape, name='single_input') self.single_prediction = tf.nn.softmax(model(single_input, train=False), name='single_prediction') tf.add_to_collection("prediction", self.single_prediction) self.merged_train_summary = tf.merge_summary(self.train_summaries) self.merged_test_summary = tf.merge_summary(self.test_summaries) # 放在定义Graph之后,保存这张计算图 self.saver = tf.train.Saver(tf.all_variables())
def train(self, config): data = glob(os.path.join(config.dataset, "*.png")) #np.random.shuffle(data) assert (len(data) > 0) d_optim = tf.train.AdamOptimizer(config.learning_rate, beta1=config.beta1) \ .minimize(self.d_loss, var_list=self.d_vars) g_optim = tf.train.AdamOptimizer(config.learning_rate, beta1=config.beta1) \ .minimize(self.g_loss, var_list=self.g_vars) tf.initialize_all_variables().run() self.saver = tf.train.Saver() self.g_sum = tf.merge_summary([ self.z_sum, self.d__sum, self.G_sum, self.d_loss_fake_sum, self.g_loss_sum ]) self.d_sum = tf.merge_summary( [self.z_sum, self.d_sum, self.d_loss_real_sum, self.d_loss_sum]) self.writer = tf.train.SummaryWriter("./logs", self.sess.graph) sample_z = np.random.uniform(-1, 1, size=(self.sample_size, self.z_dim)) sample_files = data[0:self.sample_size] sample = [ get_image(sample_file, self.image_size, is_crop=self.is_crop) for sample_file in sample_files ] sample_images = np.array(sample).astype(np.float32) counter = 1 start_time = time.time() if self.load(self.checkpoint_dir): print(" [*] Load SUCCESS") else: print(" [!] Load failed...") for epoch in xrange(config.epoch): data = glob(os.path.join(config.dataset, "*.png")) batch_idxs = min(len(data), config.train_size) // self.batch_size for idx in xrange(0, batch_idxs): batch_files = data[idx * config.batch_size:(idx + 1) * config.batch_size] batch = [ get_image(batch_file, self.image_size, is_crop=self.is_crop) for batch_file in batch_files ] batch_images = np.array(batch).astype(np.float32) batch_z = np.random.uniform(-1, 1, [config.batch_size, self.z_dim]) \ .astype(np.float32) # Update D network _, summary_str = self.sess.run([d_optim, self.d_sum], feed_dict={ self.images: batch_images, self.z: batch_z }) self.writer.add_summary(summary_str, counter) # Update G network _, summary_str = self.sess.run([g_optim, self.g_sum], feed_dict={self.z: batch_z}) self.writer.add_summary(summary_str, counter) # Run g_optim twice to make sure that d_loss does not go to zero (different from paper) _, summary_str = self.sess.run([g_optim, self.g_sum], feed_dict={self.z: batch_z}) self.writer.add_summary(summary_str, counter) errD_fake = self.d_loss_fake.eval({self.z: batch_z}) errD_real = self.d_loss_real.eval({self.images: batch_images}) errG = self.g_loss.eval({self.z: batch_z}) counter += 1 print("Epoch: [%2d] [%4d/%4d] time: %4.4f, d_loss: %.8f, g_loss: %.8f" \ % (epoch, idx, batch_idxs, time.time() - start_time, errD_fake+errD_real, errG)) if np.mod(counter, 100) == 1: samples, d_loss, g_loss = self.sess.run( [self.sampler, self.d_loss, self.g_loss], feed_dict={ self.z: sample_z, self.images: sample_images }) save_images( samples, [8, 8], './samples/train_{:02d}_{:04d}.png'.format(epoch, idx)) print("[Sample] d_loss: %.8f, g_loss: %.8f" % (d_loss, g_loss)) if np.mod(counter, 500) == 2: self.save(config.checkpoint_dir, counter)
def __init__(self, config, is_training=True): self.keep_prob = config.keep_prob self.batch_size = tf.Variable(0, dtype=tf.int32, trainable=False) num_step = config.num_step self.input_data = tf.placeholder(tf.int32, [None, num_step]) self.target = tf.placeholder(tf.int64, [None]) self.mask_x = tf.placeholder(tf.float32, [num_step, None]) class_num = config.class_num hidden_neural_size = config.hidden_neural_size vocabulary_size = config.vocabulary_size embed_dim = config.embed_dim hidden_layer_num = config.hidden_layer_num self.new_batch_size = tf.placeholder(tf.int32, shape=[], name="new_batch_size") self._batch_size_update = tf.assign(self.batch_size, self.new_batch_size) #build LSTM network lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_neural_size, forget_bias=0.0, state_is_tuple=True) if self.keep_prob < 1: lstm_cell = tf.nn.rnn_cell.DropoutWrapper( lstm_cell, output_keep_prob=self.keep_prob) cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * hidden_layer_num, state_is_tuple=True) self._initial_state = cell.zero_state(self.batch_size, dtype=tf.float32) #embedding layer with tf.device("/cpu:0"), tf.name_scope("embedding_layer"): embedding = tf.get_variable("embedding", [vocabulary_size, embed_dim], dtype=tf.float32) inputs = tf.nn.embedding_lookup(embedding, self.input_data) if self.keep_prob < 1: inputs = tf.nn.dropout(inputs, self.keep_prob) out_put = [] state = self._initial_state with tf.variable_scope("LSTM_layer"): for time_step in range(num_step): if time_step > 0: tf.get_variable_scope().reuse_variables() (cell_output, state) = cell(inputs[:, time_step, :], state) out_put.append(cell_output) out_put = out_put * self.mask_x[:, :, None] with tf.name_scope("mean_pooling_layer"): out_put = tf.reduce_sum(out_put, 0) / (tf.reduce_sum( self.mask_x, 0)[:, None]) with tf.name_scope("Softmax_layer_and_output"): softmax_w = tf.get_variable("softmax_w", [hidden_neural_size, class_num], dtype=tf.float32) softmax_b = tf.get_variable("softmax_b", [class_num], dtype=tf.float32) self.logits = tf.matmul(out_put, softmax_w) + softmax_b with tf.name_scope("loss"): self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits( self.logits + 1e-10, self.target) self.cost = tf.reduce_mean(self.loss) with tf.name_scope("accuracy"): self.prediction = tf.argmax(self.logits, 1) correct_prediction = tf.equal(self.prediction, self.target) self.correct_num = tf.reduce_sum( tf.cast(correct_prediction, tf.float32)) self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy") #add summary loss_summary = tf.scalar_summary("loss", self.cost) #add summary accuracy_summary = tf.scalar_summary("accuracy_summary", self.accuracy) if not is_training: return self.globle_step = tf.Variable(0, name="globle_step", trainable=False) self.lr = tf.Variable(0.0, trainable=False) tvars = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), config.max_grad_norm) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in zip(grads, tvars): if g is not None: grad_hist_summary = tf.histogram_summary( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.scalar_summary( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) self.grad_summaries_merged = tf.merge_summary(grad_summaries) self.summary = tf.merge_summary( [loss_summary, accuracy_summary, self.grad_summaries_merged]) optimizer = tf.train.GradientDescentOptimizer(self.lr) 
optimizer.apply_gradients(zip(grads, tvars)) self.train_op = optimizer.apply_gradients(zip(grads, tvars)) self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate") self._lr_update = tf.assign(self.lr, self.new_lr)
grads_and_vars = optimizer.compute_gradients(siameseModel.loss) tr_op_set = optimizer.apply_gradients(grads_and_vars, global_step=global_step) print("defined training_ops") # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.histogram_summary( "{}/grad/hist".format(v.name), g) sparsity_summary = tf.scalar_summary( "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.merge_summary(grad_summaries) print("defined gradient summaries") # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) saver = tf.train.Saver(tf.all_variables(), max_to_keep=100) # Write vocabulary vocab_processor.save(os.path.join(checkpoint_dir, "vocab"))
summaries = []

gradients = optimizer.compute_gradients(loss)
for i, (grad, var) in enumerate(gradients):
    if grad is not None:
        gradients[i] = (tf.clip_by_value(grad, -10, 10), var)

for (grad, var) in gradients:
    if grad is not None:
        summaries.append(tf.histogram_summary(var.name + '/grad', grad))

apply_gradients = optimizer.apply_gradients(gradients)

summaries.append(tf.scalar_summary("Loss", loss))

summerize_op = tf.merge_summary(summaries)
no_summerize = tf.no_op()

llprint("Done!\n")

llprint("Initializing Variables ... ")
session.run(tf.initialize_all_variables())
llprint("Done!\n")

if from_checkpoint is not None:
    llprint("Restoring Checkpoint %s ... " % (from_checkpoint))
    ncomputer.restore(session, ckpts_dir, from_checkpoint)
    llprint("Done!\n")

last_100_losses = []
def initialize(self): params = self.params sess = self.sess device_type = params.device_type summaries = [] global_step = tf.get_variable('global_step', shape=[], dtype='int32', initializer=tf.constant_initializer(0), trainable=False) self.tensors['global_step'] = global_step epoch = tf.get_variable('epoch', shape=[], dtype='int32', initializer=tf.constant_initializer(0), trainable=False) self.tensors['epoch'] = epoch learning_rate = tf.placeholder('float32', name='learning_rate') summaries.append(tf.scalar_summary("learning_rate", learning_rate)) self.placeholders['learning_rate'] = learning_rate if params.opt == 'basic': opt = tf.train.GradientDescentOptimizer(learning_rate) elif params.opt == 'adagrad': opt = tf.train.AdagradOptimizer(learning_rate) elif params.opt == 'adam': opt = tf.train.AdamOptimizer() else: raise Exception() grads_pairs_dict = defaultdict(list) correct_tensors = [] loss_tensors = [] with tf.variable_scope("towers"): for device_id, tower in enumerate(self.towers): with tf.device("/%s:%d" % (device_type, device_id)), tf.name_scope( "%s_%d" % (device_type, device_id)): tower.initialize() tf.get_variable_scope().reuse_variables() loss_tensor = tower.get_loss_tensor() loss_tensors.append(loss_tensor) correct_tensor = tower.get_correct_tensor() correct_tensors.append(correct_tensor) self.tensors['correct_'], self.tensors[ 'mask_'], self.tensors['y_mask'], self.tensors[ 'y'] = tower.get_debug_tensor() for key, variables in tower.variables_dict.items(): grads_pair = opt.compute_gradients(loss_tensor, var_list=variables) grads_pairs_dict[key].append(grads_pair) with tf.name_scope("gpu_sync"): loss_tensor = tf.reduce_mean(tf.pack(loss_tensors), 0, name='loss') correct_tensor = tf.concat(0, correct_tensors, name="correct") with tf.name_scope("average_gradients"): grads_pair_dict = { key: average_gradients(grads_pairs) for key, grads_pairs in grads_pairs_dict.items() } if params.max_grad_norm: grads_pair_dict = { key: [(tf.clip_by_norm(grad, params.max_grad_norm), var) for grad, var in grads_pair] for key, grads_pair in grads_pair_dict.items() } self.tensors['loss'] = loss_tensor self.tensors['correct'] = correct_tensor summaries.append(tf.scalar_summary(loss_tensor.op.name, loss_tensor)) for key, grads_pair in grads_pair_dict.items(): for grad, var in grads_pair: if grad is not None: summaries.append( tf.histogram_summary(var.op.name + '/gradients/' + key, grad)) for var in tf.trainable_variables(): summaries.append(tf.histogram_summary(var.op.name, var)) apply_grads_op_dict = { key: opt.apply_gradients(grads_pair, global_step=global_step) for key, grads_pair in grads_pair_dict.items() } self.train_ops = { key: tf.group(apply_grads_op) for key, apply_grads_op in apply_grads_op_dict.items() } saver = tf.train.Saver(tf.all_variables(), max_to_keep=5) self.saver = saver summary_op = tf.merge_summary(summaries) self.tensors['summary'] = summary_op init_op = tf.initialize_all_variables() sess.run(init_op) if self.write_log: self.writer = tf.train.SummaryWriter(params.log_dir, sess.graph) self.initialized = True
def merge_summaries(self):
    # Note: the original passed [self.loss_summary + self.entropy_summary]; summary ops are
    # string tensors, which cannot be added with "+", so the two are listed separately here.
    self.summarize = tf.merge_summary(
        [self.loss_summary, self.entropy_summary]
        + self.histogram_summaries + self.weight_summaries)
def main_unsupervised(): with tf.Graph().as_default() as g: sess = tf.Session() num_hidden = FLAGS.num_hidden_layers ae_hidden_shapes = [ getattr(FLAGS, "hidden{0}_units".format(j + 1)) for j in xrange(num_hidden) ] ae_shape = [FLAGS.image_pixels ] + ae_hidden_shapes + [FLAGS.num_classes] ae = AutoEncoder(ae_shape, sess) data = read_data_sets_pretraining(FLAGS.data_dir) num_train = data.train.num_examples learning_rates = { j: getattr(FLAGS, "pre_layer{0}_learning_rate".format(j + 1)) for j in xrange(num_hidden) } noise = { j: getattr(FLAGS, "noise_{0}".format(j + 1)) for j in xrange(num_hidden) } for i in xrange(len(ae_shape) - 2): n = i + 1 with tf.variable_scope("pretrain_{0}".format(n)): input_ = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, ae_shape[0]), name='ae_input_pl') target_ = tf.placeholder(dtype=tf.float32, shape=(FLAGS.batch_size, ae_shape[0]), name='ae_target_pl') layer = ae.pretrain_net(input_, n) with tf.name_scope("target"): target_for_loss = ae.pretrain_net(target_, n, is_target=True) loss = loss_x_entropy(layer, target_for_loss) train_op, global_step = training(loss, learning_rates[i], i) summary_dir = pjoin(FLAGS.summary_dir, 'pretraining_{0}'.format(n)) summary_writer = tf.train.SummaryWriter( summary_dir, graph_def=sess.graph_def, flush_secs=FLAGS.flush_secs) summary_vars = [ ae["biases{0}".format(n)], ae["weights{0}".format(n)] ] hist_summarries = [ tf.histogram_summary(v.op.name, v) for v in summary_vars ] hist_summarries.append(loss_summaries[i]) summary_op = tf.merge_summary(hist_summarries) vars_to_init = ae.get_variables_to_init(n) vars_to_init.append(global_step) sess.run(tf.initialize_variables(vars_to_init)) print("\n\n") print("| Training Step | Cross Entropy | Layer | Epoch |") print("|---------------|---------------|---------|----------|") for step in xrange(FLAGS.pretraining_epochs * num_train): feed_dict = fill_feed_dict_ae(data.train, input_, target_, noise[i]) loss_summary, loss_value = sess.run([train_op, loss], feed_dict=feed_dict) if step % 100 == 0: summary_str = sess.run(summary_op, feed_dict=feed_dict) summary_writer.add_summary(summary_str, step) image_summary_op = \ tf.image_summary("training_images", tf.reshape(input_, (FLAGS.batch_size, FLAGS.image_size, FLAGS.image_size, 1)), max_images=FLAGS.batch_size) summary_img_str = sess.run(image_summary_op, feed_dict=feed_dict) summary_writer.add_summary(summary_img_str) output = "| {0:>13} | {1:13.4f} | Layer {2} | Epoch {3} |"\ .format(step, loss_value, n, step // num_train + 1) print(output) if i == 0: filters = sess.run(tf.identity(ae["weights1"])) np.save(pjoin(FLAGS.chkpt_dir, "filters"), filters) filters = tile_raster_images(X=filters.T, img_shape=(FLAGS.image_size, FLAGS.image_size), tile_shape=(10, 10), output_pixel_vals=False) filters = np.expand_dims(np.expand_dims(filters, 0), 3) image_var = tf.Variable(filters) image_filter = tf.identity(image_var) sess.run(tf.initialize_variables([image_var])) img_filter_summary_op = tf.image_summary( "first_layer_filters", image_filter) summary_writer.add_summary(sess.run(img_filter_summary_op)) summary_writer.flush() return ae
def main_supervised(ae): with ae.session.graph.as_default(): sess = ae.session input_pl = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.image_pixels), name='input_pl') logits = ae.supervised_net(input_pl) data = read_data_sets(FLAGS.data_dir) num_train = data.train.num_examples labels_placeholder = tf.placeholder(tf.int32, shape=FLAGS.batch_size, name='target_pl') loss = loss_supervised(logits, labels_placeholder) train_op, global_step = training(loss, FLAGS.supervised_learning_rate) eval_correct = evaluation(logits, labels_placeholder) hist_summaries = [ ae['biases{0}'.format(i + 1)] for i in xrange(ae.num_hidden_layers + 1) ] hist_summaries.extend([ ae['weights{0}'.format(i + 1)] for i in xrange(ae.num_hidden_layers + 1) ]) hist_summaries = [ tf.histogram_summary(v.op.name + "_fine_tuning", v) for v in hist_summaries ] summary_op = tf.merge_summary(hist_summaries) summary_writer = tf.train.SummaryWriter(pjoin(FLAGS.summary_dir, 'fine_tuning'), graph_def=sess.graph_def, flush_secs=FLAGS.flush_secs) vars_to_init = ae.get_variables_to_init(ae.num_hidden_layers + 1) vars_to_init.append(global_step) sess.run(tf.initialize_variables(vars_to_init)) steps = FLAGS.finetuning_epochs * num_train for step in xrange(steps): start_time = time.time() feed_dict = fill_feed_dict(data.train, input_pl, labels_placeholder) _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict) duration = time.time() - start_time # Write the summaries and print an overview fairly often. if step % 100 == 0: # Print status to stdout. print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration)) # Update the events file. summary_str = sess.run(summary_op, feed_dict=feed_dict) summary_writer.add_summary(summary_str, step) summary_img_str = sess.run(tf.image_summary( "training_images", tf.reshape(input_pl, (FLAGS.batch_size, FLAGS.image_size, FLAGS.image_size, 1)), max_images=FLAGS.batch_size), feed_dict=feed_dict) summary_writer.add_summary(summary_img_str) if (step + 1) % 1000 == 0 or (step + 1) == steps: train_sum = do_eval_summary("training_error", sess, eval_correct, input_pl, labels_placeholder, data.train) val_sum = do_eval_summary("validation_error", sess, eval_correct, input_pl, labels_placeholder, data.validation) test_sum = do_eval_summary("test_error", sess, eval_correct, input_pl, labels_placeholder, data.test) do_eval(sess, eval_correct, input_pl, labels_placeholder, data.test) summary_writer.add_summary(train_sum, step) summary_writer.add_summary(val_sum, step) summary_writer.add_summary(test_sum, step)
def hist_summaries(*args): return tf.merge_summary([tf.histogram_summary(t.name, t) for t in args])
def __init__(self, vocab_size, hidden_size, dropout, num_layers, max_gradient_norm, max_seq_length, learning_rate, lr_decay, batch_size, forward_only=False): self.num_classes = 6507 self.vocab_size = vocab_size self.learning_rate = tf.Variable(float(learning_rate), trainable=False) self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * lr_decay) initializer = tf.random_uniform_initializer(-1, 1) self.batch_pointer = 0 self.seq_input = [] self.batch_size = batch_size self.seq_lengths = [] self.projection_dim = hidden_size self.dropout = dropout self.max_gradient_norm = max_gradient_norm self.global_step = tf.Variable(0, trainable=False) self.max_seq_length = max_seq_length self.representation = [] #seq_input: list of tensors, each tensor is size max_seq_length #target: a list of values betweeen 0 and 1 indicating target scores #seq_lengths:the early stop lengths of each input tensor self.str_summary_type = tf.placeholder(tf.string, name="str_summary_type") self.seq_input = tf.placeholder(tf.int32, shape=[None, max_seq_length], name="input") self.target = tf.placeholder(tf.float32, name="target", shape=[None, self.num_classes]) self.seq_lengths = tf.placeholder(tf.int32, shape=[None], name="early_stop") self.dropout_keep_prob_embedding = tf.constant(self.dropout) self.dropout_keep_prob_lstm_input = tf.constant(self.dropout) self.dropout_keep_prob_lstm_output = tf.constant(self.dropout) with tf.variable_scope("embedding"), tf.device("/cpu:0"): W = tf.get_variable("W", [self.vocab_size, hidden_size], initializer=tf.random_uniform_initializer( -1.0, 1.0)) embedded_tokens = tf.nn.embedding_lookup(W, self.seq_input) embedded_tokens_drop = tf.nn.dropout( embedded_tokens, self.dropout_keep_prob_embedding) rnn_input = [ embedded_tokens_drop[:, i, :] for i in range(self.max_seq_length) ] with tf.variable_scope("lstm") as scope: single_cell = rnn_cell.DropoutWrapper( rnn_cell.LSTMCell(hidden_size, initializer=tf.random_uniform_initializer( -1.0, 1.0), state_is_tuple=True), input_keep_prob=self.dropout_keep_prob_lstm_input, output_keep_prob=self.dropout_keep_prob_lstm_output) cell = rnn_cell.MultiRNNCell([single_cell] * num_layers, state_is_tuple=True) initial_state = cell.zero_state(self.batch_size, tf.float32) rnn_output, rnn_state = rnn.rnn(cell, rnn_input, initial_state=initial_state, sequence_length=self.seq_lengths) states_list = [] for state in rnn_state[-1]: states_list.append(state) avg_states = tf.reduce_mean(tf.pack(states_list), 0) self.representation.append(rnn_state[-1][0]) with tf.variable_scope("output_projection"): W = tf.get_variable( "W", [hidden_size, self.num_classes], initializer=tf.truncated_normal_initializer(stddev=0.1)) b = tf.get_variable("b", [self.num_classes], initializer=tf.constant_initializer(0.1)) self.scores = tf.nn.xw_plus_b(rnn_state[-1][0], W, b) self.y = tf.nn.softmax(self.scores) self.predictions = tf.argmax(self.scores, 1) with tf.variable_scope("loss"): self.losses = tf.nn.softmax_cross_entropy_with_logits( self.scores, self.target, name="ce_losses") self.total_loss = tf.reduce_sum(self.losses) self.mean_loss = tf.reduce_mean(self.losses) with tf.variable_scope("accuracy"): self.correct_predictions = tf.equal(self.predictions, tf.argmax(self.target, 1)) self.accuracy = tf.reduce_mean(tf.cast(self.correct_predictions, "float"), name="accuracy") params = tf.trainable_variables() if not forward_only: with tf.name_scope("train") as scope: opt = tf.train.AdamOptimizer(self.learning_rate) gradients = tf.gradients(self.losses, params) 
clipped_gradients, norm = tf.clip_by_global_norm( gradients, self.max_gradient_norm) with tf.name_scope("grad_norms") as scope: grad_summ = tf.scalar_summary("grad_norms", norm) self.update = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step) loss_summ = tf.scalar_summary( "{0}_loss".format(self.str_summary_type), self.mean_loss) acc_summ = tf.scalar_summary( "{0}_accuracy".format(self.str_summary_type), self.accuracy) self.merged = tf.merge_summary([loss_summ, acc_summ]) self.saver = tf.train.Saver(tf.all_variables())
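# The constructor above clips gradients by their global norm
# (tf.clip_by_global_norm) before applying them. A minimal NumPy sketch of
# that operation, assuming a list of gradient arrays and a scalar threshold;
# the names here are illustrative only.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    """Scale every gradient by clip_norm / max(global_norm, clip_norm)."""
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 4.0]), np.array([[1.0, 2.0]])]
clipped, norm = clip_by_global_norm(grads, clip_norm=2.0)
print(norm, [c.tolist() for c in clipped])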
def __init__(self, classifier, input_dim, max_input_length, max_target_length, init_learning_rate, learning_rate_decay, num_steps, numutterances_per_minibatch): ''' NnetTrainer constructor, creates the training graph Args: classifier: the neural net classifier that will be trained input_dim: the input dimension to the nnnetgraph max_input_length: the maximal length of the input sequences max_target_length: the maximal length of the target sequences init_learning_rate: the initial learning rate learning_rate_decay: the parameter for exponential learning rate decay num_steps: the total number of steps that will be taken numutterances_per_minibatch: determines how many utterances are processed at a time to limit memory usage ''' self.numutterances_per_minibatch = numutterances_per_minibatch self.max_input_length = max_input_length self.max_target_length = max_target_length #create the graph self.graph = tf.Graph() #define the placeholders in the graph with self.graph.as_default(): #create the inputs placeholder self.inputs = tf.placeholder(tf.float32, shape=[ max_input_length, numutterances_per_minibatch, input_dim ], name='inputs') #split the 3D input tensor in a list of batch_size*input_dim tensors split_inputs = tf.unpack(self.inputs) #reference labels self.targets = tf.placeholder( tf.int32, shape=[max_target_length, numutterances_per_minibatch, 1], name='targets') #split the 3D targets tensor in a list of batch_size*input_dim #tensors split_targets = tf.unpack(self.targets) #the length of all the input sequences self.input_seq_length = tf.placeholder( tf.int32, shape=[numutterances_per_minibatch], name='input_seq_length') #the length of all the output sequences self.target_seq_length = tf.placeholder( tf.int32, shape=[numutterances_per_minibatch], name='output_seq_length') #compute the training outputs of the nnetgraph trainlogits, logit_seq_length, self.modelsaver, self.control_ops = ( classifier(split_inputs, self.input_seq_length, is_training=True, reuse=False, scope='Classifier')) #compute the validation output of the nnetgraph logits, _, _, _ = classifier(split_inputs, self.input_seq_length, is_training=False, reuse=True, scope='Classifier') #get a list of trainable variables in the decoder graph params = tf.trainable_variables() #add the variables and operations to the graph that are used for #training #total number of steps nsteps = tf.constant(num_steps, dtype=tf.int32, name='num_steps') #the total loss of the entire batch batch_loss = tf.get_variable( 'batch_loss', [], dtype=tf.float32, initializer=tf.constant_initializer(0), trainable=False) with tf.variable_scope('train_variables'): #the amount of steps already taken self.global_step = tf.get_variable( 'global_step', [], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False) #a variable to scale the learning rate (used to reduce the #learning rate in case validation performance drops) learning_rate_fact = tf.get_variable( 'learning_rate_fact', [], initializer=tf.constant_initializer(1.0), trainable=False) #compute the learning rate with exponential decay and scale with #the learning rate factor learning_rate = tf.train.exponential_decay( init_learning_rate, self.global_step, nsteps, learning_rate_decay) * learning_rate_fact #create the optimizer optimizer = tf.train.AdamOptimizer(learning_rate) #for every parameter create a variable that holds its gradients with tf.variable_scope('gradients'): grads = [ tf.get_variable(param.op.name, param.get_shape().as_list(), initializer=tf.constant_initializer(0), 
trainable=False) for param in params ] with tf.name_scope('train'): #the total number of frames that are used in the batch num_frames = tf.get_variable( name='num_frames', shape=[], dtype=tf.int32, initializer=tf.constant_initializer(0), trainable=False) #operation to update num_frames #pylint: disable=E1101 update_num_frames = num_frames.assign_add( tf.reduce_sum(self.target_seq_length)) #compute the training loss loss = self.compute_loss(split_targets, trainlogits, logit_seq_length, self.target_seq_length) #operation to half the learning rate self.halve_learningrate_op = learning_rate_fact.assign( learning_rate_fact / 2).op #create an operation to initialise the gradients self.init_grads = tf.initialize_variables(grads) #the operation to initialise the batch loss self.init_loss = batch_loss.initializer #pylint: disable=E1101 #the operation to initialize the num_frames #pylint: disable=E1101 self.init_num_frames = num_frames.initializer #compute the gradients of the batch batchgrads = tf.gradients(loss, params) #create an operation to update the batch loss #pylint: disable=E1101 self.update_loss = batch_loss.assign_add(loss) #create an operation to update the gradients, the batch_loss #and do all other update ops #pylint: disable=E1101 update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) self.update_gradients_op = tf.group(*([ grads[p].assign_add(batchgrads[p]) for p in range(len(grads)) if batchgrads[p] is not None ] + [self.update_loss] + update_ops + [update_num_frames]), name='update_gradients') #create an operation to apply the gradients #average the gradients meangrads = [ tf.div(grad, tf.cast(num_frames, tf.float32), name=grad.op.name) for grad in grads ] #clip the gradients meangrads = [ tf.clip_by_value(grad, -1., 1.) for grad in meangrads ] #apply the gradients self.apply_gradients_op = optimizer.apply_gradients( [(meangrads[p], params[p]) for p in range(len(meangrads))], global_step=self.global_step, name='apply_gradients') with tf.name_scope('valid'): #compute the validation loss valid_loss = self.compute_loss(split_targets, logits, logit_seq_length, self.target_seq_length) #operation to update the validation loss #pylint: disable=E1101 self.update_valid_loss = tf.group( *([batch_loss.assign_add(valid_loss), update_num_frames])) #operation to compute the average loss in the batch self.average_loss = batch_loss / tf.cast(num_frames, tf.float32) # add an operation to initialise all the variables in the graph self.init_op = tf.initialize_all_variables() #saver for the training variables self.saver = tf.train.Saver( tf.get_collection(tf.GraphKeys.VARIABLES, scope='train_variables')) #create the summaries for visualisation self.summary = tf.merge_summary([ tf.histogram_summary(val.name, val) for val in params + meangrads ] + [tf.scalar_summary('loss', self.average_loss)]) #specify that the graph can no longer be modified after this point self.graph.finalize() #start without visualisation self.summarywriter = None
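# The trainer above accumulates per-parameter gradients and a frame count over
# several sub-minibatches (update_gradients_op, update_num_frames) and only
# then applies their mean (meangrads), which bounds memory use. A schematic
# NumPy version of that accumulate-then-average pattern; the function and
# variable names are illustrative, not the trainer's API.
import numpy as np

def accumulate_and_average(grad_batches, frame_counts):
    """Sum per-batch gradients, then divide by the total number of frames."""
    totals = [np.zeros_like(g) for g in grad_batches[0]]
    num_frames = 0
    for grads, frames in zip(grad_batches, frame_counts):
        for acc, g in zip(totals, grads):
            acc += g                  # mirrors grads[p].assign_add(batchgrads[p])
        num_frames += frames          # mirrors update_num_frames
    return [acc / float(num_frames) for acc in totals]

batches = [[np.ones((2, 2)), np.ones(3)], [2 * np.ones((2, 2)), np.zeros(3)]]
print(accumulate_and_average(batches, frame_counts=[10, 30]))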
def train(dataset): """Train on dataset for a number of steps.""" with tf.Graph().as_default(), tf.device('/cpu:0'): # Create a variable to count the number of train() calls. This equals the # number of batches processed * FLAGS.num_gpus. global_step = tf.get_variable( 'global_step', [], initializer=tf.constant_initializer(0), trainable=False) # Calculate the learning rate schedule. num_batches_per_epoch = (dataset.num_examples_per_epoch() / FLAGS.batch_size) decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay) # Decay the learning rate exponentially based on the number of steps. lr = tf.train.exponential_decay(FLAGS.initial_learning_rate, global_step, decay_steps, FLAGS.learning_rate_decay_factor, staircase=True) # Create an optimizer that performs gradient descent. opt = tf.train.RMSPropOptimizer(lr, RMSPROP_DECAY, momentum=RMSPROP_MOMENTUM, epsilon=RMSPROP_EPSILON) # Get images and labels for ImageNet and split the batch across GPUs. assert FLAGS.batch_size % FLAGS.num_gpus == 0, ( 'Batch size must be divisible by number of GPUs') split_batch_size = int(FLAGS.batch_size / FLAGS.num_gpus) # Override the number of preprocessing threads to account for the increased # number of GPU towers. num_preprocess_threads = FLAGS.num_preprocess_threads * FLAGS.num_gpus images, labels = image_processing.distorted_inputs( dataset, num_preprocess_threads=num_preprocess_threads) input_summaries = copy.copy(tf.get_collection(tf.GraphKeys.SUMMARIES)) # Number of classes in the Dataset label set plus 1. # Label 0 is reserved for an (unused) background class. num_classes = dataset.num_classes() + 1 # Split the batch of images and labels for towers. images_splits = tf.split(0, FLAGS.num_gpus, images) labels_splits = tf.split(0, FLAGS.num_gpus, labels) # Calculate the gradients for each model tower. tower_grads = [] for i in xrange(FLAGS.num_gpus): with tf.device('/gpu:%d' % i): with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope: # Force all Variables to reside on the CPU. with slim.arg_scope([slim.variables.variable], device='/cpu:0'): # Calculate the loss for one tower of the ImageNet model. This # function constructs the entire ImageNet model but shares the # variables across all towers. loss = _tower_loss(images_splits[i], labels_splits[i], num_classes, scope) # Reuse variables for the next tower. tf.get_variable_scope().reuse_variables() # Retain the summaries from the final tower. summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope) # Retain the Batch Normalization updates operations only from the # final tower. Ideally, we should grab the updates from all towers # but these stats accumulate extremely fast so we can ignore the # other stats from the other towers without significant detriment. batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION, scope) # Calculate the gradients for the batch of data on this ImageNet # tower. grads = opt.compute_gradients(loss) # Keep track of the gradients across all towers. tower_grads.append(grads) # We must calculate the mean of each gradient. Note that this is the # synchronization point across all towers. grads = _average_gradients(tower_grads) # Add a summaries for the input processing and global_step. summaries.extend(input_summaries) # Add a summary to track the learning rate. summaries.append(tf.scalar_summary('learning_rate', lr)) # Add histograms for gradients. 
for grad, var in grads: if grad is not None: summaries.append( tf.histogram_summary(var.op.name + '/gradients', grad)) # Apply the gradients to adjust the shared variables. apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) # Add histograms for trainable variables. for var in tf.trainable_variables(): summaries.append(tf.histogram_summary(var.op.name, var)) # Track the moving averages of all trainable variables. # Note that we maintain a "double-average" of the BatchNormalization # global statistics. This is more complicated then need be but we employ # this for backward-compatibility with our previous models. variable_averages = tf.train.ExponentialMovingAverage( inception.MOVING_AVERAGE_DECAY, global_step) # Another possiblility is to use tf.slim.get_variables(). variables_to_average = (tf.trainable_variables() + tf.moving_average_variables()) variables_averages_op = variable_averages.apply(variables_to_average) # Group all updates to into a single train op. batchnorm_updates_op = tf.group(*batchnorm_updates) train_op = tf.group(apply_gradient_op, variables_averages_op, batchnorm_updates_op) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build the summary operation from the last tower summaries. summary_op = tf.merge_summary(summaries) # Build an initialization operation to run below. init = tf.initialize_all_variables() # Start running operations on the Graph. allow_soft_placement must be set to # True to build towers on GPU, as some of the ops do not have GPU # implementations. sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=FLAGS.log_device_placement)) sess.run(init) if FLAGS.pretrained_model_checkpoint_path: assert tf.gfile.Exists(FLAGS.pretrained_model_checkpoint_path) variables_to_restore = tf.get_collection( slim.variables.VARIABLES_TO_RESTORE) restorer = tf.train.Saver(variables_to_restore) restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path) print('%s: Pre-trained model restored from %s' % (datetime.now(), FLAGS.pretrained_model_checkpoint_path)) # Start the queue runners. tf.train.start_queue_runners(sess=sess) summary_writer = tf.train.SummaryWriter( FLAGS.train_dir, graph_def=sess.graph.as_graph_def(add_shapes=True)) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value = sess.run([train_op, loss]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: examples_per_sec = FLAGS.batch_size / float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, examples_per_sec, duration)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Save the model checkpoint periodically. if step % 5000 == 0 or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)
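# Besides averaging tower gradients, train() above keeps exponential moving
# averages of every trainable variable via tf.train.ExponentialMovingAverage.
# The update rule behind one EMA step is simple; a NumPy sketch with
# illustrative names and a made-up decay value:
import numpy as np

def ema_update(shadow, value, decay=0.999):
    """shadow <- decay * shadow + (1 - decay) * value."""
    return decay * shadow + (1.0 - decay) * value

shadow = np.zeros(3)
for step in range(5):
    weights = np.array([1.0, 2.0, 3.0]) + 0.01 * step   # pretend these are weights
    shadow = ema_update(shadow, weights)
print(shadow)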
def eval_loop(graph, eval_dir, train_dir, num_training_steps=10000, summary_frequency=10): """A generator which runs evaluation steps at each output. Args: graph: A tf.Graph object containing the model. eval_dir: A string path to the directory to write eval summary events. train_dir: A string path to the directory to search for checkpoints to eval. num_training_steps: Generator terminates after this many steps. summary_frequency: How many training iterations to run per generator iteration. Yields: A dict of training metrics, and runs summary_frequency training steps between each yield. If no checkpoints are found, None is yielded. """ cross_entropy = graph.get_collection('cross_entropy')[0] log_perplexity = graph.get_collection('log_perplexity')[0] accuracy = graph.get_collection('accuracy')[0] global_step = graph.get_collection('global_step')[0] with graph.as_default(): summary_op = tf.merge_summary([ tf.scalar_summary('cross_entropy_loss', cross_entropy), tf.scalar_summary('log_perplexity', log_perplexity), tf.scalar_summary('accuracy', accuracy) ]) saver = tf.train.Saver() session = tf.Session(graph=graph) summary_writer = tf.train.SummaryWriter(eval_dir, session.graph) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=session, coord=coord) gs = 0 logging.info('Starting eval loop') try: while gs < num_training_steps: checkpoint_path = tf.train.latest_checkpoint(train_dir) if not checkpoint_path: logging.info('Waiting for checkpoint file in directory %s', train_dir) yield continue saver.restore(session, checkpoint_path) ce, lp, a, gs, serialized_summaries = session.run([ cross_entropy, log_perplexity, accuracy, global_step, summary_op ]) logging.info( 'Global Step: %s - Loss: %.3f - Log-perplexity: %.3f - ' 'Step Accuracy: %.2f', gs, ce, lp, a) summary_writer.add_summary(serialized_summaries, global_step=gs) summary_writer.flush() yield { 'loss': ce, 'log_perplexity': lp, 'accuracy': a, 'global_step': gs } except tf.errors.OutOfRangeError as e: logging.warn('Got error reported to coordinator: %s', e) finally: coord.request_stop() summary_writer.close() coord.join(threads)
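# eval_loop() above is a generator that repeatedly looks for the newest
# checkpoint, evaluates it, and yields the metrics (or None while it waits).
# A stripped-down, framework-free sketch of that polling pattern; the
# latest_checkpoint and evaluate callables are stand-ins, not real APIs.
def eval_loop_sketch(latest_checkpoint, evaluate, num_training_steps):
    global_step = 0
    while global_step < num_training_steps:
        ckpt = latest_checkpoint()          # path string, or None if absent
        if ckpt is None:
            yield None                      # caller decides how long to wait
            continue
        metrics = evaluate(ckpt)            # dict with at least 'global_step'
        global_step = metrics['global_step']
        yield metrics

# usage with trivial stand-ins
paths = iter([None, 'ckpt-100', 'ckpt-200'])
fake_latest = lambda: next(paths)
fake_eval = lambda path: {'global_step': int(path.split('-')[1]), 'loss': 0.5}
for m in eval_loop_sketch(fake_latest, fake_eval, num_training_steps=200):
    print(m)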
def build_graph(mode, config, sequence_example_file_paths=None): """Builds the TensorFlow graph. Args: mode: 'train', 'eval', or 'generate'. Only mode related ops are added to the graph. config: An EventSequenceRnnConfig containing the encoder/decoder and HParams to use. sequence_example_file_paths: A list of paths to TFRecord files containing tf.train.SequenceExample protos. Only needed for training and evaluation. May be a sharded file of the form. Returns: A tf.Graph instance which contains the TF ops. Raises: ValueError: If mode is not 'train', 'eval', or 'generate'. """ if mode not in ('train', 'eval', 'generate'): raise ValueError("The mode parameter must be 'train', 'eval', " "or 'generate'. The mode parameter was: %s" % mode) hparams = config.hparams encoder_decoder = config.encoder_decoder tf.logging.info('hparams = %s', hparams.values()) input_size = encoder_decoder.input_size num_classes = encoder_decoder.num_classes no_event_label = encoder_decoder.default_event_label with tf.Graph().as_default() as graph: inputs, labels, lengths, = None, None, None state_is_tuple = True if mode == 'train' or mode == 'eval': inputs, labels, lengths = magenta.common.get_padded_batch( sequence_example_file_paths, hparams.batch_size, input_size) elif mode == 'generate': inputs = tf.placeholder(tf.float32, [hparams.batch_size, None, input_size]) # If state_is_tuple is True, the output RNN cell state will be a tuple # instead of a tensor. During training and evaluation this improves # performance. However, during generation, the RNN cell state is fed # back into the graph with a feed dict. Feed dicts require passed in # values to be tensors and not tuples, so state_is_tuple is set to False. state_is_tuple = False cell = make_rnn_cell(hparams.rnn_layer_sizes, dropout_keep_prob=hparams.dropout_keep_prob, attn_length=hparams.attn_length, state_is_tuple=state_is_tuple) initial_state = cell.zero_state(hparams.batch_size, tf.float32) outputs, final_state = tf.nn.dynamic_rnn( cell, inputs, lengths, initial_state, parallel_iterations=1, swap_memory=True) outputs_flat = tf.reshape(outputs, [-1, hparams.rnn_layer_sizes[-1]]) logits_flat = tf.contrib.layers.linear(outputs_flat, num_classes) if mode == 'train' or mode == 'eval': if hparams.skip_first_n_losses: logits = tf.reshape(logits_flat, [hparams.batch_size, -1, num_classes]) logits = logits[:, hparams.skip_first_n_losses:, :] logits_flat = tf.reshape(logits, [-1, num_classes]) labels = labels[:, hparams.skip_first_n_losses:] labels_flat = tf.reshape(labels, [-1]) softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits_flat, labels_flat) loss = tf.reduce_mean(softmax_cross_entropy) perplexity = tf.reduce_mean(tf.exp(softmax_cross_entropy)) correct_predictions = tf.to_float( tf.nn.in_top_k(logits_flat, labels_flat, 1)) accuracy = tf.reduce_mean(correct_predictions) * 100 event_positions = tf.to_float(tf.not_equal(labels_flat, no_event_label)) event_accuracy = tf.truediv( tf.reduce_sum(tf.mul(correct_predictions, event_positions)), tf.reduce_sum(event_positions)) * 100 no_event_positions = tf.to_float(tf.equal(labels_flat, no_event_label)) no_event_accuracy = tf.truediv( tf.reduce_sum(tf.mul(correct_predictions, no_event_positions)), tf.reduce_sum(no_event_positions)) * 100 global_step = tf.Variable(0, trainable=False, name='global_step') tf.add_to_collection('loss', loss) tf.add_to_collection('perplexity', perplexity) tf.add_to_collection('accuracy', accuracy) tf.add_to_collection('global_step', global_step) summaries = [ 
tf.scalar_summary('loss', loss), tf.scalar_summary('perplexity', perplexity), tf.scalar_summary('accuracy', accuracy), tf.scalar_summary('event_accuracy', event_accuracy), tf.scalar_summary('no_event_accuracy', no_event_accuracy), ] if mode == 'train': learning_rate = tf.train.exponential_decay( hparams.initial_learning_rate, global_step, hparams.decay_steps, hparams.decay_rate, staircase=True, name='learning_rate') opt = tf.train.AdamOptimizer(learning_rate) params = tf.trainable_variables() gradients = tf.gradients(loss, params) clipped_gradients, _ = tf.clip_by_global_norm(gradients, hparams.clip_norm) train_op = opt.apply_gradients(zip(clipped_gradients, params), global_step) tf.add_to_collection('learning_rate', learning_rate) tf.add_to_collection('train_op', train_op) summaries.append(tf.scalar_summary('learning_rate', learning_rate)) if mode == 'eval': summary_op = tf.merge_summary(summaries) tf.add_to_collection('summary_op', summary_op) elif mode == 'generate': temperature = tf.placeholder(tf.float32, []) softmax_flat = tf.nn.softmax( tf.div(logits_flat, tf.fill([num_classes], temperature))) softmax = tf.reshape(softmax_flat, [hparams.batch_size, -1, num_classes]) tf.add_to_collection('inputs', inputs) tf.add_to_collection('initial_state', initial_state) tf.add_to_collection('final_state', final_state) tf.add_to_collection('temperature', temperature) tf.add_to_collection('softmax', softmax) return graph
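# Several snippets in this collection (build_graph here, the Inception and
# CIFAR trainers elsewhere) rely on tf.train.exponential_decay, often with
# staircase=True. The formula behind it, as a small pure-Python sketch with
# example numbers of my own choosing:
def exponential_decay(initial_rate, global_step, decay_steps, decay_rate,
                      staircase=False):
    exponent = global_step / float(decay_steps)
    if staircase:
        exponent = global_step // decay_steps     # decay in discrete jumps
    return initial_rate * (decay_rate ** exponent)

for step in (0, 500, 1000, 1500):
    print(step,
          round(exponential_decay(0.1, step, 1000, 0.96), 5),
          round(exponential_decay(0.1, step, 1000, 0.96, staircase=True), 5))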
def pre_train_layer(self, depth, data, epoch): sess = self.sess print('Starting to pretrain layer %d.' % depth) hidden_layer = self.hidden_layers[depth] summary_list = [] with tf.name_scope(hidden_layer.name): with tf.name_scope("x_values"): x_original = tf.placeholder(tf.float32, shape=[None, self.input_dim]) x_latent = self.get_encoded_input(x_original, depth) x_corrupt = corrupt(x_latent, corruption_level=self.noise) with tf.name_scope("encoded_and_decoded"): encoded = hidden_layer.encode(x_corrupt) encoded = tf.nn.dropout(encoded, keep_prob=0.5) decoded = hidden_layer.decode(encoded) attach_variable_summaries(encoded, "encoded", summ_list=summary_list) attach_variable_summaries(decoded, "decoded", summ_list=summary_list) # Reconstruction loss with tf.name_scope("reconstruction_loss"): # loss = self.get_loss(x_latent, decoded) loss = self.get_l2_loss(x_latent, decoded) attach_scalar_summary(loss, "%s_loss" % 'l2_loss', summ_list=summary_list) trainable_vars = [ hidden_layer.weights, hidden_layer.biases, hidden_layer.decode_biases ] # Only optimize variables for this layer ("greedy") with tf.name_scope("train_step"): train_op = tf.train.AdamOptimizer( learning_rate=self.pretrain_lr).minimize( loss, var_list=trainable_vars) sess.run(tf.initialize_all_variables()) # Merge summaries and get a summary writer merged = tf.merge_summary(summary_list) pretrain_writer = tf.train.SummaryWriter( "model/" + hidden_layer.name, sess.graph) step = 0 for i in range(epoch): np.random.shuffle(data) batches = [ _ for _ in utilities.gen_batches(data, FLAGS.batch_size) ] for batch_x_original in batches: sess.run(train_op, feed_dict={x_original: batch_x_original}) if step % self.print_step == 0: loss_value = sess.run( loss, feed_dict={x_original: batch_x_original}) encoded_mean = sess.run( tf.reduce_mean(encoded), feed_dict={x_original: batch_x_original}) print("Step %s, batch %s loss = %s, encoded_mean=%s" % (step, 'l2_loss', loss_value, encoded_mean)) if step % FLAGS.log_step == 0: summary = sess.run( merged, feed_dict={x_original: batch_x_original}) pretrain_writer.add_summary(summary, global_step=step) # Break for debugging purposes if FLAGS.debug and step > 5: break step += 1 print( "Finished pretraining of layer %d. Updated layer weights and biases." % depth)
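# pre_train_layer() corrupts the encoded input with
# corrupt(x_latent, corruption_level=self.noise) before reconstructing it. One
# common choice is masking noise, i.e. zeroing a random fraction of the
# inputs; this NumPy sketch assumes that variant, which may differ from the
# corrupt() helper actually used above.
import numpy as np

def corrupt_masking(x, corruption_level, rng=np.random):
    """Zero out roughly a `corruption_level` fraction of the entries of x."""
    mask = rng.uniform(size=x.shape) >= corruption_level
    return x * mask

x = np.arange(12, dtype=float).reshape(3, 4)
print(corrupt_masking(x, corruption_level=0.3, rng=np.random.RandomState(0)))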
# Define Training procedure global_step = tf.Variable(0, name="global_step", trainable=False) optimizer = tf.train.AdamOptimizer(1e-3) grads_and_vars = optimizer.compute_gradients(cnn.loss) train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) # Keep track of gradient values and sparsity (optional) grad_summaries = [] for g, v in grads_and_vars: if g is not None: grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g) sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) grad_summaries.append(grad_hist_summary) grad_summaries.append(sparsity_summary) grad_summaries_merged = tf.merge_summary(grad_summaries) # Output directory for models and summaries timestamp = str(int(time.time())) out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) print("Writing to {}\n".format(out_dir)) # Summaries for loss and accuracy loss_summary = tf.scalar_summary("loss", cnn.loss) acc_summary = tf.scalar_summary("accuracy", cnn.accuracy) # Train Summaries train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged]) train_summary_dir = os.path.join(out_dir, "summaries", "train") train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)
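# The block above logs, for every gradient, a histogram and its sparsity via
# tf.nn.zero_fraction. That statistic is simply the fraction of exactly-zero
# entries; a one-line NumPy sketch:
import numpy as np

def zero_fraction(grad):
    return float(np.mean(grad == 0.0))

print(zero_fraction(np.array([0.0, 0.5, 0.0, -1.0])))   # 0.5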
def finetune_parameters_gen(self, xy_train_gen, output_dim, epochs): """An implementation of finetuning to support data feeding from generators.""" sess = self.sess summary_list = [] print("Starting to fine tune parameters of network.") with tf.name_scope("finetuning"): with tf.name_scope("inputs"): x = tf.placeholder(tf.float32, shape=[None, self.input_dim], name="raw_input") with tf.name_scope("fully_encoded"): x_encoded = self.get_encoded_input( x, depth=-1) # Full depth encoding """Note on W below: The difference between self.output_dim and output_dim is that the former is the output dimension of the autoencoder stack, which is the dimension of the new feature space. The latter is the dimension of the y value space for classification. Ex: If the output should be binary, then the output_dim = 2.""" with tf.name_scope("softmax_variables"): self.W = weight_variable(self.output_dim, output_dim, name="weights") self.b = bias_variable(output_dim, initial_value=0, name="biases") attach_variable_summaries(self.W, self.W.name, summ_list=summary_list) attach_variable_summaries(self.b, self.b.name, summ_list=summary_list) with tf.name_scope("outputs"): y_logits = tf.matmul(x_encoded, self.W) + self.b with tf.name_scope("predicted"): y_pred = tf.nn.softmax(y_logits, name="y_pred") attach_variable_summaries(y_pred, y_pred.name, summ_list=summary_list) with tf.name_scope("actual"): y_actual = tf.placeholder(tf.float32, shape=[None, output_dim], name="y_actual") attach_variable_summaries(y_actual, y_actual.name, summ_list=summary_list) with tf.name_scope("cross_entropy"): cross_entropy = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( y_logits, y_actual)) attach_scalar_summary(cross_entropy, "cross_entropy", summ_list=summary_list) trainable_vars = self.get_all_variables( additional_layer=[self.W, self.b]) with tf.name_scope("train_step"): train_step = tf.train.AdamOptimizer( learning_rate=self.finetune_lr).minimize( cross_entropy, var_list=trainable_vars) with tf.name_scope("evaluation"): correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.argmax(y_actual, 1)) accuracy = tf.reduce_mean( tf.cast(correct_prediction, tf.float32)) attach_scalar_summary(accuracy, "finetune_accuracy", summ_list=summary_list) sess.run(tf.initialize_all_variables()) # Merge summaries and get a summary writer merged = tf.merge_summary(summary_list) train_writer = tf.train.SummaryWriter( FLAGS.log_dir + "/train/finetune", sess.graph) step = 0 for i in range(epochs): np.random.shuffle(xy_train_gen) train_data_batchs = [ _ for _ in utilities.gen_batches(xy_train_gen, FLAGS.batch_size) ] for batch in train_data_batchs: batch_xs, batch_ys = zip(*batch) # print 'get xs batch size===', len(batch_xs), type(batch_xs[0]), batch_xs[0].shape # print 'get ys batch size===', len(batch_ys), type(batch_ys[0]), batch_ys[0].shape if step % self.print_step == 0: print( "Step %s, batch accuracy: " % step, sess.run(accuracy, feed_dict={ x: batch_xs, y_actual: batch_ys })) # For debugging predicted y values if step % (self.print_step * 10) == 0: print("Predicted y-value:", sess.run(y_pred, feed_dict={x: batch_xs})[0]) print("Actual y-value:", batch_ys[0]) if step % FLAGS.log_step == 0: summary = sess.run(merged, feed_dict={ x: batch_xs, y_actual: batch_ys }) train_writer.add_summary(summary, global_step=step) # For debugging, break early. 
if FLAGS.debug and step > 5: break sess.run(train_step, feed_dict={ x: batch_xs, y_actual: batch_ys }) step += 1 print("Completed fine-tuning of parameters.") tuned_params = { "layer1_weights": sess.run(self.hidden_layers[0].get_weight_variable()), "layer2_weights": sess.run(self.hidden_layers[1].get_weight_variable()), "layer3_weights": sess.run(self.hidden_layers[2].get_weight_variable()), "weights": sess.run(self.W), "biases": sess.run(self.b) } return tuned_params
def __init__( self, conf, images=None, actions=None, states=None, sequence_length=None, reuse_scope=None, ): from prediction_hiddenstate import construct_model if sequence_length is None: sequence_length = conf['sequence_length'] self.prefix = prefix = tf.placeholder(tf.string, []) self.iter_num = tf.placeholder(tf.float32, []) summaries = [] # Split into timesteps. actions = tf.split(1, actions.get_shape()[1], actions) actions = [tf.squeeze(act) for act in actions] states = tf.split(1, states.get_shape()[1], states) states = [tf.squeeze(st) for st in states] images = tf.split(1, images.get_shape()[1], images) images = [tf.squeeze(img) for img in images] if reuse_scope is None: gen_images, gen_states, gen_masks, inf_low_state, pred_low_state = construct_model( images, actions, states, iter_num=self.iter_num, k=conf['schedsamp_k'], use_state=conf['use_state'], context_frames=conf['context_frames'], conf=conf) else: # If it's a validation or test model. with tf.variable_scope(reuse_scope, reuse=True): gen_images, gen_states, gen_masks, inf_low_state, pred_low_state = construct_model( images, actions, states, iter_num=self.iter_num, k=conf['schedsamp_k'], use_state=conf['use_state'], context_frames=conf['context_frames'], conf=conf) self.inf_low_state = inf_low_state self.gen_images = gen_images self.gen_masks = gen_masks self.gen_states = gen_states self.lr = tf.placeholder_with_default(conf['learning_rate'], ()) if 'prop_latent' in conf: return # do not do backprop when visualizing latent model forward propagation # L2 loss, PSNR for eval. loss, psnr_all = 0.0, 0.0 for i, x, gx in zip(range(len(gen_images)), images[conf['context_frames']:], gen_images[conf['context_frames'] - 1:]): recon_cost = mean_squared_error(x, gx) psnr_i = peak_signal_to_noise_ratio(x, gx) psnr_all += psnr_i summaries.append( tf.scalar_summary(prefix + '_recon_cost' + str(i), recon_cost)) summaries.append( tf.scalar_summary(prefix + '_psnr' + str(i), psnr_i)) loss += recon_cost for i, state, gen_state in zip( range(len(gen_states)), states[conf['context_frames']:], gen_states[conf['context_frames'] - 1:]): state_cost = mean_squared_error( state, gen_state) * 1e-4 * conf['use_state'] summaries.append( tf.scalar_summary(prefix + '_state_cost' + str(i), state_cost)) loss += state_cost summaries.append(tf.scalar_summary(prefix + '_psnr_all', psnr_all)) self.psnr_all = psnr_all self.loss = loss = loss / np.float32( len(images) - conf['context_frames']) summaries.append(tf.scalar_summary(prefix + '_loss', loss)) if 'train_latent_model' in conf: lt_state_cost_accum = 0.0 for i, inf_state, pred_state in zip(range(len(inf_low_state)), inf_low_state[1:], pred_low_state[:-1]): lt_state_cost = mean_squared_error( inf_state, pred_state) * conf['lt_state_factor'] summaries.append( tf.scalar_summary(prefix + '_low_state_cost' + str(i + 1), lt_state_cost)) lt_state_cost_accum += lt_state_cost if not 'joint' in conf: lt_model_var = tf.get_default_graph().get_collection( name=tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/latent_model') train_lt_op = tf.train.AdamOptimizer(self.lr).minimize( lt_state_cost_accum, var_list=lt_model_var) with tf.control_dependencies([train_lt_op]): self.train_op = tf.train.AdamOptimizer( self.lr).minimize(loss) else: loss += lt_state_cost_accum self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) else: self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) self.summ_op = tf.merge_summary(summaries)
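# The video-prediction model above scores each predicted frame with a mean
# squared error and a PSNR. NumPy versions of both, assuming pixel values in
# [0, 1] (so the PSNR peak is 1.0); that range is an assumption, not something
# the snippet states.
import numpy as np

def mean_squared_error(true, pred):
    return np.mean((true - pred) ** 2)

def peak_signal_to_noise_ratio(true, pred, peak=1.0):
    return 10.0 * np.log10(peak ** 2 / mean_squared_error(true, pred))

true = np.random.RandomState(0).rand(8, 8, 3)
pred = np.clip(true + 0.05, 0.0, 1.0)
print(mean_squared_error(true, pred), peak_signal_to_noise_ratio(true, pred))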
def define_graph(self): """ Sets up the model graph in TensorFlow. """ with tf.name_scope('generator'): ## # Data ## with tf.name_scope('data'): self.input_frames_train = tf.placeholder(tf.float32, shape=[ None, self.height_train, self.width_train, 3 * c.HIST_LEN ]) self.gt_frames_train = tf.placeholder( tf.float32, shape=[None, self.height_train, self.width_train, 3]) self.input_frames_test = tf.placeholder(tf.float32, shape=[ None, self.height_test, self.width_test, 3 * c.HIST_LEN ]) self.gt_frames_test = tf.placeholder( tf.float32, shape=[None, self.height_test, self.width_test, 3]) # use variable batch_size for more flexibility self.batch_size_train = tf.shape(self.input_frames_train)[0] self.batch_size_test = tf.shape(self.input_frames_test)[0] ## # Scale network setup and calculation ## self.summaries_train = [] self.scale_preds_train = [] # the generated images at each scale self.scale_gts_train = [] # the ground truth images at each scale self.d_scale_preds = [ ] # the predictions from the discriminator model self.summaries_test = [] self.scale_preds_test = [] # the generated images at each scale self.scale_gts_test = [] # the ground truth images at each scale for scale_num in xrange(self.num_scale_nets): with tf.name_scope('scale_' + str(scale_num)): with tf.name_scope('setup'): ws = [] bs = [] # create weights for kernels for i in xrange(len( self.scale_kernel_sizes[scale_num])): ws.append( w([ self.scale_kernel_sizes[scale_num][i], self.scale_kernel_sizes[scale_num][i], self.scale_layer_fms[scale_num][i], self.scale_layer_fms[scale_num][i + 1] ])) bs.append( b([self.scale_layer_fms[scale_num][i + 1]])) with tf.name_scope('calculation'): def calculate(height, width, inputs, gts, last_gen_frames): # scale inputs and gts scale_factor = 1. / 2**( (self.num_scale_nets - 1) - scale_num) scale_height = int(height * scale_factor) scale_width = int(width * scale_factor) inputs = tf.image.resize_images( inputs, [scale_height, scale_width]) scale_gts = tf.image.resize_images( gts, [scale_height, scale_width]) # for all scales but the first, add the frame generated by the last # scale to the input if scale_num > 0: last_gen_frames = tf.image.resize_images( last_gen_frames, [scale_height, scale_width]) inputs = tf.concat(3, [inputs, last_gen_frames]) # generated frame predictions preds = inputs # perform convolutions with tf.name_scope('convolutions'): for i in xrange( len(self.scale_kernel_sizes[scale_num]) ): # Convolve layer preds = tf.nn.conv2d(preds, ws[i], [1, 1, 1, 1], padding=c.PADDING_G) # Activate with ReLU (or Tanh for last layer) if i == len( self.scale_kernel_sizes[scale_num] ) - 1: preds = tf.nn.tanh(preds + bs[i]) else: preds = tf.nn.relu(preds + bs[i]) return preds, scale_gts ## # Perform train calculation ## # for all scales but the first, add the frame generated by the last # scale to the input if scale_num > 0: last_scale_pred_train = self.scale_preds_train[ scale_num - 1] else: last_scale_pred_train = None # calculate train_preds, train_gts = calculate( self.height_train, self.width_train, self.input_frames_train, self.gt_frames_train, last_scale_pred_train) self.scale_preds_train.append(train_preds) self.scale_gts_train.append(train_gts) # We need to run the network first to get generated frames, run the # discriminator on those frames to get d_scale_preds, then run this # again for the loss optimization. 
if c.ADVERSARIAL: self.d_scale_preds.append( tf.placeholder(tf.float32, [None, 1])) ## # Perform test calculation ## # for all scales but the first, add the frame generated by the last # scale to the input if scale_num > 0: last_scale_pred_test = self.scale_preds_test[ scale_num - 1] else: last_scale_pred_test = None # calculate test_preds, test_gts = calculate( self.height_test, self.width_test, self.input_frames_test, self.gt_frames_test, last_scale_pred_test) self.scale_preds_test.append(test_preds) self.scale_gts_test.append(test_gts) ## # Training ## with tf.name_scope('train'): # global loss is the combined loss from every scale network self.global_loss = combined_loss(self.scale_preds_train, self.scale_gts_train, self.d_scale_preds) self.global_step = tf.Variable(0, trainable=False) self.optimizer = tf.train.AdamOptimizer( learning_rate=c.LRATE_G, name='optimizer') self.train_op = self.optimizer.minimize( self.global_loss, global_step=self.global_step, name='train_op') # train loss summary loss_summary = tf.scalar_summary('train_loss_G', self.global_loss) self.summaries_train.append(loss_summary) ## # Error ## with tf.name_scope('error'): # error computation # get error at largest scale self.psnr_error_train = psnr_error(self.scale_preds_train[-1], self.gt_frames_train) self.sharpdiff_error_train = sharp_diff_error( self.scale_preds_train[-1], self.gt_frames_train) self.psnr_error_test = psnr_error(self.scale_preds_test[-1], self.gt_frames_test) self.sharpdiff_error_test = sharp_diff_error( self.scale_preds_test[-1], self.gt_frames_test) # train error summaries summary_psnr_train = tf.scalar_summary('train_PSNR', self.psnr_error_train) summary_sharpdiff_train = tf.scalar_summary( 'train_SharpDiff', self.sharpdiff_error_train) self.summaries_train += [ summary_psnr_train, summary_sharpdiff_train ] # test error summary_psnr_test = tf.scalar_summary('test_PSNR', self.psnr_error_test) summary_sharpdiff_test = tf.scalar_summary( 'test_SharpDiff', self.sharpdiff_error_test) self.summaries_test += [ summary_psnr_test, summary_sharpdiff_test ] # add summaries to visualize in TensorBoard self.summaries_train = tf.merge_summary(self.summaries_train) self.summaries_test = tf.merge_summary(self.summaries_test)
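# define_graph() builds one sub-network per scale and resizes its input by
# 1 / 2**((num_scale_nets - 1) - scale_num). A quick sketch of the resulting
# resolutions; the 32x32 base size is just an example, not a value from the
# snippet.
def scale_sizes(height, width, num_scales):
    sizes = []
    for scale_num in range(num_scales):
        factor = 1.0 / 2 ** ((num_scales - 1) - scale_num)
        sizes.append((int(height * factor), int(width * factor)))
    return sizes

print(scale_sizes(32, 32, num_scales=4))   # [(4, 4), (8, 8), (16, 16), (32, 32)]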
def _eval_once(saver, summary_writer, rmse_op, summary_op): """Runs Eval once. Args: saver: Saver. summary_writer: Summary writer. rmse_op: rmse_op. summary_op: Summary op. """ with tf.Session() as sess: ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) if ckpt and ckpt.model_checkpoint_path: if os.path.isabs(ckpt.model_checkpoint_path): # Restores from checkpoint with absolute path. saver.restore(sess, ckpt.model_checkpoint_path) else: # Restores from checkpoint with relative path. saver.restore( sess, os.path.join(FLAGS.checkpoint_dir, ckpt.model_checkpoint_path)) # Assuming model_checkpoint_path looks something like: # /my-favorite-path/imagenet_train/model.ckpt-0, # extract global_step from it. global_step = ckpt.model_checkpoint_path.split('/')[-1].split( '-')[-1] print('Succesfully loaded model from %s at step=%s.' % (ckpt.model_checkpoint_path, global_step)) else: print('No checkpoint file found') return # Start the queue runners. coord = tf.train.Coordinator() try: threads = [] for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): threads.extend( qr.create_threads(sess, coord=coord, daemon=True, start=True)) num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size)) # Counts the number of correct predictions. errors = [] total_sample_count = num_iter * FLAGS.batch_size step = 0 print('%s: starting evaluation on (%s).' % (datetime.now(), FLAGS.dataset_path)) start_time = time.time() while step < num_iter and not coord.should_stop(): rmse = sess.run(rmse_op) errors.append(rmse) step += 1 if step % 20 == 0: duration = time.time() - start_time sec_per_batch = duration / 20.0 examples_per_sec = FLAGS.batch_size / sec_per_batch print('%s: [%d batches out of %d] (%.1f examples/sec; %.3f' 'sec/batch)' % (datetime.now(), step, num_iter, examples_per_sec, sec_per_batch)) start_time = time.time() errors = np.vstack(errors).ravel() mean_rmse = errors.mean() auc_at_08 = (errors < .08).mean() auc_at_05 = (errors < .05).mean() ced_image = plot_ced([errors.tolist()]) ced_plot = sess.run( tf.merge_summary( [tf.image_summary('ced_plot', ced_image[None, ...])])) print('Errors', errors.shape) print( '%s: mean_rmse = %.4f, auc @ 0.05 = %.4f, auc @ 0.08 = %.4f [%d examples]' % (datetime.now(), errors.mean(), auc_at_05, auc_at_08, total_sample_count)) summary = tf.Summary() summary.ParseFromString(sess.run(summary_op)) summary.value.add(tag='AUC @ 0.08', simple_value=float(auc_at_08)) summary.value.add(tag='AUC @ 0.05', simple_value=float(auc_at_05)) summary.value.add(tag='Mean RMSE', simple_value=float(mean_rmse)) summary_writer.add_summary(ced_plot, global_step) summary_writer.add_summary(summary, global_step) except Exception as e: # pylint: disable=broad-except coord.request_stop(e) coord.request_stop() coord.join(threads, stop_grace_period_secs=10)
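# _eval_once() reduces the per-example RMSE values to a mean and to the
# fraction of examples under fixed thresholds (logged as 'AUC @ 0.05' and
# 'AUC @ 0.08'). A NumPy sketch of that summary step, using made-up error
# values:
import numpy as np

def summarize_errors(errors, thresholds=(0.05, 0.08)):
    errors = np.asarray(errors, dtype=float).ravel()
    stats = {'mean_rmse': float(errors.mean())}
    for t in thresholds:
        stats['auc@%.2f' % t] = float((errors < t).mean())
    return stats

print(summarize_errors([0.02, 0.04, 0.06, 0.09, 0.12]))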
def main(_): """Build the full graph for feeding inputs, training, and saving checkpoints. Run the training. Then, load the saved graph and run some predictions.""" # Get input data: get the sets of images and labels for training, # validation, and test on MNIST. data_sets = read_data_sets(FLAGS.data_dir, False) mnist_graph = tf.Graph() with mnist_graph.as_default(): # Generate placeholders for the images and labels. images_placeholder = tf.placeholder(tf.float32) labels_placeholder = tf.placeholder(tf.int32) tf.add_to_collection("images", images_placeholder) # Remember this Op. tf.add_to_collection("labels", labels_placeholder) # Remember this Op. # Build a Graph that computes predictions from the inference model. logits = mnist_inference(images_placeholder, HIDDEN1_UNITS) tf.add_to_collection("logits", logits) # Remember this Op. # Add to the Graph the Ops that calculate and apply gradients. train_op, loss = mnist_training(logits, labels_placeholder, 0.01) # prediction accuracy _, indices_op = tf.nn.top_k(logits) flattened = tf.reshape(indices_op, [-1]) correct_prediction = tf.cast(tf.equal(labels_placeholder, flattened), tf.float32) accuracy = tf.reduce_mean(correct_prediction) # Define info to be used by the SummaryWriter. This will let # TensorBoard plot values during the training process. loss_summary = tf.scalar_summary("loss", loss) train_summary_op = tf.merge_summary([loss_summary]) # Add the variable initializer Op. init = tf.initialize_all_variables() # Create a saver for writing training checkpoints. saver = tf.train.Saver() # Create a summary writer. print("Writing Summaries to %s" % FLAGS.model_dir) train_summary_writer = tf.train.SummaryWriter(FLAGS.model_dir) # Run training and save checkpoint at the end. with tf.Session(graph=mnist_graph) as sess: # Run the Op to initialize the variables. sess.run(init) # Start the training loop. for step in xrange(FLAGS.num_steps): # Read a batch of images and labels. images_feed, labels_feed = data_sets.train.next_batch(BATCH_SIZE) # Run one step of the model. The return values are the activations # from the `train_op` (which is discarded) and the `loss` Op. To # inspect the values of your Ops or variables, you may include them # in the list passed to sess.run() and the value tensors will be # returned in the tuple from the call. _, loss_value, tsummary, acc = sess.run( [train_op, loss, train_summary_op, accuracy], feed_dict={ images_placeholder: images_feed, labels_placeholder: labels_feed }) if step % 100 == 0: # Write summary info train_summary_writer.add_summary(tsummary, step) if step % 1000 == 0: # Print loss/accuracy info print('----Step %d: loss = %.4f' % (step, loss_value)) print("accuracy: %s" % acc) print("\nWriting checkpoint file.") checkpoint_file = os.path.join(FLAGS.model_dir, 'checkpoint') saver.save(sess, checkpoint_file, global_step=step) _, loss_value = sess.run( [train_op, loss], feed_dict={ images_placeholder: data_sets.test.images, labels_placeholder: data_sets.test.labels }) print("Test set loss: %s" % loss_value) # Run evaluation based on the saved checkpoint. with tf.Session(graph=tf.Graph()) as sess: checkpoint_file = tf.train.latest_checkpoint(FLAGS.model_dir) print("\nRunning predictions based on saved checkpoint.") print("checkpoint file: {}".format(checkpoint_file)) # Load the saved meta graph and restore variables saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # Retrieve the Ops we 'remembered'. 
logits = tf.get_collection("logits")[0] images_placeholder = tf.get_collection("images")[0] labels_placeholder = tf.get_collection("labels")[0] # Add an Op that chooses the top k predictions. eval_op = tf.nn.top_k(logits) # Run evaluation. images_feed, labels_feed = data_sets.validation.next_batch( EVAL_BATCH_SIZE) prediction = sess.run(eval_op, feed_dict={ images_placeholder: images_feed, labels_placeholder: labels_feed }) for i in range(len(labels_feed)): print("Ground truth: %d\nPrediction: %d" % (labels_feed[i], prediction.indices[i][0]))
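# After restoring the checkpoint, main() retrieves the remembered logits op
# and asks tf.nn.top_k for the top prediction per example. The equivalent
# NumPy operation for a batch of logits, with made-up values:
import numpy as np

def top_k(logits, k=1):
    """Return (values, indices) of the k largest entries per row."""
    indices = np.argsort(logits, axis=1)[:, ::-1][:, :k]
    values = np.take_along_axis(logits, indices, axis=1)
    return values, indices

logits = np.array([[0.1, 2.5, 0.3], [1.0, 0.2, 0.9]])
print(top_k(logits))   # indices give the predicted class per example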
def define_model(self): ''' 定义我的的计算图谱 ''' def model(data_flow, train=True): ''' @data: original inputs @return: logits ''' # Define Convolutional Layers for i, (weights, biases, config) in enumerate(zip(self.conv_weights, self.conv_biases, self.conv_config)): with tf.name_scope(config['name'] + '_model'): with tf.name_scope('convolution'): # default 1,1,1,1 stride and SAME padding data_flow = tf.nn.conv2d(data_flow, filter=weights, strides=[1, 1, 1, 1], padding='SAME') data_flow = data_flow + biases if not train: self.visualize_filter_map(data_flow, how_many=config['out_depth'], display_size=32//(i//2+1), name=config['name']+'_conv') if config['activation'] == 'relu': data_flow = tf.nn.relu(data_flow) if not train: self.visualize_filter_map(data_flow, how_many=config['out_depth'], display_size=32//(i//2+1), name=config['name']+'_relu') else: raise Exception('Activation Func can only be Relu right now. You passed', config['activation']) if config['pooling']: data_flow = tf.nn.max_pool( data_flow, ksize=[1, self.pooling_scale, self.pooling_scale, 1], strides=[1, self.pooling_stride, self.pooling_stride, 1], padding='SAME') if not train: self.visualize_filter_map(data_flow, how_many=config['out_depth'], display_size=32//(i//2+1)//2, name=config['name']+'_pooling') # Define Fully Connected Layers for i, (weights, biases, config) in enumerate(zip(self.fc_weights, self.fc_biases, self.fc_config)): if i == 0: shape = data_flow.get_shape().as_list() data_flow = tf.reshape(data_flow, [shape[0], shape[1] * shape[2] * shape[3]]) with tf.name_scope(config['name'] + 'model'): data_flow = tf.matmul(data_flow, weights) + biases if config['activation'] == 'relu': data_flow = tf.nn.relu(data_flow) elif config['activation'] is None: pass else: raise Exception('Activation Func can only be Relu or None right now. You passed', config['activation']) return data_flow # Training computation. logits = model(self.tf_train_samples) with tf.name_scope('loss'): self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, self.tf_train_labels)) self.train_summaries.append(tf.scalar_summary('Loss', self.loss)) # Optimizer. with tf.name_scope('optimizer'): self.optimizer = tf.train.GradientDescentOptimizer(0.0001).minimize(self.loss) # Predictions for the training, validation, and test data. with tf.name_scope('train'): self.train_prediction = tf.nn.softmax(logits, name='train_prediction') with tf.name_scope('test'): self.test_prediction = tf.nn.softmax(model(self.tf_test_samples, train=False), name='test_prediction') self.merged_train_summary = tf.merge_summary(self.train_summaries) self.merged_test_summary = tf.merge_summary(self.test_summaries)
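# define_model() keeps the spatial size through its stride-1 SAME convolutions
# and shrinks it only in the SAME-padded max-pooling steps. The output size of
# a SAME-padded op is ceil(input_size / stride); a quick sketch, assuming
# 32x32 inputs and a pooling stride of 2 purely for illustration:
import math

def same_output_size(input_size, stride):
    return int(math.ceil(input_size / float(stride)))

size = 32
for block in range(2):
    size = same_output_size(size, stride=1)   # convolution keeps the size
    size = same_output_size(size, stride=2)   # pooling halves it
    print('after block', block + 1, '->', size)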
def train(): args = parse_args() np.random.seed(args.seed) batch_size = args.batch_size n_epochs = args.epochs use_batch_norm = args.use_batch_norm fix_std = args.fix_std plot_every = args.plot_every use_infogan = args.infogan style_size = args.style_size categorical_cardinality = args.categorical_cardinality num_continuous = args.num_continuous generator_desc = args.generator discriminator_desc = args.discriminator if args.dataset is None: assert args.scale_dataset == [28, 28] X = load_mnist_dataset() if args.max_images is not None: X = X[:args.max_images] dataset_name = "mnist" else: scaled_image_width, scaled_image_height = args.scale_dataset # load pngs and jpegs here X = load_image_dataset( args.dataset, desired_width= scaled_image_width, # TODO(jonathan): pick up from generator or add a command line arg (either or)... desired_height=scaled_image_height, value_range=(0.0, 1.0), max_images=args.max_images, force_grayscale=args.force_grayscale) dataset_name = basename(args.dataset.rstrip("/")) if use_infogan: z_size = style_size + sum(categorical_cardinality) + num_continuous sample_noise = create_infogan_noise_sample(categorical_cardinality, num_continuous, style_size) else: z_size = style_size sample_noise = create_gan_noise_sample(style_size) discriminator_lr = tf.get_variable("discriminator_lr", (), initializer=tf.constant_initializer( args.discriminator_lr)) generator_lr = tf.get_variable("generator_lr", (), initializer=tf.constant_initializer( args.generator_lr)) n_images, image_height, image_width, n_channels = X.shape discriminator_lr_placeholder = tf.placeholder(tf.float32, (), name="discriminator_lr") generator_lr_placeholder = tf.placeholder(tf.float32, (), name="generator_lr") assign_discriminator_lr_op = discriminator_lr.assign( discriminator_lr_placeholder) assign_generator_lr_op = generator_lr.assign(generator_lr_placeholder) ## begin model true_images = tf.placeholder(tf.float32, [None, image_height, image_width, n_channels], name="true_images") zc_vectors = tf.placeholder(tf.float32, [None, z_size], name="zc_vectors") is_training_discriminator = tf.placeholder( tf.bool, [], name="is_training_discriminator") is_training_generator = tf.placeholder(tf.bool, [], name="is_training_generator") fake_images = generator_forward(zc_vectors, generator_desc, is_training=is_training_generator, name="generator", debug=True) print("Generator produced images of shape %s" % (fake_images.get_shape()[1:])) print("") discriminator_fake = discriminator_forward( fake_images, discriminator_desc, is_training=is_training_discriminator, name="discriminator", use_batch_norm=use_batch_norm, debug=True) prob_fake = discriminator_fake["prob"] discriminator_true = discriminator_forward( true_images, discriminator_desc, is_training=is_training_discriminator, reuse=True, name="discriminator", use_batch_norm=use_batch_norm) prob_true = discriminator_true["prob"] # discriminator should maximize: ll_believing_fake_images_are_fake = tf.log(1.0 - prob_fake + TINY) ll_true_images = tf.log(prob_true + TINY) discriminator_obj = (tf.reduce_mean(ll_believing_fake_images_are_fake) + tf.reduce_mean(ll_true_images)) # generator should maximize: ll_believing_fake_images_are_real = tf.reduce_mean(tf.log(prob_fake + TINY)) generator_obj = ll_believing_fake_images_are_real discriminator_solver = tf.train.AdamOptimizer( learning_rate=discriminator_lr, beta1=0.5) generator_solver = tf.train.AdamOptimizer(learning_rate=generator_lr, beta1=0.5) discriminator_variables = scope_variables("discriminator") generator_variables 
= scope_variables("generator") train_discriminator = discriminator_solver.minimize( -discriminator_obj, var_list=discriminator_variables) train_generator = generator_solver.minimize(-generator_obj, var_list=generator_variables) discriminator_obj_summary = tf.scalar_summary("discriminator_objective", discriminator_obj) generator_obj_summary = tf.scalar_summary("generator_objective", generator_obj) if use_infogan: categorical_c_vectors = [] offset = 0 for cardinality in categorical_cardinality: categorical_c_vectors.append(zc_vectors[:, offset:offset + cardinality]) offset += cardinality continuous_c_vector = zc_vectors[:, offset:offset + num_continuous] q_output = reconstruct_mutual_info( categorical_c_vectors, continuous_c_vector, categorical_lambda=args.categorical_lambda, continuous_lambda=args.continuous_lambda, fix_std=fix_std, hidden=discriminator_fake["hidden"], is_training=is_training_discriminator, name="mutual_info") mutual_info_objective = q_output["mutual_info"] mutual_info_variables = scope_variables("mutual_info") neg_mutual_info_objective = -mutual_info_objective train_mutual_info = generator_solver.minimize( neg_mutual_info_objective, var_list=generator_variables + discriminator_variables + mutual_info_variables) ll_categorical = q_output["ll_categorical"] ll_continuous = q_output["ll_continuous"] std_contig = q_output["std_contig"] mutual_info_obj_summary = tf.scalar_summary("mutual_info_objective", mutual_info_objective) ll_categorical_obj_summary = tf.scalar_summary( "ll_categorical_objective", ll_categorical) ll_continuous_obj_summary = tf.scalar_summary( "ll_continuous_objective", ll_continuous) std_contig_summary = tf.scalar_summary("std_contig", std_contig) generator_obj_summary = tf.merge_summary([ generator_obj_summary, mutual_info_obj_summary, ll_categorical_obj_summary, ll_continuous_obj_summary, std_contig_summary ]) else: neg_mutual_info_objective = NOOP mutual_info_objective = NOOP train_mutual_info = NOOP ll_categorical = NOOP ll_continuous = NOOP std_contig = NOOP entropy = NOOP log_dir = next_unused_name( join(PROJECT_DIR, "%s_log" % (dataset_name, ), "infogan" if use_infogan else "gan")) journalist = tf.train.SummaryWriter(log_dir, flush_secs=10) print("Saving tensorboard logs to %r" % (log_dir, )) img_summaries = {} if use_infogan: plotter = CategoricalPlotter( categorical_cardinality=categorical_cardinality, num_continuous=num_continuous, style_size=style_size, journalist=journalist, generate=lambda sess, x: sess.run( fake_images, { zc_vectors: x, is_training_discriminator: False, is_training_generator: False })) else: image_placeholder = None plotter = None img_summaries["fake_images"] = tf.image_summary("fake images", fake_images, max_images=10) image_summary_op = tf.merge_summary(list( img_summaries.values())) if len(img_summaries) else NOOP idxes = np.arange(n_images, dtype=np.int32) iters = 0 with tf.Session() as sess: # pleasure sess.run(tf.initialize_all_variables()) # content for epoch in range(n_epochs): disc_epoch_obj = [] gen_epoch_obj = [] infogan_epoch_obj = [] np.random.shuffle(idxes) pbar = create_progress_bar("epoch %d >> " % (epoch, )) for idx in pbar(range(0, n_images, batch_size)): batch = X[idxes[idx:idx + batch_size]] # train discriminator noise = sample_noise(batch_size) _, summary_result1, disc_obj, infogan_obj = sess.run( [ train_discriminator, discriminator_obj_summary, discriminator_obj, neg_mutual_info_objective ], feed_dict={ true_images: batch, zc_vectors: noise, is_training_discriminator: True, is_training_generator: True }) 
            disc_epoch_obj.append(disc_obj)
            if use_infogan:
                infogan_epoch_obj.append(infogan_obj)

            # train generator
            noise = sample_noise(batch_size)
            _, _, summary_result2, gen_obj, infogan_obj = sess.run(
                [
                    train_generator,
                    train_mutual_info,
                    generator_obj_summary,
                    generator_obj,
                    neg_mutual_info_objective
                ],
                feed_dict={
                    zc_vectors: noise,
                    is_training_discriminator: True,
                    is_training_generator: True
                })

            journalist.add_summary(summary_result1, iters)
            journalist.add_summary(summary_result2, iters)
            journalist.flush()

            gen_epoch_obj.append(gen_obj)
            if use_infogan:
                infogan_epoch_obj.append(infogan_obj)

            iters += 1

            if iters % plot_every == 0:
                if use_infogan:
                    plotter.generate_images(sess, 10, iteration=iters)
                else:
                    noise = sample_noise(batch_size)
                    current_summary = sess.run(
                        image_summary_op,
                        {zc_vectors: noise,
                         is_training_discriminator: False,
                         is_training_generator: False})
                    journalist.add_summary(current_summary, iters)
                journalist.flush()

        msg = "epoch %d >> discriminator LL %.2f (lr=%.6f), generator LL %.2f (lr=%.6f)" % (
            epoch, np.mean(disc_epoch_obj), sess.run(discriminator_lr),
            np.mean(gen_epoch_obj), sess.run(generator_lr))
        if use_infogan:
            msg = msg + ", infogan loss %.2f" % (np.mean(infogan_epoch_obj), )
        print(msg)
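# The GAN/InfoGAN snippet above depends on a scope_variables() helper that is
# not shown in this excerpt. A minimal sketch of what it might look like is
# given below; the function name mirrors the call sites above, but the exact
# filtering logic is an assumption, not the original implementation.
import tensorflow as tf

def scope_variables(scope_name):
    # Collect every trainable variable created under `scope_name`
    # (e.g. "generator", "discriminator", "mutual_info").
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_name)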
with open(meta_fname, 'w') as meta_file:
    meta_file.write("Meta-information\n")
    meta_file.write("Label: {0}\n".format(FLAGS.label))
    if FLAGS.lstm:
        meta_file.write("LSTM\n")
    else:
        meta_file.write("CNN\n")
    meta_file.write("\nFlags:\n")
    meta_file.write(flags_to_string())

# Summaries for loss and accuracy
loss_summary = tf.scalar_summary("loss", model.loss)
acc_summary = tf.scalar_summary("accuracy", model.accuracy)

# Train summaries
train_summary_op = tf.merge_summary([loss_summary, acc_summary])
train_summary_dir = os.path.join(out_dir_full, "summaries", "train")
train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

# Dev summaries
dev_summary_dir = os.path.join(out_dir_full, "summaries", "dev")
dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

# Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
checkpoint_dir = os.path.abspath(os.path.join(out_dir_full, "checkpoints"))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
saver = tf.train.Saver(tf.all_variables())

# Initialize all variables and override pre-computed embeddings
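# Note: tf.scalar_summary, tf.merge_summary, and tf.train.SummaryWriter are the
# pre-1.0 summary API. On TensorFlow 1.x the same wiring goes through the
# tf.summary module; a minimal mapping is sketched below (the helper name and
# its arguments are illustrative, not part of the original script):
import tensorflow as tf

def build_train_summaries(loss, accuracy, summary_dir, graph):
    # tf.scalar_summary      -> tf.summary.scalar
    # tf.merge_summary       -> tf.summary.merge
    # tf.train.SummaryWriter -> tf.summary.FileWriter
    loss_summary = tf.summary.scalar("loss", loss)
    acc_summary = tf.summary.scalar("accuracy", accuracy)
    train_summary_op = tf.summary.merge([loss_summary, acc_summary])
    writer = tf.summary.FileWriter(summary_dir, graph)
    return train_summary_op, writer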
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # Calculate the gradients for each model tower.
        tower_grads = []
        for i in xrange(FLAGS.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                    # Calculate the loss for one tower of the CIFAR model. This function
                    # constructs the entire CIFAR model but shares the variables across
                    # all towers.
                    loss = tower_loss(scope)

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

                    # Calculate the gradients for the batch of data on this CIFAR tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
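# tower_loss() and average_gradients() belong to the surrounding CIFAR-10
# multi-GPU script and are not reproduced in this excerpt. For reference, one
# common shape of average_gradients() is sketched below, using the pre-1.0
# tf.concat(dim, values) argument order to match the era of this code; treat it
# as an illustration, not the original implementation.
import tensorflow as tf

def average_gradients(tower_grads):
    # tower_grads is a list over towers; each element is a list of
    # (gradient, variable) pairs, in the same variable order for every tower.
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Average one variable's gradient across all towers.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(0, grads), 0)
        # The variable is shared across towers, so the first tower's pointer
        # is representative.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads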
def main(_):
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = FLAGS.train_image_size or network_fn.default_image_size
            image = image_preprocessing_fn(image, train_image_size, train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - FLAGS.labels_offset)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)

            #############################
            # Specify the loss function #
            #############################
            if 'AuxLogits' in end_points:
                slim.losses.softmax_cross_entropy(
                    end_points['AuxLogits'], labels,
                    label_smoothing=FLAGS.label_smoothing, weight=0.4,
                    scope='aux_loss')
            slim.losses.softmax_cross_entropy(
                logits, labels, label_smoothing=FLAGS.label_smoothing, weight=1.0)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.histogram_summary('activations/' + end_point, x))
            summaries.add(tf.scalar_summary('sparsity/' + end_point,
                                            tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.scalar_summary('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.histogram_summary(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.scalar_summary('learning_rate', learning_rate,
                                            name='learning_rate'))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        # optimize_clones() computes the total loss and the per-variable
        # gradients across all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.scalar_summary('total_loss', total_loss, name='total_loss'))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.merge_summary(list(summaries), name='summary_op')

        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None)
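# _configure_learning_rate(), _configure_optimizer() and
# _get_variables_to_train() are helpers of the surrounding training script and
# are not shown here. As an illustration of the last one, a sketch that limits
# training to a comma-separated FLAGS.trainable_scopes list (an assumed flag,
# mirroring TF-slim's train_image_classifier.py) could look like this:
import tensorflow as tf

def _get_variables_to_train():
    # Train everything when no scope restriction is given.
    if FLAGS.trainable_scopes is None:
        return tf.trainable_variables()
    scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]
    variables_to_train = []
    for scope in scopes:
        # Pick up the trainable variables created under each listed scope.
        variables_to_train.extend(
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope))
    return variables_to_train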
    concat_inputs)
pred_vals = tf.cast(tf.argmax(unnormalized_probs, dimension=1), tf.int32)
normalized_probs = tf.nn.softmax(unnormalized_probs)
one_hot_gold = tf.one_hot(gold_label_placeholder, 2)

l2_loss = 0
for weight in rnn_weights + final_weights:
    l2_loss += tf.nn.l2_loss(weight)

loss = tf.reduce_mean(-tf.log(tf.reduce_sum(tf.mul(normalized_probs, one_hot_gold), 1)))
accuracy = tf.reduce_sum(tf.cast(tf.equal(gold_label_placeholder, pred_vals), tf.float32)) / \
    tf.cast(batch_size_placeholder, tf.float32)

loss_summary = tf.scalar_summary("loss", loss)
acc_summary = tf.scalar_summary("acc", accuracy)
summaries = tf.merge_summary([loss_summary, acc_summary])

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)

### Actually running stuff

# Train step.
with tf.Session(graph=graph) as session:
    session.run(tf.initialize_all_variables())
    train_writer = tf.train.SummaryWriter("performance/train")
    dev_writer = tf.train.SummaryWriter("performance/dev")
    valid_writer = tf.train.SummaryWriter("performance/valid")

    step_num = 0
    keep_training = True
    max_acc = 0
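# The loss above recomputes a softmax and then takes its log, which can
# underflow when the gold class receives a probability near zero. A numerically
# safer variant, sketched with the pre-1.0 positional signature used elsewhere
# in this document, feeds the unnormalized logits into TensorFlow's fused op.
# The names unnormalized_probs and gold_label_placeholder are reused from the
# snippet above purely for illustration (the labels must be integer class ids):
import tensorflow as tf

stable_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(
        unnormalized_probs, gold_label_placeholder))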
def run(self):
    inputs = tf.nn.embedding_lookup(self.word_embedding, self.x)
    prob = self.model(inputs)

    with tf.name_scope('loss'):
        cost = -tf.reduce_mean(self.y * tf.log(prob))
        reg, variables = tf.nn.l2_loss(self.word_embedding), ['softmax']
        for vari in variables:
            reg += tf.nn.l2_loss(self.weights[vari]) + \
                tf.nn.l2_loss(self.biases[vari])
        cost += reg * self.l2_reg

    with tf.name_scope('train'):
        global_step = tf.Variable(0, name="tr_global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate).minimize(cost, global_step=global_step)

    with tf.name_scope('predict'):
        correct_pred = tf.equal(tf.argmax(prob, 1), tf.argmax(self.y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        correct_num = tf.reduce_sum(tf.cast(correct_pred, tf.int32))

    with tf.name_scope('summary'):
        localtime = time.strftime("%X %Y-%m-%d", time.localtime())
        Summary_dir = 'Summary/' + localtime

        info = 'batch-{}, lr-{}, kb-{}, l2_reg-{}'.format(
            self.batch_size, self.learning_rate, self.Keep_Prob, self.l2_reg)
        info = info + '\ntrain_file_path:' + self.train_file_path \
            + '\ntest_index:' + str(self.test_index) \
            + '\nembedding_type:' + str(self.embedding_type) \
            + '\nMethod: Emotion_GRU'

        summary_acc = tf.scalar_summary('ACC ' + info, accuracy)
        summary_loss = tf.scalar_summary('LOSS ' + info, cost)
        summary_op = tf.merge_summary([summary_loss, summary_acc])

        test_acc = tf.placeholder(tf.float32)
        test_loss = tf.placeholder(tf.float32)
        summary_test_acc = tf.scalar_summary('ACC ' + info, test_acc)
        summary_test_loss = tf.scalar_summary('LOSS ' + info, test_loss)
        summary_test = tf.merge_summary([summary_test_loss, summary_test_acc])

        train_summary_writer = tf.train.SummaryWriter(Summary_dir + '/train')
        test_summary_writer = tf.train.SummaryWriter(Summary_dir + '/test')

    with tf.name_scope('saveModel'):
        saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
        save_dir = 'Models/' + localtime + '/'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    with tf.name_scope('readData'):
        print '----------{}----------'.format(time.strftime("%Y-%m-%d %X", time.localtime()))
        tr_x, tr_y, tr_doc_len, te_x, te_y, te_doc_len, ev_x, ev_y, ev_doc_len = \
            load_data_for_Emotion_CNN(
                self.train_file_path, self.word_id_mapping,
                self.max_doc_len, self.test_index, self.n_class)
        print 'train docs: {} test docs: {}'.format(len(tr_y), len(te_y))
        print 'training_iter:', self.training_iter
        print info
        print '----------{}----------'.format(time.strftime("%Y-%m-%d %X", time.localtime()))

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Pass the config so the allow_growth setting actually takes effect
    # (the original built the config but never handed it to the session).
    with tf.Session(config=config) as sess:
        sess.run(tf.initialize_all_variables())
        max_acc, bestIter = 0., 0

        def test():
            acc, loss, cnt = 0., 0., 0
            for test, num in self.get_batch_data(te_x, te_y, te_doc_len, 20, keep_prob=1.0):
                _loss, _acc = sess.run([cost, correct_num], feed_dict=test)
                acc += _acc
                loss += _loss * num
                cnt += num
            loss = loss / cnt
            acc = acc / cnt
            return loss, acc

        def new_test():
            feed_dict = {
                self.x: ev_x,
                self.doc_len: ev_doc_len,
                self.keep_prob: 1.0,
            }
            y_true = ev_y
            y_pred_p = sess.run(prob, feed_dict=feed_dict)
            # y_pred = np.ceil(y_pred_p - 1.0 / 8)
            y_pred = calibrated_label_ranking(y_pred_p)
            Emotion_eval(y_true, y_pred, y_pred_p)

        if self.training_iter == 0:
            saver.restore(sess, 'Models/10:01:44 2017-03-11/-856')
            loss, acc = test()
            print loss, acc
            new_test()

        for i in xrange(self.training_iter):
            for train, _ in self.get_batch_data(tr_x, tr_y, tr_doc_len,
                                                self.batch_size, self.Keep_Prob):
                _, step, summary, loss, acc = sess.run(
                    [optimizer, global_step, summary_op, cost, accuracy],
                    feed_dict=train)
                train_summary_writer.add_summary(summary, step)
                print 'Iter {}: mini-batch loss={:.6f}, acc={:.6f}'.format(step, loss, acc)

            if i % self.display_step == 0:
                loss, acc = test()
                if acc > max_acc:
                    max_acc = acc
                    bestIter = step
                    saver.save(sess, save_dir, global_step=step)
                    new_test()
                summary = sess.run(summary_test, feed_dict={
                    test_loss: loss, test_acc: acc})
                test_summary_writer.add_summary(summary, step)
                print '----------{}----------'.format(time.strftime("%Y-%m-%d %X", time.localtime()))
                print 'Iter {}: test loss={:.6f}, test acc={:.6f}'.format(step, loss, acc)
                print 'round {}: max_acc={} BestIter={}\n'.format(i, max_acc, bestIter)

        print 'Optimization Finished!'
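# The loop above saves its best-performing snapshot under save_dir. A short,
# hedged example of loading that snapshot back for stand-alone evaluation
# (assuming the same graph and saver have been rebuilt in the current session,
# and save_dir is the directory used by saver.save above):
import tensorflow as tf

def restore_best(sess, saver, save_dir):
    # Look up the newest checkpoint recorded in save_dir's checkpoint state file.
    ckpt = tf.train.latest_checkpoint(save_dir)
    if ckpt is not None:
        saver.restore(sess, ckpt)
    return ckpt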
def training_loop(graph, train_dir, num_training_steps=10000,
                  summary_frequency=10, steps_to_average=20):
    """A generator which runs training steps at each output.

    Args:
        graph: A tf.Graph object containing the model.
        train_dir: A string path to the directory to write training checkpoints
            and summary events.
        num_training_steps: Generator terminates after this many steps.
        summary_frequency: How many training iterations to run per generator
            iteration.
        steps_to_average: Average accuracy has a moving window. This is the size
            of that window.

    Yields:
        A dict of training metrics, and runs summary_frequency training steps
        between each yield.
    """
    cross_entropy = graph.get_collection('cross_entropy')[0]
    log_perplexity = graph.get_collection('log_perplexity')[0]
    accuracy = graph.get_collection('accuracy')[0]
    global_step = graph.get_collection('global_step')[0]
    learning_rate = graph.get_collection('learning_rate')[0]
    training_op = graph.get_collection('training_op')[0]

    checkpoint_file = os.path.join(train_dir, 'basic_rnn.ckpt')

    with graph.as_default():
        summary_op = tf.merge_summary([
            tf.scalar_summary('cross_entropy_loss', cross_entropy),
            tf.scalar_summary('log_perplexity', log_perplexity),
            tf.scalar_summary('learning_rate', learning_rate),
            tf.scalar_summary('accuracy', accuracy),
            tf.scalar_summary('global_step', global_step)
        ])
        saver = tf.train.Saver()
        init_op = tf.initialize_all_variables()

    # Run training loop.
    session = tf.Session(graph=graph)
    summary_writer = tf.train.SummaryWriter(train_dir, session.graph)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=session, coord=coord)

    step = 0
    gs = 0
    logging.info('Starting training loop')
    try:
        accuracies = collections.deque(maxlen=steps_to_average)
        session.run(init_op)
        while gs < num_training_steps:
            step += 1
            ce, lp, a, gs, lr, serialized_summaries, _ = session.run([
                cross_entropy, log_perplexity, accuracy, global_step,
                learning_rate, summary_op, training_op
            ])

            summary_writer.add_summary(serialized_summaries, global_step=gs)
            accuracies.append(a)

            if step % summary_frequency == 0:
                saved_path = saver.save(session, checkpoint_file, global_step=gs)
                logging.info('Wrote checkpoint to %s', saved_path)
                summary_writer.flush()

                avg_accuracy = sum(accuracies) / len(accuracies)
                logging.info(
                    'Global Step: %s - Loss: %.3f - '
                    'Log-perplexity: %.3f - Step Accuracy: %.2f - '
                    'Avg Accuracy (last %d summaries): %.2f - '
                    'Learning Rate: %f',
                    '{:,}'.format(gs), ce, lp, a, steps_to_average, avg_accuracy, lr)

                yield {
                    'step': step,
                    'global_step': gs,
                    'loss': ce,
                    'log_perplexity': lp,
                    'accuracy': a,
                    'average_accuracy': avg_accuracy,
                    'learning_rate': lr
                }

        saver.save(session, train_dir, global_step=gs)
    except tf.errors.OutOfRangeError as e:
        logging.warn('Got error reported to coordinator: %s', e)
    finally:
        try:
            coord.request_stop()
            summary_writer.close()
        except RuntimeError as e:
            logging.warn('Got runtime error: %s', e)
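# training_loop() is a generator, so a caller drives it by iterating; each
# iteration runs summary_frequency training steps and hands back the latest
# metrics. A minimal, hypothetical driver is sketched below (train_graph and
# the train_dir path are assumed to be set up elsewhere):
for metrics in training_loop(train_graph, '/tmp/basic_rnn_train',
                             num_training_steps=10000):
    if metrics['average_accuracy'] > 0.99:
        # Stop early once the moving-average accuracy is good enough.
        break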