def after_run(self, run_context, run_values): predictions_val = run_values.results['predictions'] labels_val = run_values.results['labels'] step = run_values.results['step'] hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) self.writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), step) self.writer.add_summary(utils.MakeSummary("model/Training_Perr", perr), step) self.writer.add_summary(utils.MakeSummary("model/Training_GAP", gap), step)
def model_fn(features, labels, mode, params): is_training = mode == learn.ModeKeys.TRAIN optimizer_class = find_class_by_name(params.optimizer, [tf.train]) label_loss_fn = find_class_by_name(params.label_loss, [losses])() model = find_class_by_name(params.model, [frame_level_models, video_level_models])() global_step = tf.train.get_or_create_global_step() learning_rate = tf.train.exponential_decay( params.base_learning_rate, global_step * params.batch_size * params.num_towers, params.learning_rate_decay_examples, params.learning_rate_decay, staircase=True, ) tf.summary.scalar('learning_rate', learning_rate) optimizer = optimizer_class(learning_rate) tf.summary.histogram("model/input_raw", features['model_input']) feature_dim = len(features['model_input'].get_shape()) - 1 model_input = tf.nn.l2_normalize(features['model_input'], feature_dim) tower_inputs = tf.split(model_input, params.num_towers) if mode == learn.ModeKeys.INFER: # *** # this is a quick hack so that the existing model_fn code, # taken from train.py, doesn't break in inference (or serving) mode. # Normally, we would write model_fn such that the 'labels' input arg # can be None in inference mode, but this existing model code was not written this # way. See the serving_input_fn() defined below, to see where 'labels_batch' # is added to the features dict, just to make this code work properly labels = features['labels_batch'] tower_labels = tf.split(labels, params.num_towers) tower_num_frames = tf.split(features['num_frames'], params.num_towers) tower_gradients = [] tower_predictions = [] tower_label_losses = [] tower_reg_losses = [] for i in range(params.num_towers): # For some reason these 'with' statements can't be combined onto the same # line. They have to be nested. with tf.device(params.device_string % i): with (tf.variable_scope(("tower"), reuse=True if i > 0 else None)): with (slim.arg_scope([slim.model_variable, slim.variable], device="/cpu:0" if params.num_gpus != 1 else "/gpu:0")): result = model.create_model( tower_inputs[i], num_frames=tower_num_frames[i], vocab_size=params.reader.num_classes, labels=tower_labels[i], is_training=is_training) for variable in slim.get_model_variables(): tf.summary.histogram(variable.op.name, variable) predictions = result["predictions"] tower_predictions.append(predictions) if "loss" in result.keys(): label_loss = result["loss"] else: label_loss = label_loss_fn.calculate_loss( predictions, tower_labels[i]) if "regularization_loss" in result.keys(): reg_loss = result["regularization_loss"] else: reg_loss = tf.constant(0.0) reg_losses = tf.losses.get_regularization_losses() if reg_losses: reg_loss += tf.add_n(reg_losses) tower_reg_losses.append(reg_loss) # Adds update_ops (e.g., moving average updates in batch normalization) as # a dependency to the train_op. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) if "update_ops" in result.keys(): update_ops += result["update_ops"] if update_ops: with tf.control_dependencies(update_ops): barrier = tf.no_op(name="gradient_barrier") with tf.control_dependencies([barrier]): label_loss = tf.identity(label_loss) tower_label_losses.append(label_loss) final_loss = params.regularization_penalty * reg_loss + label_loss gradients = optimizer.compute_gradients( final_loss, colocate_gradients_with_ops=False) tower_gradients.append(gradients) pred_dict = {} label_loss = tf.reduce_mean(tf.stack(tower_label_losses)) predictions = tf.concat(tower_predictions, 0) pred_dict['predictions'] = predictions tf.summary.scalar("label_loss", label_loss) if params.regularization_penalty != 0: reg_loss = tf.reduce_mean(tf.stack(tower_reg_losses)) tf.summary.scalar("reg_loss", reg_loss) if is_training: # Incorporate the L2 weight penalties, etc. merged_gradients = utils.combine_gradients(tower_gradients) if params.clip_gradient_norm > 0: with tf.name_scope('clip_grads'): merged_gradients = utils.clip_gradient_norms( merged_gradients, params.clip_gradient_norm) train_op = optimizer.apply_gradients(merged_gradients, global_step=global_step) else: train_op = None eval_metric_ops = {} if mode == learn.ModeKeys.EVAL or is_training: eval_metric_ops['hit_at_one'] = metrics.streaming_mean( tf.py_func( lambda x, y: np.float32(eval_util.calculate_hit_at_one(x, y)), [predictions, labels], tf.float32, stateful=False, )) eval_metric_ops['perr'] = metrics.streaming_mean( tf.py_func( lambda x, y: np.float32( eval_util.calculate_precision_at_equal_recall_rate(x, y)), [predictions, labels], tf.float32, stateful=False, )) eval_metric_ops['gap'] = metrics.streaming_mean( tf.py_func( lambda x, y: np.float32(eval_util.calculate_gap(x, y)), [predictions, labels], tf.float32, stateful=False, )) else: pass top_predictions, top_indices = tf.nn.top_k(predictions, _TOP_PREDICTIONS_IN_OUTPUT) pred_dict['top_predictions'] = top_predictions pred_dict['top_indices'] = top_indices #add eval summaries and update ops for training for key, val in eval_metric_ops.items(): tf.summary.scalar(key, val[0]) #create summary for each eval op tf.add_to_collection( tf.GraphKeys.UPDATE_OPS, val[1] ) # add the update op for each eval up to update ops collection, so that it will be run every train_op call # tf.add_to_collection("global_step", global_step) # tf.add_to_collection("loss", label_loss) tf.add_to_collection("predictions", tf.concat(tower_predictions, 0)) # tf.add_to_collection("input_batch_raw", model_input_raw) # tf.add_to_collection("input_batch", model_input) # tf.add_to_collection("num_frames", num_frames) tf.add_to_collection("labels", tf.cast(labels, tf.float32)) # tf.add_to_collection("train_op", train_op) tf.summary.scalar("loss", label_loss) export_outputs = { tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.estimator.export.PredictOutput(pred_dict) } return tf.estimator.EstimatorSpec(mode=mode, predictions=pred_dict, loss=label_loss, train_op=train_op, export_outputs=export_outputs, eval_metric_ops=eval_metric_ops)
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error("Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() if FLAGS.ema_source: # Here the variables still exsist ema_op = tf.get_collection("ema_op")[0] def_vars = tf.get_collection("updatable_vars") ema_vars = tf.get_collection("ema_vars") sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: if FLAGS.ema_source: logging.info("%s: Entering training loop.", task_as_string(self.task)) sess.graph._unsafe_unfinalize() ckpt_reader = pywrap_tensorflow.NewCheckpointReader(FLAGS.ema_source) for xtensor, ematensor in zip(def_vars, ema_vars): src_tensor = ckpt_reader.get_tensor(xtensor.name.split(":")[0]) # Loading does not take up graoh space xtensor.load(src_tensor, session=sess) ematensor.load(src_tensor, session=sess) # sess.run(tf.assign(xtensor, src_tensor)) # sess.run(tf.assign(ematensor, src_tensor)) try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) if FLAGS.ema_source: # Update EMA if needed _ = sess.run(ema_op) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 10 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: # self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: save_name = "{0}/model.ckpt".format(self.train_dir) saver.save(sess, save_name, global_step_val) logging.info("Final model export.") logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() latest_checkpoint, meta_filename = self.get_meta_filename( start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: if not FLAGS.change_file: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) if FLAGS.change_file: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=60 * FLAGS.time_to_save_model, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) logging.info("TANG:restoring") saver.restore(sess, latest_checkpoint) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[ 0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % FLAGS.eval_loop == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ( (self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model() global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while not sv.should_stop(): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time if self.is_master: examples_per_second = labels_val.shape[ 0] / seconds_per_batch hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) logging.info( "%s: training step " + str(global_step_val) + "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) + " Loss: " + str(loss_val), task_as_string(self.task)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] input_batch_raw = tf.get_collection("input_batch_raw")[0] input_batch = tf.get_collection("input_batch")[0] model_input_raw_ph = tf.get_collection("model_input_raw_ph")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] num_frames = tf.get_collection("num_frames")[0] num_frames_ph = tf.get_collection("num_frames_ph")[0] learning_rate = tf.get_collection("learning_rate")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() model_input_raw_val, num_frames_val, learning_rate_val = sess.run( [input_batch_raw, num_frames, learning_rate]) pr_feature = [] pr_num = [] for i in range(model_input_raw_val.shape[0]): if num_frames_val[i] / FLAGS.max_scene <= 2: num_tmp = num_frames_val[i] * np.ceil( 1 + FLAGS.max_scene / num_frames_val[i]) input_tmp = model_input_raw_val[ i][:num_frames_val[i]] input_tmp = np.repeat( input_tmp, np.ceil(1 + FLAGS.max_scene / num_frames_val[i]), 0) else: num_tmp = num_frames_val[i] input_tmp = model_input_raw_val[i][:num_tmp] numvec = (input_tmp[1:] * input_tmp[:-1]).sum(axis=1) / (np.sqrt( (input_tmp[1:]**2).sum(1)) * (np.sqrt( (input_tmp[:-1]**2).sum(1)))) idx = np.sort( numvec.argpartition(FLAGS.max_scene - 1)[:FLAGS.max_scene - 1] + 1) example_splits = np.split(input_tmp, idx, 0) example_splits_mean = [ np.mean(example_split, 0) for example_split in example_splits ] example_splits_mean = np.stack(example_splits_mean, 0) pr_num.append(FLAGS.max_scene) pr_feature.append(example_splits_mean) pr_feature = np.stack(pr_feature, 0) pr_num = np.stack(pr_num, 0) _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels], feed_dict={ model_input_raw_ph: pr_feature, num_frames_ph: pr_num }) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[ 0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 10 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ( (self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() # meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: # if meta_filename: # saver = self.recover_model(meta_filename) with tf.device(device_fn): # if not meta_filename: self.build_model(self.model, self.reader, self.downloader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] # init_op = tf.global_variables_initializer() # saver = tf.train.Saver() # config = tf.ConfigProto( allow_soft_placement=True, log_device_placement=False) config.gpu_options.allow_growth = True with tf.Session(config=config,graph=graph) as sess: enqueue_thread = threading.Thread(target=self.downloader.enqueuing, args=(sess,), kwargs={'num_epochs': FLAGS.num_epochs}) enqueue_thread.daemon = True enqueue_thread.start() latest_checkpoint = tf.train.latest_checkpoint(self.train_dir) if latest_checkpoint: logging.info("Loading checkpoint for train: " + latest_checkpoint) # Restores from checkpoint saver.restore(sess, latest_checkpoint) # Assuming model_checkpoint_path looks something like: # /my-favorite-path/yt8m_train/model.ckpt-0, extract global_step from it. # global_step_val = latest_checkpoint.split("/")[-1].split("-")[-1] else: logging.info("No checkpoint file found.") sess.run([tf.local_variables_initializer()]) logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % self.disp_batches == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.4f" % hit_at_one) + " PERR: " + ("%.4f" % perr) + " GAP: " + ("%.4f" % gap)) # sv.summary_writer.add_summary( # utils.MakeSummary("model/Training_Hit@1", hit_at_one), # global_step_val) # sv.summary_writer.add_summary( # utils.MakeSummary("model/Training_Perr", perr), global_step_val) # sv.summary_writer.add_summary( # utils.MakeSummary("model/Training_GAP", gap), global_step_val) # sv.summary_writer.add_summary( # utils.MakeSummary("global_step/Examples/Second", # examples_per_second), global_step_val) # sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, saver, self.train_dir, sess) self.last_model_export_step = global_step_val # else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.4f" % loss_val) + " Examples/sec: " + ("%.4f" % examples_per_second)) self.downloader.stop() logging.info('Donwloader stopped') logging.info("%s: Exited training loop.", task_as_string(self.task))
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model() global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() if FLAGS.dropout: keep_prob_tensor = tf.get_collection("keep_prob")[0] if FLAGS.noise_level > 0: noise_level_tensor = tf.get_collection("noise_level")[0] if FLAGS.reweight: weights_input, weights_assignment = None, None if len(tf.get_collection("weights_input")) > 0: weights_input = tf.get_collection("weights_input")[0] weights_assignment = tf.get_collection( "weights_assignment")[0] sv = tf.train.Supervisor(graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=6 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: # re-assign weights if FLAGS.reweight: optional_assign_weights(sess, weights_input, weights_assignment) try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while not sv.should_stop(): batch_start_time = time.time() custom_feed = {} if FLAGS.dropout: custom_feed[keep_prob_tensor] = FLAGS.keep_prob if FLAGS.noise_level > 0: custom_feed[noise_level_tensor] = FLAGS.noise_level _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels], feed_dict=custom_feed) seconds_per_batch = time.time() - batch_start_time if self.is_master: examples_per_second = labels_val.shape[ 0] / seconds_per_batch hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) recall = "N/A" if False: recall = eval_util.calculate_recall_at_n( predictions_val, labels_val, FLAGS.recall_at_n) sv.summary_writer.add_summary( utils.MakeSummary( "model/Training_Recall@%d" % FLAGS.recall_at_n, recall), global_step_val) recall = "%.2f" % recall gap = eval_util.calculate_gap(predictions_val, labels_val) logging.info( "%s: training step " + str(global_step_val) + "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) + " Recall@%d: " % FLAGS.recall_at_n + recall + " Loss: " + str(loss_val), task_as_string(self.task)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] input_batch_raw = tf.get_collection("input_batch_raw")[0] is_neg = tf.get_collection("is_negative")[0] init_op = tf.global_variables_initializer() if FLAGS.model == "EmbeddingModel": hidden_layer_activations = tf.get_collection("hidden_layer_activations")[0] sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) batch_counter = 0 while (not sv.should_stop()) and (not self.max_steps_reached): batch_counter += 1 batch_start_time = time.time() if FLAGS.model == "EmbeddingModel": _, global_step_val, loss_val, predictions_val, labels_val, input_batch_raw_val, embeddings, is_neg_val = sess.run( [train_op, global_step, loss, predictions, labels, input_batch_raw, hidden_layer_activations, is_neg]) else: _, global_step_val, loss_val, predictions_val, labels_val, input_batch_raw_val = sess.run( [train_op, global_step, loss, predictions, labels, input_batch_raw]) seconds_per_batch = time.time() - batch_start_time if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master: k = 10 examples_per_second = labels_val.shape[0] / seconds_per_batch predictions_val = predictions_val[:, 0:4716] hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) if FLAGS.model == "EmbeddingModel" \ "": logging.info(is_neg_val[1]) hit_emb = eval_util.calculate_hit_at_k_embedding(embeddings, k) logging.info(numpy.sum(numpy.multiply(embeddings[1, 0:FLAGS.embedding_size], embeddings[1, FLAGS.embedding_size:2 * FLAGS.embedding_size]))) logging.info("%s Training step " + str(global_step_val) + "| Hit@1: " + ("%.2f" % hit_at_one) + " HitEmbedding@" + ("%.0f: " % k) + ( "%.2f" % hit_emb) + " GAP: " + ("%.2f" % gap) + " Loss: " + str(loss_val), task_as_string(self.task)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_HitEmbedding@10", hit_emb), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val if FLAGS.model == "EmbeddingModel": if FLAGS.image_server & (batch_counter == 9000): pred_audio = np.asarray(predictions_val[1, 0:FLAGS.embedding_size]) pred_frames = np.asarray(predictions_val[1, FLAGS.embedding_size:2 * FLAGS.embedding_size]) # plt.bar(range(1, 129), pred_audio / np.linalg.norm(pred_audio)) # plt.savefig("embedding_audio2.png") # plt.cla() # plt.bar(range(1, 129), pred_frames / np.linalg.norm(pred_frames)) # plt.savefig("embedding_frames2.png") # plt.cla() # plt.bar(range(1, 129), # pred_frames / np.linalg.norm(pred_frames) - pred_audio / np.linalg.norm(pred_audio)) # plt.savefig("embedding_diferencia2.png") # plt.cla() # logging.info("Imatges guardades") # Exporting the final model if self.is_master: self.export_model(global_step_val, sv.saver, sv.save_path, sess) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop() print("Hem acabat")
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error("Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 10 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if file_io.file_exists(flags_json_path): existing_flags = json.load( file_io.FileIO(flags_json_path, mode="r")) if existing_flags != model_flags_dict: logging.error( "Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with file_io.FileIO(flags_json_path, mode="w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model() global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() if FLAGS.dropout: keep_prob_tensor = tf.get_collection("keep_prob")[0] if FLAGS.noise_level > 0: noise_level_tensor = tf.get_collection("noise_level")[0] if FLAGS.reweight: weights_input, weights_assignment = None, None if len(tf.get_collection("weights_input")) > 0: weights_input = tf.get_collection("weights_input")[0] weights_assignment = tf.get_collection( "weights_assignment")[0] sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=FLAGS.keep_checkpoint_interval * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: # re-assign weights if FLAGS.reweight: optional_assign_weights(sess, weights_input, weights_assignment) steps = 0 try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while not sv.should_stop(): steps += 1 batch_start_time = time.time() custom_feed = {} if FLAGS.dropout: custom_feed[keep_prob_tensor] = FLAGS.keep_prob if FLAGS.noise_level > 0: custom_feed[noise_level_tensor] = FLAGS.noise_level _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels], feed_dict=custom_feed) seconds_per_batch = time.time() - batch_start_time if self.is_master: examples_per_second = labels_val.shape[ 0] / seconds_per_batch hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) recall = "N/A" if False: recall = eval_util.calculate_recall_at_n( predictions_val, labels_val, FLAGS.recall_at_n) sv.summary_writer.add_summary( utils.MakeSummary( "model/Training_Recall@%d" % FLAGS.recall_at_n, recall), global_step_val) recall = "%.2f" % recall gap = eval_util.calculate_gap(predictions_val, labels_val) logging.info( "%s: training step " + str(global_step_val) + "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) + " Recall@%d: " % FLAGS.recall_at_n + recall + " Loss: " + str(loss_val), task_as_string(self.task)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() if FLAGS.max_steps is not None and steps > FLAGS.max_steps: logging.info( "%s: Done training -- max_steps limit reached.", task_as_string(self.task)) break except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) logging.info( '############## PARAMETERS ##############################') logging.info("feature_names {}".format(FLAGS.feature_names)) logging.info("feature_sizes {}".format(FLAGS.feature_sizes)) logging.info("frame_features {}".format(FLAGS.frame_features)) logging.info("model {}".format(FLAGS.model)) logging.info("start_new_model {}".format(FLAGS.start_new_model)) logging.info("num_gpu {}".format(FLAGS.num_gpu)) logging.info("batch_size {}".format(FLAGS.batch_size)) logging.info("label_loss {}".format(FLAGS.label_loss)) logging.info("regularization_penalty {}".format( FLAGS.regularization_penalty)) logging.info("base_learning_rate {}".format(FLAGS.base_learning_rate)) logging.info("learning_rate_decay {}".format( FLAGS.learning_rate_decay)) logging.info("learning_rate_decay_examples {}".format( FLAGS.learning_rate_decay_examples)) logging.info("num_epochs {}".format(FLAGS.num_epochs)) logging.info("max_steps {}".format(FLAGS.max_steps)) logging.info("export_model_steps {}".format(FLAGS.export_model_steps)) logging.info("num_readers {}".format(FLAGS.num_readers)) logging.info("optimizer {}".format(FLAGS.optimizer)) logging.info("clip_gradient_norm {}".format(FLAGS.clip_gradient_norm)) logging.info( '########################################################') logging.info(' '.join([x for x in sys.argv])) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error( "Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: with tf.device(device_fn): saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op, init_fn = None, None if meta_filename: saver = tf.train.Saver(tf.global_variables(), max_to_keep=0, keep_checkpoint_every_n_hours=0.25) def init_fn(sess): return saver.restore(sess, meta_filename) else: init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(graph, logdir=self.train_dir, init_op=init_op, init_fn=init_fn, is_chief=self.is_master, global_step=global_step, save_model_secs=40 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[ 0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 10 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ( (self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def main(): env = json.loads(os.environ.get("TF_CONFIG", "{}")) task_data = env.get("task", None) or {"type": "master", "index": 0} task = type("TaskSpec", (object, ), task_data) logging.set_verbosity(tf.logging.INFO) logging.info("%s: Tensorflow version: %s.", task_as_string(task), tf.__version__) video_ids, video_features, video_labels, video_frames = gen_input( data_pattern, reader_batch_size=reader_batch_size, num_classes=num_classes, num_readers=num_readers, mini_batch_size=mini_batch_size) result = gen_model(model_input=video_features, vocab_size=num_classes, labels=video_labels, num_frames=video_frames) predictions = result["predictions"] global_step = tf.Variable(0, trainable=False, name="global_step") label_loss = label_loss_fn.calculate_loss(predictions, video_labels) if "regularization_loss" in result.keys(): reg_loss = result["regularization_loss"] else: reg_loss = tf.constant(0.0) reg_losses = tf.losses.get_regularization_losses() if reg_losses: reg_loss += tf.add_n(reg_losses) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) if "update_ops" in result.keys(): update_ops += result["update_ops"] if update_ops: with tf.control_dependencies(update_ops): barrier = tf.no_op(name="gradient_barrier") with tf.control_dependencies([barrier]): label_loss = tf.identity(label_loss) final_loss = regularization_penalty * reg_loss + label_loss learning_rate = tf.train.exponential_decay(base_learning_rate, global_step * mini_batch_size * num_towers, learning_rate_decay_examples, learning_rate_decay, staircase=True) tf.summary.scalar('learning_rate', learning_rate) optimizer = optimizer_class(learning_rate) gradients = optimizer.compute_gradients(final_loss, colocate_gradients_with_ops=False) tf.summary.scalar("label_loss", label_loss) tf.summary.scalar("reg_loss", reg_loss) if clip_gradient_norm > 0: with tf.name_scope('clip_grads'): gradients = utils.clip_gradient_norms(gradients, clip_gradient_norm) train_op = optimizer.apply_gradients(gradients, global_step=global_step) with tf.Session() as sess: tf.global_variables_initializer().run() tf.local_variables_initializer().run() #init_local_op = tf.local_variables_initializer() #sess.run(init_local_op) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(coord=coord) total_step = 0 try: while total_step < 100000: batch_start_time = time.time() # v_ids, v_features, v_labels, v_frames = sess.run([video_ids, video_features, video_labels, video_frames]) _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [ train_op, global_step, label_loss, predictions, tf.cast(video_labels, tf.float32) ]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch # if max_steps <= global_step_val: # max_steps_reached = True # print(v_features.shape) # print(v_ids) if total_step % 10 == 0: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) total_step = total_step + 1 except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(task)) coord.request_stop() coord.join(threads)
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] restart_learning_rate = tf.get_collection( "restart_learning_rate")[0] layers_keep_probs = tf.get_collection("layers_keep_probs")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] if FLAGS.use_ema == True: ema_op = tf.get_collection("ema_op")[0] e_loss = tf.get_collection("e_loss")[0] e_labels = tf.get_collection("e_labels")[0] e_predictions = tf.get_collection("e_predictions")[0] init_op = tf.global_variables_initializer() restart_op = tf.assign(restart_learning_rate, FLAGS.restart_learning_rate) # getting a proper number of keep_prob parameters for dropout # max is 10 and we have to pad the vector with 1s # not the nicest solution, but works tmp_layers = [] if FLAGS.layers_keep_probs is not None: tmp_layers = [ float(x) for x in FLAGS.layers_keep_probs.replace( ' ', '').split(',') ] tmp_layers_padded = tmp_layers + [ 1.0 for x in range(10 - len(tmp_layers)) ] with tf.variable_scope("tower", reuse=True) as scope: keep_op = tf.assign(layers_keep_probs, tmp_layers_padded) sv = tf.train.Supervisor(graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=FLAGS.save_model_minutes * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: if FLAGS.restart_learning_rate > 0.0: sess.run(restart_op) logging.info("restart learning rate: %f\n" % FLAGS.restart_learning_rate) if FLAGS.layers_keep_probs != "1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0": logging.info("============") sess.run(keep_op) logging.info("layers keep probabilites: %s" % FLAGS.layers_keep_probs) logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[ 0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 50 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.4f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) #also do eval e_loss_val, e_predictions_val, e_labels_val = sess.run( [e_loss, e_predictions, e_labels]) e_hit_at_one = eval_util.calculate_hit_at_one( e_predictions_val, e_labels_val) e_perr = eval_util.calculate_precision_at_equal_recall_rate( e_predictions_val, e_labels_val) e_gap = eval_util.calculate_gap( e_predictions_val, e_labels_val) logging.info("training step " + str(global_step_val) + " | eLoss: " + ("%.2f" % e_loss_val) + " |eHit@1: " + ("%.2f" % e_hit_at_one) + " ePERR: " + ("%.2f" % e_perr) + " eGAP: " + ("%.4f" % e_gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Eval_Hit@1", e_hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Eval_Perr", e_perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Eval_GAP", e_gap), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ( (self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() meta_filename = [] for filename in self.train_dir.split(','): logging.info("filename:%s", str(filename)) meta_filename.append( self.get_meta_filename(start_new_model, filename)) label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])() optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train]) local_device_protos = device_lib.list_local_devices() gpus = [x.name for x in local_device_protos if x.device_type == 'GPU'] num_gpus = len(gpus) if num_gpus > 0: logging.info("Using the following GPUs to train: " + str(gpus)) num_towers = num_gpus device_string = '/gpu:%d' else: logging.info("No GPUs found. Training on CPU.") num_towers = 1 device_string = '/cpu:%d' # build_graph_retrain( # reader=self.reader, # model=self.model, # train_data_pattern=FLAGS.train_data_pattern, # label_loss_fn=label_loss_fn, # num_readers=FLAGS.num_readers, # batch_size=FLAGS.batch_size) # with tf.variable_scope("net2"): #### global_step = tf.Variable(0, trainable=False, name="global_step") learning_rate = tf.train.exponential_decay( FLAGS.base_learning_rate, global_step * FLAGS.batch_size * num_towers, FLAGS.learning_rate_decay_examples, FLAGS.learning_rate_decay, staircase=True) tf.summary.scalar('learning_rate', learning_rate) video_id_batch, model_input_raw, labels_batch, num_frames = get_input_data_tensors( # pylint: disable=g-line-too-long self.reader, FLAGS.train_data_pattern, batch_size=FLAGS.batch_size, num_readers=FLAGS.num_readers) tf.summary.histogram("model_input_raw", model_input_raw) feature_dim = len(model_input_raw.get_shape()) - 1 # Normalize input features. model_input = tf.nn.l2_normalize(model_input_raw, feature_dim) # with tf.variable_scope("net1"): with tf.variable_scope("tower"): result1 = self.model[0].create_model( model_input, num_frames=num_frames, vocab_size=self.reader.num_classes, is_training=False) ##### result1 = tf.stop_gradient(result1) result2 = self.model[1].create_model( model_input, num_frames=num_frames, vocab_size=self.reader.num_classes, labels=labels_batch, is_training=False) result2 = tf.stop_gradient(result2) all_vars = tf.global_variables() # for v in all_vars: # print v.name # for i in v_vars: # logging.info(str(i)) for i, v in enumerate(all_vars): logging.info(str(v.name)) if 'rnn' in v.name: vars1 = all_vars[:i] vars2 = all_vars[i:] break # v_vars0 = [v for v in all_vars if v.name == 'tower/input_bn/beta:0' # or v.name == 'tower/input_bn/gamma:0' # or v.name == 'tower/input_bn/beta:0' # or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/biases:0'] # v_vars = [v for v in all_vars if v.name == 'tower/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights:0' # or v.name == 'tower/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/biases:0' # or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/weights:0' # or v.name == 'tower/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/biases:0'] result1 = tf.nn.l2_normalize(result1, dim=1) result2 = tf.nn.l2_normalize(result2, dim=1) embeddings = tf.concat([result1, result2], axis=1) model_concat = find_class_by_name('MoeModel', [video_level_models])() result = model_concat.create_model( embeddings, vocab_size=self.reader.num_classes, num_mixtures=4) predictions = result["predictions"] # predictions=(result1["predictions"]+result2["predictions"])/2 tf.summary.histogram("model_activations", predictions) # if "loss" in result.keys(): # label_loss = result["loss"] # else: label_loss = label_loss_fn.calculate_loss(predictions, labels_batch) tf.summary.scalar("label_loss", label_loss) if "regularization_loss" in result.keys(): reg_loss = result["regularization_loss"] reg_losses = tf.losses.get_regularization_losses() if "regularization_loss" in result.keys(): reg_loss = result["regularization_loss"] else: reg_loss = tf.constant(0.0) final_loss = FLAGS.regularization_penalty * reg_loss + label_loss optimizer = optimizer_class(learning_rate) gradients = optimizer.compute_gradients( final_loss, colocate_gradients_with_ops=False) with tf.name_scope('clip_grads'): merged_gradients = utils.clip_gradient_norms(gradients, 1.0) train_op = optimizer.apply_gradients(merged_gradients, global_step=global_step) tf.add_to_collection("global_step", global_step) tf.add_to_collection("loss", label_loss) tf.add_to_collection("predictions", predictions) tf.add_to_collection("input_batch", model_input) tf.add_to_collection("video_id_batch", video_id_batch) tf.add_to_collection("num_frames", num_frames) tf.add_to_collection("labels", tf.cast(labels_batch, tf.float32)) tf.add_to_collection("summary_op", tf.summary.merge_all()) tf.add_to_collection("train_op", train_op) video_id_batch = tf.get_collection("video_id_batch")[0] prediction_batch = tf.get_collection("predictions")[0] label_batch = tf.get_collection("labels")[0] loss = tf.get_collection("loss")[0] summary_op = tf.get_collection("summary_op")[0] # saver = tf.train.Saver(tf.global_variables()) # saver=tf.train.Saver(result1) summary_writer = tf.summary.FileWriter( FLAGS.ensemble_dir, graph=tf.get_default_graph()) config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: train_dirs = FLAGS.train_dir.split(',') latest_checkpoint0 = tf.train.latest_checkpoint(train_dirs[0]) latest_checkpoint1 = tf.train.latest_checkpoint(train_dirs[1]) sess.run(tf.global_variables_initializer()) if latest_checkpoint0: logging.info("Loading checkpoint for eval: " + latest_checkpoint0) saver1 = tf.train.Saver(vars1) saver1.restore(sess, latest_checkpoint0) if latest_checkpoint1: saver2 = tf.train.Saver(vars2) logging.info("Loading checkpoint for eval: " + latest_checkpoint1) saver2.restore(sess, latest_checkpoint1) saver = tf.train.Saver() fetches = [ learning_rate, global_step, train_op, video_id_batch, prediction_batch, label_batch, loss, summary_op ] coord = tf.train.Coordinator() threads = [] for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS): threads.extend( qr.create_threads(sess, coord=coord, daemon=True, start=True)) while not coord.should_stop(): # batch_start_time = time.time() learning_rate_val, global_step_val, _, vid_val, predictions_val, labels_val, loss_val, summary_val = sess.run( fetches) # hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) # perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val, # labels_val) # gap = eval_util.calculate_gap(predictions_val, labels_val) # logging.info( "training step " + str(global_step_val)+" | Loss: " + ("%.2f" % loss_val) +" | Hit@1: " + # ("%.4f" % hit_at_one) + " PERR: " + ("%.4f" % perr) + # " GAP: " + ("%.4f" % gap)) if self.is_master and global_step_val % self.disp_batches == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + "| learning rate: " + ("%.4f" % learning_rate_val) + " | Loss: " + ("%.2f" % loss_val) + " | Hit@1: " + ("%.4f" % hit_at_one) + " PERR: " + ("%.4f" % perr) + " GAP: " + ("%.4f" % gap)) summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) summary_writer.add_summary( utils.MakeSummary("model/loss", loss_val), global_step_val) summary_writer.add_summary( utils.MakeSummary("model/lr", learning_rate_val), global_step_val) summary_writer.flush() if global_step_val % FLAGS.export_model_steps == 0: saver.save(sess, FLAGS.ensemble_dir, global_step=global_step_val) coord.request_stop() coord.join(threads, stop_grace_period_secs=10)
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, } flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error("Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] labels_origin = tf.get_collection("original_labels")[0] sampling_distribution = tf.get_collection("sampling_probs")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, #save_model_secs=15 * 60, save_model_secs=int(FLAGS.save_checkpoint_every_n_hour * 3600), #save_summaries_secs=120, save_summaries_secs=int(FLAGS.save_checkpoint_every_n_hour * 3600), saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val, labels_origin_val, sampling_distribution_val = sess.run( [train_op, global_step, loss, predictions, labels, labels_origin, sampling_distribution]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch ## check on the correctness of label sampling #if not np.all(labels_val[0]==labels_origin_val[0]): # print(np.where(labels_val[0])[0], " --- ", np.where(labels_origin_val[0])[0]) #print(np.all(labels_val == labels_origin_val, 1).mean()) #temp_bool = np.sum(labels_val, 1)<1 #if np.any(temp_bool): # print("\n\n\nFOUND!!", np.where(labels_val[temp_bool])[0], "\n\n\n") #print(sampling_distribution_val[:5, :5]) if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True #if self.is_master and global_step_val % 10 == 0 and self.train_dir: if self.is_master and global_step_val % FLAGS.validate_every_n_training_steps == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_origin_val) perr = eval_util.calculate_precision_at_equal_recall_rate(predictions_val, labels_origin_val) gap = eval_util.calculate_gap(predictions_val, labels_origin_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() with open(FLAGS.train_dir + '/global_step_{%d}_training_GAP_{%.6f}.txt' % (global_step_val, gap), 'w') as f: f.write('\n') # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: #logging.info("training step " + str(global_step_val) + " | Loss: " + #("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) continue except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor( graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master: examples_per_second = labels_val.shape[0] / seconds_per_batch hit_at_one = eval_util.calculate_hit_at_one(predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) logging.info( "%s: training step " + str(global_step_val) + "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) + " Loss: " + str(loss_val), task_as_string(self.task)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ((self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val # Exporting the final model if self.is_master: self.export_model(global_step_val, sv.saver, sv.save_path, sess) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def train_loop(train_dir=None, saver=None, is_chief=True, master="", start_supervisor_services=True): """Performs training on the currently defined tensorflow graph. Args: train_dir: Where to save the model checkpoints. saver: The class to use for serializing the graph variables. is_chief: Whether this worker is the primary worker (which is responsible for writing checkpoints and summaries), or an anonymous member of the flock. master: Which Tensorflow master to listen to. start_supervisor_services: Whether to start threads for writing summaries and checkpoints. Returns: A tuple of the training Hit@1 and the training PERR. """ global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] sv = tf.train.Supervisor(logdir=train_dir, is_chief=is_chief, global_step=global_step, save_model_secs=60, save_summaries_secs=60, saver=saver) sess = sv.prepare_or_wait_for_session( master, start_standard_services=start_supervisor_services, config=tf.ConfigProto(log_device_placement=False)) logging.info("prepared session") sv.start_queue_runners(sess) logging.info("started queue runners") try: logging.info("entering training loop") while not sv.should_stop(): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[0] / seconds_per_batch hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) logging.info("training step " + str(global_step_val) + "| Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap) + " Loss: " + str(loss_val)) if is_chief and global_step_val % 10 == 0 and train_dir: sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() except tf.errors.OutOfRangeError: logging.info("Done training -- epoch limit reached") logging.info("exited training loop") sv.Stop() return hit_at_one, perr
def inference(reader, train_dir, data_pattern, out_file_location, batch_size, top_k): with tf.Session() as sess, gfile.Open(out_file_location, "w+") as out_file: video_id_batch, video_batch, labels_batch, num_frames_batch = get_input_data_tensors(reader, data_pattern, batch_size) latest_checkpoint = tf.train.latest_checkpoint(train_dir) if latest_checkpoint is None: raise Exception("unable to find a checkpoint at location: %s" % train_dir) else: if FLAGS.check_point < 0: meta_graph_location = latest_checkpoint + ".meta" else: meta_graph_location = FLAGS.train_dir + "/model.ckpt-" + str(FLAGS.check_point) + ".meta" latest_checkpoint = FLAGS.train_dir + "/model.ckpt-" + str(FLAGS.check_point) logging.info("loading meta-graph: " + meta_graph_location) saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True) logging.info("restoring variables from " + latest_checkpoint) saver.restore(sess, latest_checkpoint) input_tensor = tf.get_collection("input_batch_raw")[0] num_frames_tensor = tf.get_collection("num_frames")[0] predictions_tensor = tf.get_collection("predictions")[0] # Workaround for num_epochs issue. def set_up_init_ops(variables): init_op_list = [] for variable in list(variables): if "train_input" in variable.name: init_op_list.append(tf.assign(variable, 1)) variables.remove(variable) init_op_list.append(tf.variables_initializer(variables)) return init_op_list sess.run(set_up_init_ops(tf.get_collection_ref( tf.GraphKeys.LOCAL_VARIABLES))) coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) num_examples_processed = 0 start_time = time.time() sum_hit_at_one=0 try: while not coord.should_stop(): video_id_batch_val, video_batch_val,label_val, num_frames_batch_val = sess.run([video_id_batch, video_batch, labels_batch, num_frames_batch]) predictions_val, = sess.run([predictions_tensor], feed_dict={input_tensor: video_batch_val, num_frames_tensor: num_frames_batch_val}) now = time.time() hit_at_one = eval_util.calculate_hit_at_one(predictions_val,label_val) num_examples_processed += len(video_batch_val) num_classes = predictions_val.shape[1] sum_hit_at_one += hit_at_one * len(video_batch_val) logging.info("num examples processed: " + str(num_examples_processed) + " elapsed seconds: " + "{0:.2f}".format(now-start_time)) logging.info('------------------final performance----------------'+("%.2f" % hit_at_one)) for line in format_lines(video_id_batch_val, predictions_val, top_k,label_val): out_file.write(line) out_file.flush() except tf.errors.OutOfRangeError: logging.info('Done with inference. The output file was written to ' + out_file_location) accuracy_performance=sum_hit_at_one / num_examples_processed logging.info('------------------final performance----------------'+("%.2f" % accuracy_performance)) finally: coord.request_stop() coord.join(threads) sess.close()
def run(self, start_new_model=False): """Performs training on the currently defined Tensorflow graph. Returns: A tuple of the training Hit@1 and the training PERR. """ if self.is_master and start_new_model: self.remove_training_directory(self.train_dir) if not os.path.exists(self.train_dir): os.makedirs(self.train_dir) model_flags_dict = { "model": FLAGS.model, "feature_sizes": FLAGS.feature_sizes, "feature_names": FLAGS.feature_names, "frame_features": FLAGS.frame_features, "label_loss": FLAGS.label_loss, "netvlad_cluster_size": FLAGS.netvlad_cluster_size, "netvlad_hidden_size": FLAGS.netvlad_hidden_size, "moe_l2": FLAGS.moe_l2, "iterations": FLAGS.iterations, "netvlad_relu": FLAGS.netvlad_relu, "gating": FLAGS.gating, "moe_num_mixtures": FLAGS.moe_num_mixtures, "moe_prob_gating": FLAGS.moe_prob_gating, } # model_flags_dict = {} # for k in FLAGS.__flags: # model_flags_dict[k] = FLAGS[k].value flags_json_path = os.path.join(FLAGS.train_dir, "model_flags.json") if os.path.exists(flags_json_path): existing_flags = json.load(open(flags_json_path)) if existing_flags != model_flags_dict: logging.error( "Model flags do not match existing file %s. Please " "delete the file, change --train_dir, or pass flag " "--start_new_model", flags_json_path) logging.error("Ran model with flags: %s", str(model_flags_dict)) logging.error("Previously ran with flags: %s", str(existing_flags)) exit(1) else: # Write the file. with open(flags_json_path, "w") as fout: fout.write(json.dumps(model_flags_dict)) target, device_fn = self.start_server_if_distributed() meta_filename = self.get_meta_filename(start_new_model, self.train_dir) with tf.Graph().as_default() as graph: if meta_filename: saver = self.recover_model(meta_filename) with tf.device(device_fn): if not meta_filename: saver = self.build_model(self.model, self.reader) global_step = tf.get_collection("global_step")[0] loss = tf.get_collection("loss")[0] predictions = tf.get_collection("predictions")[0] labels = tf.get_collection("labels")[0] train_op = tf.get_collection("train_op")[0] init_op = tf.global_variables_initializer() sv = tf.train.Supervisor(graph, logdir=self.train_dir, init_op=init_op, is_chief=self.is_master, global_step=global_step, save_model_secs=15 * 60, save_summaries_secs=120, saver=saver) logging.info("%s: Starting managed session.", task_as_string(self.task)) with sv.managed_session(target, config=self.config) as sess: try: logging.info("%s: Entering training loop.", task_as_string(self.task)) while (not sv.should_stop()) and (not self.max_steps_reached): batch_start_time = time.time() _, global_step_val, loss_val, predictions_val, labels_val = sess.run( [train_op, global_step, loss, predictions, labels]) seconds_per_batch = time.time() - batch_start_time examples_per_second = labels_val.shape[ 0] / seconds_per_batch if self.max_steps and self.max_steps <= global_step_val: self.max_steps_reached = True if self.is_master and global_step_val % 10 == 0 and self.train_dir: eval_start_time = time.time() hit_at_one = eval_util.calculate_hit_at_one( predictions_val, labels_val) perr = eval_util.calculate_precision_at_equal_recall_rate( predictions_val, labels_val) gap = eval_util.calculate_gap(predictions_val, labels_val) eval_end_time = time.time() eval_time = eval_end_time - eval_start_time logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second) + " | Hit@1: " + ("%.2f" % hit_at_one) + " PERR: " + ("%.2f" % perr) + " GAP: " + ("%.2f" % gap)) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Hit@1", hit_at_one), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_Perr", perr), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("model/Training_GAP", gap), global_step_val) sv.summary_writer.add_summary( utils.MakeSummary("global_step/Examples/Second", examples_per_second), global_step_val) sv.summary_writer.flush() # Exporting the model every x steps time_to_export = ( (self.last_model_export_step == 0) or (global_step_val - self.last_model_export_step >= self.export_model_steps)) if self.is_master and time_to_export: self.export_model(global_step_val, sv.saver, sv.save_path, sess) self.last_model_export_step = global_step_val else: logging.info("training step " + str(global_step_val) + " | Loss: " + ("%.2f" % loss_val) + " Examples/sec: " + ("%.2f" % examples_per_second)) except tf.errors.OutOfRangeError: logging.info("%s: Done training -- epoch limit reached.", task_as_string(self.task)) logging.info("%s: Exited training loop.", task_as_string(self.task)) sv.Stop()
def train(): with tf.Graph().as_default(), tf.device('/cpu:0'): global_step = tf.Variable(0, trainable=False) video_id, labels, rgb, audio, num_frames = readers.input(True) coord = tf.train.Coordinator() lr = tf.train.exponential_decay(FLAGS.lr, global_step, FLAGS.decay_steps, FLAGS.learning_decay_rate, staircase=True) tf.summary.scalar('learning_rate', lr) opt = tf.train.AdamOptimizer(lr, beta1=FLAGS.beta1, beta2=FLAGS.beta2, epsilon=1e-08, use_locking=False, name='Adam') grads = inference(rgb, audio, num_frames, label=labels, train=True) loss = tf.get_collection("loss")[0] predict = tf.get_collection("predict")[0] tvars = tf.trainable_variables() for var in tvars: tf.summary.histogram(var.op.name, var) for grad, var in grads: print var.op.name if grad is not None and type(grad) is not tf.IndexedSlices: tf.summary.histogram(var.op.name + '/gradients', grad) elif type(grad) is tf.IndexedSlices: print "This is a indexslice gradient" print grad.dense_shape else: print "There is a None gradient" apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) variable_averages = tf.train.ExponentialMovingAverage( FLAGS.moving_average_decay, global_step) variables_averages_op = variable_averages.apply( tf.trainable_variables()) train_op = tf.group(apply_gradient_op, variables_averages_op) saver = tf.train.Saver(tf.global_variables()) summary_op = tf.summary.merge_all() init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True) config.intra_op_parallelism_threads = 10 config.inter_op_parallelism_threads = 16 sess = tf.Session(config=config) sess.run(init) tf.train.start_queue_runners(sess=sess, coord=coord) summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) if not FLAGS.new_model: saver.restore(sess, ckpt.model_checkpoint_path) #loader.restore(sess, ckpt.model_checkpoint_path) for step in xrange(FLAGS.max_steps): start_time = time.time() _, loss_value, predict_value, labels_value, num_frames_value = sess.run( [train_op, loss, predict, labels, num_frames]) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) hit_at_one = eval_util.calculate_hit_at_one( predict_value, labels_value) perr = eval_util.calculate_precision_at_equal_recall_rate( predict_value, labels_value) gap = eval_util.calculate_gap(predict_value, labels_value) format_str = ( '%s: step %d, loss = %.2f, hit@one = %.2f, perr = %.2f, gap = %.2f, (%.1f examples/sec; %.3f ' 'sec/batch)') print(format_str % (datetime.now(), step, loss_value, hit_at_one, perr, gap, examples_per_sec, sec_per_batch)) if step % 100 == 0: summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) summary_writer.add_summary( utils.MakeSummary("Hit@1", hit_at_one), step) summary_writer.add_summary(utils.MakeSummary("Perr", perr), step) summary_writer.add_summary(utils.MakeSummary("Gap", gap), step) summary_writer.add_summary( utils.MakeSummary("example per second", examples_per_sec), step) if (step % 1000 == 0 and step != 0) or (step + 1) == FLAGS.max_steps: checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') saver.save(sess, checkpoint_path, global_step=step)