def estimator_dump(self):
    """In estimator mode.

    estim_spec = tf.estimator.EstimatorSpec(training_hooks=[estimator_dump()])
    :return: A DumpingDebugHook that writes debug dumps to cfg.TF_DEBUG_DUMP_DIR.
    """
    from tensorflow.python import debug as tf_debug
    self._init()
    return tf_debug.DumpingDebugHook(cfg.TF_DEBUG_DUMP_DIR)
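A minimal usage sketch matching the docstring above, assuming `model` is an instance of the class that defines estimator_dump() and that build_graph() returns a loss and train op; these names are illustrative placeholders, not part of the source:

def model_fn(features, labels, mode):
    # Hypothetical model_fn: `model` and build_graph() are assumed placeholders.
    loss, train_op = model.build_graph(features, labels)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        # Attach the dumping debug hook returned by estimator_dump().
        training_hooks=[model.estimator_dump()])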
def main(unused_argv):
  del unused_argv  # Unused

  tf.logging.set_verbosity(tf.logging.INFO)

  assert FLAGS.seq_len > 0
  assert FLAGS.perm_size > 0

  FLAGS.n_token = data_utils.VOCAB_SIZE
  tf.logging.info("n_token {}".format(FLAGS.n_token))

  if not tf.gfile.Exists(FLAGS.model_dir):
    tf.gfile.MakeDirs(FLAGS.model_dir)

  # Get train input function
  train_input_fn, train_record_info_dict = get_input_fn("train")
  tf.logging.info("num of batches {}".format(
      train_record_info_dict["num_batch"]))

  # Get train cache function
  train_cache_fn = get_cache_fn(FLAGS.mem_len)

  ##### Get model function
  model_fn = get_model_fn()

  ##### Create TPUEstimator
  # TPU Configuration
  run_config = model_utils.configure_tpu(FLAGS)

  # TPU Estimator
  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      train_cache_fn=train_cache_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      params={"track_mean": FLAGS.track_mean},
      train_batch_size=FLAGS.train_batch_size,
      eval_on_tpu=FLAGS.use_tpu)

  hooks = None
  if FLAGS.debug:
    if FLAGS.debug_dump_dir:
      hooks = [tf_debug.DumpingDebugHook(FLAGS.debug_dump_dir)]
    else:
      hooks = [tf_debug.LocalCLIDebugHook()]

  #### Training
  estimator.train(input_fn=train_input_fn,
                  max_steps=FLAGS.train_steps,
                  hooks=hooks)
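The flags consumed above (FLAGS.debug, FLAGS.debug_dump_dir, FLAGS.seq_len, ...) are declared elsewhere in the training script; a minimal sketch of how the two debug-related flags might be defined with absl.flags, purely as an assumption:

from absl import flags

# Assumed flag definitions; the names mirror the FLAGS used in main() above.
flags.DEFINE_bool("debug", False, "Attach a TFDBG hook to estimator.train().")
flags.DEFINE_string("debug_dump_dir", "",
                    "If set, dump debug data here with DumpingDebugHook; "
                    "otherwise fall back to the interactive LocalCLIDebugHook.")

FLAGS = flags.FLAGS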
def process(self, query):
  """Returns the visualizations for query.

  Args:
    query: The query to process.

  Returns:
    A dictionary of results with processing and graph visualizations.
  """
  tf.logging.info("Processing new query [%s]" % query)

  # Create the new TFDBG hook directory.
  hook_dir = "/tmp/t2t_server_dump/request_%d" % int(time.time())
  os.makedirs(hook_dir)
  hooks = [tfdbg.DumpingDebugHook(hook_dir, watch_fn=topk_watch_fn)]

  # TODO(kstevens): This is extremely hacky and slow for responding to
  # queries.  Figure out a reasonable way to pre-load the model weights before
  # forking and run queries through the estimator quickly.
  def server_input_fn():
    """Generator that returns just the current query."""
    for _ in range(1):
      input_ids = self.source_vocab.encode(query)
      input_ids.append(text_encoder.EOS_ID)
      x = [1, 100, len(input_ids)] + input_ids
      x += [0] * (self.const_array_size - len(x))
      d = {
          "inputs": np.array(x).astype(np.int32),
          "problem_choice": np.array(0).astype(np.int32)
      }
      yield d

  def input_fn():
    """Generator that returns just the current query."""
    gen_fn = decoding.make_input_fn_from_generator(server_input_fn())
    example = gen_fn()
    # TODO(kstevens): Make this method public
    # pylint: disable=protected-access
    return decoding._interactive_input_tensor_to_features_dict(
        example, self.hparams)

  # Make the prediction for the current query.
  result_iter = self.estimator.predict(input_fn, hooks=hooks)
  result = None
  for result in result_iter:
    break

  # Extract the beam search information by reading the dumped TFDBG event
  # tensors.  We first read and record the per-step beam sequences, then record
  # the beam scores.  Afterwards we align the two sets of values to create the
  # full graph vertices and edges.
  decoding_graph = graph.Graph()
  run_dirs = sorted(glob.glob(os.path.join(hook_dir, "run_*")))
  for run_dir in run_dirs:
    # Record the different completed and active beam sequence ids.
    alive_sequences = deque()
    finished_sequences = deque()

    # Make the root vertex since it always needs to exist.
    decoding_graph.get_vertex(sequence_key([0]))

    # Create the initial vertices and edges for the active and finished
    # sequences.  We uniquely define each vertex using its full sequence path
    # as a string to ensure there are no collisions when the same step has two
    # instances of an output id.
    dump_dir = tfdbg.DebugDumpDir(run_dir, validate=False)
    seq_datums = dump_dir.find(predicate=seq_filter)
    for seq_datum in seq_datums:
      sequences = np.array(seq_datum.get_tensor()).astype(int)[0]
      if "alive" in seq_datum.node_name:
        alive_sequences.append(sequences)
      if "finished" in seq_datum.node_name:
        finished_sequences.append(sequences)

      for sequence in sequences:
        pieces = self.targets_vocab.decode_list(sequence)
        index = sequence[-1]
        if index == 0:
          continue

        parent = decoding_graph.get_vertex(sequence_key(sequence[:-1]))
        current = decoding_graph.get_vertex(sequence_key(sequence))

        edge = decoding_graph.add_edge(parent, current)
        edge.data["label"] = pieces[-1]
        edge.data["label_id"] = index
        # Coerce the type to be a python bool.  Numpy bools can't be easily
        # converted to JSON.
        edge.data["completed"] = bool(index == 1)

    # Examine the score results and store the scores with the associated edges
    # in the graph.  We fetch the vertices (and relevant edges) by looking
    # into the saved beam sequences stored above.
    score_datums = dump_dir.find(predicate=scores_filter)
    for score_datum in score_datums:
      if "alive" in score_datum.node_name:
        sequences = alive_sequences.popleft()

      if "finished" in score_datum.node_name:
        sequences = finished_sequences.popleft()

      scores = np.array(score_datum.get_tensor()).astype(float)[0]
      for i, score in enumerate(scores):
        sequence = sequences[i]
        if sequence[-1] == 0:
          continue

        vertex = decoding_graph.get_vertex(sequence_key(sequence))
        edge = decoding_graph.edges[vertex.in_edges[0]]
        edge.data["score"] = score
        edge.data["log_probability"] = score
        edge.data["total_log_probability"] = score

  # Delete the hook dir to save disk space.
  shutil.rmtree(hook_dir)

  # Create the graph visualization data structure.
  graph_vis = {
      "visualization_name": "graph",
      "title": "Graph",
      "name": "graph",
      "search_graph": decoding_graph.to_dict(),
  }

  # Create the processing visualization data structure.
  # TODO(kstevens): Make this method public
  # pylint: disable=protected-access
  output_ids = decoding._save_until_eos(result["outputs"].flatten(), False)
  output_pieces = self.targets_vocab.decode_list(output_ids)
  output_token = [{"text": piece} for piece in output_pieces]
  output = self.targets_vocab.decode(output_ids)

  source_steps = [{
      "step_name": "Initial",
      "segment": [{
          "text": query
      }],
  }]

  target_steps = [{
      "step_name": "Initial",
      "segment": output_token,
  }, {
      "step_name": "Final",
      "segment": [{
          "text": output
      }],
  }]

  processing_vis = {
      "visualization_name": "processing",
      "title": "Processing",
      "name": "processing",
      "query_processing": {
          "source_processing": source_steps,
          "target_processing": target_steps,
      },
  }

  return {
      "result": [processing_vis, graph_vis],
  }
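The method above relies on helpers defined elsewhere in the module: `topk_watch_fn`, `seq_filter`, `scores_filter` and `sequence_key`. A rough sketch of what such helpers could look like, under the assumption that the dumped beam-search tensors carry "seq"/"scores" in their node names; the regexes and name fragments are assumptions, not taken from the source:

def topk_watch_fn(fetches, feeds):
  """Watch only the beam-search tensors instead of the whole graph (assumed regex)."""
  del fetches, feeds  # Unused by this watch_fn.
  return tfdbg.WatchOptions(
      node_name_regex_whitelist=r".*(alive|finished)_(seq|scores).*")


def seq_filter(datum, tensor):
  """Predicate for DebugDumpDir.find: keep dumped beam sequence tensors."""
  del tensor  # Only the node name is needed.
  return "_seq" in datum.node_name


def scores_filter(datum, tensor):
  """Predicate for DebugDumpDir.find: keep dumped beam score tensors."""
  del tensor
  return "_scores" in datum.node_name


def sequence_key(sequence):
  """Uniquely identify a vertex by its full output-id path."""
  return "_".join(str(int(i)) for i in sequence)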
    train_monitors=hooks,
    eval_hooks=hooks)

ex.train()
accuracy_score = ex.evaluate()["accuracy"]


# In[ ]:

# python -m tensorflow.python.debug.examples.debug_tflearn_iris --use_experiment --debug


# In[ ]:

# Let your BUILD target depend on "//tensorflow/python/debug:debug_py"
# (You don't need to worry about the BUILD dependency if you are using a pip
# install of open-source TensorFlow.)
from tensorflow.python import debug as tf_debug

hooks = [tf_debug.DumpingDebugHook("/shared/storage/location/tfdbg_dumps_1")]


# In[ ]:

# python -m tensorflow.python.debug.cli.offline_analyzer \
#     --dump_dir="/shared/storage/location/tfdbg_dumps_1/run_<epoch_timestamp_microsec>_<uuid>"
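The same hooks list can also be attached without an Experiment by passing it directly to a tf.estimator.Estimator; a minimal sketch in which `classifier`, `train_input_fn` and `eval_input_fn` are illustrative placeholders, not names from the notebook:

# Hypothetical estimator and input functions; only the hooks wiring is the point.
classifier.train(input_fn=train_input_fn, steps=1000, hooks=hooks)
accuracy_score = classifier.evaluate(input_fn=eval_input_fn, hooks=hooks)["accuracy"]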
def main():
    tf.logging.set_verbosity(tf.logging.INFO)

    # Path to the file specifying all runtime and model parameters and how to
    # process user command line input.
    config_file_path = os.path.join(PROJECT_MODEL_ROOT, "configs/default.json")

    # Argparse namespace combining json defaults and user command line inputs
    args = estimator_utils.init_basic_argument_parser(config_file_path)

    # Transfer all k:v pairs from the Argparse namespace to HParams
    hparams = tf.contrib.training.HParams(**vars(args))

    # Print stats about the current run
    print_run_info(args)

    # Calculate the number of steps needed to complete one epoch for each of the subsets
    steps_in_epoch_train = np.ceil(args.num_samples["train"] /
                                   args.train_batch_size)
    steps_in_epoch_val = np.ceil(args.num_samples["validation"] /
                                 args.validation_batch_size)

    # Number of training steps to perform during train_and_evaluate
    total_train_steps = int(steps_in_epoch_train * args.num_epochs)

    # Minimum number of steps during which no early stopping can occur
    train_steps_without_stopping = steps_in_epoch_train * args.train_epochs_without_stopping

    # Number of steps without metric improvement needed to trigger early stopping
    max_train_steps_without_improvement = int(
        steps_in_epoch_train * args.max_train_epochs_without_improvement)

    # Number of evaluation steps performed during each evaluation call in train_and_evaluate
    eval_steps_during_train = int(steps_in_epoch_val * args.eval_pc_during_train)

    # Number of steps during which evaluation is not performed
    train_steps_without_evaluation = int(steps_in_epoch_train *
                                         args.delay_evaluation_epochs)

    throttle_secs = args.save_checkpoints_secs
    save_checkpoints_steps = None

    # Only one of secs and steps for the checkpointing frequency is allowed to be set
    assert (args.save_checkpoints_secs is not None) ^ (args.checkpoint_freq_epochs is not None)
    if args.checkpoint_freq_epochs is not None:
        save_checkpoints_steps = np.ceil(
            steps_in_epoch_train * args.checkpoint_freq_epochs)  # TODO Ensure this is never zero
        throttle_secs = 1

    # Number of towers
    num_shards = args.num_gpu if args.num_gpu > 0 else 1

    # Path object pointing to the location where the checkpoints and results are saved.
    # If a model path is provided, load a previously instantiated model and
    # train/evaluate using the previous values.
    folder_naming_vars = []
    for x in args.folder_naming_vars:
        folder_naming_vars.append(eval(x))  # For some reason list comprehension doesn't work

    execution_date = time.strftime(
        "%Y%b%d",
        time.localtime()) if args.execution_date is None else args.execution_date

    # Sagemaker provides model_dir; when running elsewhere, create a new model_dir
    # or load a previous run via model_path.
    if hparams.model_dir is None:
        model_dir = retrieve_model_dir(args.log_dir_path, args.model_path,
                                       execution_date, *folder_naming_vars)
        hparams.set_hparam("model_dir", model_dir)
        setattr(args, "model_dir", model_dir)

    # Path pointing to the location of the current data set (e.g. .../numpy/lastfm_10_pc)
    data_dir = os.path.join(
        args.data_dir_path if args.data_dir_path else "",
        "" if args.exec_loc == "sagemaker" else args.dataset,
        "tfrecords" if args.input_data_format == "tfrecords" else "",
        "sharded" if args.exec_loc == "sagemaker" else "")

    # Tensorflow device allocation settings
    config_proto = tf.ConfigProto(
        allow_soft_placement=args.allow_soft_placement,
        log_device_placement=args.log_device_placement)
    config_proto.gpu_options.allow_growth = True

    # Object specifying current run settings, e.g. logging frequency and the
    # number of checkpoints saved.
    run_config = tf.estimator.RunConfig(
        tf_random_seed=args.tf_random_seed,
        model_dir=args.model_dir,
        session_config=config_proto,
        save_summary_steps=20,
        save_checkpoints_steps=save_checkpoints_steps if not args.overwrite else 1,
        save_checkpoints_secs=args.save_checkpoints_secs,
        keep_checkpoint_max=args.keep_checkpoint_max,
        log_step_count_steps=100,
    )

    # Instantiate an Estimator object with the model_fn from this module.
    estimator = estimator_model.create_estimator(run_config, hparams)

    # The degree of shuffling - int. Check tf.data.Dataset.shuffle() for additional documentation.
    shuffle_train = int(args.num_samples["train"] *
                        args.shuffle_train) if args.shuffle_train else 1
    shuffle_val = int(args.num_samples["val"] *
                      args.shuffle_test) if args.shuffle_test else 1

    additional_arrays = ["weights"] if args.use_weights else []

    # https://cloud.google.com/blog/products/gcp/easy-distributed-training-with-tensorflow-using-tfestimatortrain-and-evaluate-on-cloud-ml-engine
    with tf.name_scope("TrainSpec_and_hook"):
        with tf.name_scope("Early_stop_hook"):
            try:
                os.makedirs(estimator.eval_dir())
            except FileExistsError:
                pass

            training_hooks = []
            early_stopping_hook = estimator_utils.make_early_stopping_hook(
                estimator=estimator,
                metric_name=args.key_metrics[0],
                max_train_steps_without_improvement=max_train_steps_without_improvement,
                min_steps=train_steps_without_stopping,
                run_every_secs=None,
                run_every_steps=1)
            if args.early_stopping:
                training_hooks.append(early_stopping_hook)

        # from https://stackoverflow.com/questions/45719176/how-to-display-runtime-statistics-in-tensorboard-using-estimator-api-in-a-distri
        if args.metadata_hook_saving_frequency:
            runtime_stats_hook = estimator_utils.MetadataHook(
                save_secs=args.metadata_hook_saving_frequency,
                output_dir=str(args.model_dir))
            training_hooks.append(runtime_stats_hook)

        if args.profiler_hook:
            profiler_hook = tf.train.ProfilerHook(
                save_steps=10,
                save_secs=None,
                output_dir=str(os.path.join(args.model_dir, "timelines")),
                show_memory=True)
            training_hooks.append(profiler_hook)

        # Debugging
        if args.tensorboard_debug_address:
            debug_hook = tf_debug.TensorBoardDebugHook(args.tensorboard_debug_address)
            training_hooks.append(debug_hook)
        if args.debug:
            debug_hook = tf_debug.LocalCLIDebugHook()
            training_hooks.append(debug_hook)
        if args.debug:
            debug_hook = tf_debug.DumpingDebugHook(args.debug_dump_path)
            training_hooks.append(debug_hook)

        with tf.name_scope("TrainSpec"):
            train_spec = tf.estimator.TrainSpec(
                input_fn=lambda: estimator_model.input_fn(
                    data_dir=data_dir,
                    subset="train",
                    num_shards=num_shards,
                    batch_size=args.train_batch_size,
                    X_cols_to_use=args.X_cols_to_use,
                    input_data_format=args.input_data_format,
                    shuffle=shuffle_train,
                    additional_arrays=additional_arrays,
                    delta_t_mean=args.delta_t_mean,
                    delta_t_std=args.delta_t_std),
                max_steps=total_train_steps if not args.overwrite else 10,
                hooks=training_hooks)

    with tf.name_scope("EvalSpec_and_exporter"):
        with tf.name_scope("Exporter"):
            # TODO Define a function to process the input, e.g. the sequence for the
            # whole user - this function is used to simulate real data
            exporters = []
            for key_metric in args.key_metrics:
                exporters.append(
                    tf.estimator.BestExporter(
                        name=key_metric,
                        serving_input_receiver_fn=estimator_model.serving_input_fn(args),
                        compare_fn=estimator_checkpointing.custom_checkpoint_compare_fn(
                            default_key=key_metric),
                        exports_to_keep=1,
                        as_text=False))

        with tf.name_scope("EvalSpec"):
            eval_spec = tf.estimator.EvalSpec(
                input_fn=lambda: estimator_model.input_fn(
                    data_dir=data_dir,
                    subset="validation",
                    num_shards=num_shards,
                    batch_size=args.validation_batch_size,
                    X_cols_to_use=args.X_cols_to_use,
                    input_data_format=args.input_data_format,
                    shuffle=shuffle_val,
                    additional_arrays=additional_arrays,
                    delta_t_mean=args.delta_t_mean,
                    delta_t_std=args.delta_t_std),
                exporters=exporters if args.use_exporter else None,  # TODO
                steps=eval_steps_during_train if not args.overwrite else 1,
                throttle_secs=throttle_secs,
                start_delay_secs=args.start_delay_secs)

    if train_steps_without_evaluation > 0:
        print("Starting preliminary training for {} steps during which no evaluation is performed."
              .format(train_steps_without_evaluation))
        estimator.train(
            input_fn=lambda: estimator_model.input_fn(
                data_dir=data_dir,
                subset="train",
                num_shards=num_shards,
                batch_size=args.train_batch_size,
                X_cols_to_use=args.X_cols_to_use,
                input_data_format=args.input_data_format,
                shuffle=shuffle_train,
                additional_arrays=additional_arrays,
                delta_t_mean=args.delta_t_mean,
                delta_t_std=args.delta_t_std),
            max_steps=train_steps_without_evaluation if not args.overwrite else 10,
            hooks=training_hooks)

    # Export the model on the off chance that the validation metrics don't improve
    # after the first run, in which case no export would be performed.
    export_dir = os.path.join(args.model_dir, "export", args.key_metrics[0])
    estimator.export_savedmodel(export_dir,
                                estimator_model.serving_input_fn(args),
                                strip_default_attrs=True)

    print("Starting Train and Evaluate for {} training steps with Evaluation every {} second(s) or {} steps for {} evaluation steps."
          .format(total_train_steps, throttle_secs, save_checkpoints_steps,
                  eval_steps_during_train))

    with tf.name_scope("Train_and_Evaluate"):
        tf.estimator.train_and_evaluate(estimator=estimator,
                                        train_spec=train_spec,
                                        eval_spec=eval_spec)

    if args.exec_loc == "sagemaker":
        updated_model_path = estimator_sagemaker.sagemaker_postprocessing(args)

        predictor_param_names = [
            "predictor_s3_input_path", "predictor_s3_output_path",
            "predictor_batch_size"
        ]
        predictor_params = [getattr(args, x) for x in predictor_param_names]
        if np.all([x is not None for x in predictor_params]):
            estimator_sagemaker.predict_s3_numpy(
                saved_model_path=updated_model_path,
                input_s3_path=args.predictor_s3_input_path,
                output_s3_path=args.predictor_s3_output_path,
                batch_size=args.predictor_batch_size)
    else:
        # Evaluate the trained model
        steps_in_epoch_test = np.ceil(args.num_samples["test"] /
                                      args.validation_batch_size)
        shuffle_test = args.num_samples["train"] if args.shuffle_test else 1

        with tf.name_scope("Evaluate_trained_model"):
            train_input_fn = lambda: estimator_model.input_fn(
                data_dir=data_dir,
                subset="train",
                num_shards=num_shards,  # Switch to one and adjust bs/num_gpu for single device
                batch_size=args.train_batch_size,  # TODO Does that work for serving
                X_cols_to_use=args.X_cols_to_use,
                input_data_format=args.input_data_format,
                shuffle=shuffle_train,
                additional_arrays=additional_arrays,
                delta_t_mean=args.delta_t_mean,
                delta_t_std=args.delta_t_std)
            test_input_fn = lambda: estimator_model.input_fn(
                data_dir=data_dir,
                subset="test",
                num_shards=num_shards,
                batch_size=args.validation_batch_size,
                X_cols_to_use=args.X_cols_to_use,
                input_data_format=args.input_data_format,
                shuffle=shuffle_test,
                additional_arrays=additional_arrays,
                delta_t_mean=args.delta_t_mean,
                delta_t_std=args.delta_t_std)

            if not args.final_eval_multiple_models:
                # Find the best checkpoint and its associated metrics
                best_checkpoint_path, best_checkpoint_metrics = estimator_checkpointing.best_checkpoint(
                    model_dir=args.model_dir,
                    eval_dir=estimator.eval_dir(),
                    metric=args.key_metrics[0])
                print("Best checkpoint: {}".format(best_checkpoint_path))
                print("Best metrics: {}".format(best_checkpoint_metrics))

                # Remove model_dir from the previous run_config as it causes
                # evaluation to ignore warm_start_from
                eval_run_config = deepcopy(run_config)
                setattr(eval_run_config, "_model_dir", None)

                # New estimator restarted with the best result for the user-specified metric
                estimator = estimator_model.create_estimator(
                    eval_run_config, hparams, warm_start_from=best_checkpoint_path)

                train_results = estimator.evaluate(input_fn=train_input_fn,
                                                   steps=steps_in_epoch_train)
                print("Final evaluation on train subset: {}".format(train_results))

                test_results = estimator.evaluate(input_fn=test_input_fn,
                                                  steps=steps_in_epoch_test)
                print("Final evaluation on test subset: {}".format(test_results))
            else:
                estimator_checkpointing.evaluate_multiple_checkpoints(
                    model_dir=args.model_dir,
                    eval_dir=estimator.eval_dir(),
                    num_checkpoints=args.keep_checkpoint_max,
                    metric=args.key_metrics[0],
                    input_fn=test_input_fn,
                    run_config=run_config,
                    hparams=hparams,
                    num_steps_in_eval=steps_in_epoch_test if not args.overwrite else 1)

    if args.clear_checkpoints:
        rm_graph_command = "for f in $(find {} -name 'graph.pbtxt'); do rm $f; done".format(
            str(model_dir))
        rm_checkpoints_command = "for f in $(find {} -name 'model.ckpt-*'); do rm $f; done".format(
            str(model_dir))
        process = subprocess.run(rm_graph_command, shell=True, check=True)
        process = subprocess.run(rm_checkpoints_command, shell=True, check=True)
        print("Cleared model_dir: {}".format(str(model_dir)))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--watch_gpu', required=True, type=int,
                        help="GPU id to watch; set it to the same id as the visible GPU")
    parser.add_argument('--debug', default=True, type=bool)
    parser.add_argument('--stop_globalstep', default=1000, type=int)
    parser.add_argument('--checkpoint_dir', default="checkpoint_dir", type=str)
    parser.add_argument('--task_index', default=0, type=int)

    prof_save_step = cfg.PROFILER_SAVE_STEP  # 120
    sum_save_step = cfg.SUMMARY_SAVE_STEP    # 500

    FLAGS, unparsed = parser.parse_known_args()

    initial_learning_rate = cfg.LEARNING_RATE
    decay_steps = cfg.DECAY_STEPS
    decay_rate = cfg.DECAY_RATE
    staircase = cfg.STAIRCASE

    ########################dir#######################################
    singlepipe_dir = "Single_Pipe_train_logs"
    if not os.path.exists(singlepipe_dir):
        os.makedirs(singlepipe_dir)

    inside_bsnQnM_dir = "Single_Pipe" + cfg.BS_NT_MUL_PREFIX
    logrootpath = os.path.join(singlepipe_dir, inside_bsnQnM_dir)
    if not os.path.exists(logrootpath):
        os.makedirs(logrootpath)

    fpslog_name = "Single_Pipe" + cfg.BS_NT_MUL_PREFIX + "fps_log.txt"
    concated_path = logrootpath + "/" + fpslog_name

    checkpoint_dir = FLAGS.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    gpulog_name = "Single_Pipe" + "gpu" + str(FLAGS.watch_gpu) + cfg.BS_NT_MUL_PREFIX + "_gpulog.txt"

    ###########################gpusubprocess##############################
    def start_gpulog(path, fname):
        # Has to be called before the start of training.
        gpuinfo_path = path + "/" + fname
        with open(gpuinfo_path, 'w'):
            argument = 'timestamp,count,gpu_name,gpu_bus_id,memory.total,memory.used,utilization.gpu,utilization.memory'
            try:
                proc = subprocess.Popen(
                    ['nvidia-smi --format=csv --query-gpu=%s %s %s %s' %
                     (argument, ' -l 1', '-i ' + str(FLAGS.watch_gpu), '-f ' + gpuinfo_path)],
                    shell=True)
            except KeyboardInterrupt:
                try:
                    proc.terminate()
                except OSError:
                    pass
                proc.wait()
        return proc

    #########################pipeline###################################
    tf.reset_default_graph()

    image_producer = Pascal_voc('train')
    (current_index, image, label) = image_producer.get_one_image_label_element()
    current_index = tf.Print(current_index, data=[current_index],
                             message="CURRENT INDEX OF IMAGE IS :")

    image_shape = (image_producer.image_size, image_producer.image_size, 3)
    label_size = (image_producer.cell_size, image_producer.cell_size, 25)  # possible value is 0 or 1

    processed_queue = tf.FIFOQueue(capacity=int(image_producer.batch_size * 1.5),
                                   shapes=[image_shape, label_size],
                                   dtypes=[tf.float32, tf.float32],
                                   name='processed_queue')
    enqueue_processed_op = processed_queue.enqueue([image, label])

    num_enqueue_threads = min(image_producer.num_enqueue_threads,
                              image_producer.gt_labels_length)
    queue_runner = tf.train.QueueRunner(processed_queue,
                                        [enqueue_processed_op] * num_enqueue_threads)
    tf.train.add_queue_runner(queue_runner)

    (images, labels) = processed_queue.dequeue_many(image_producer.batch_size)

    if FLAGS.debug:
        images = tf.Print(images, data=[processed_queue.size()],
                          message="Worker %d get_batch(), BatchSize %d, Queues left:" % (
                              FLAGS.task_index, cfg.BATCH_SIZE))

    #########################graph###################################
    with tf.device("/device:GPU:" + str(FLAGS.watch_gpu)):
        yolo = YOLONet(images, labels)
        global_step = tf.train.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(
            initial_learning_rate, global_step, decay_steps,
            decay_rate, staircase, name='learning_rate')
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        train_op = slim.learning.create_train_op(
            yolo.total_loss, optimizer, global_step=global_step)

    #########################hook#####################################
    profiler_hook = tf.train.ProfilerHook(save_steps=prof_save_step,
                                          output_dir=logrootpath,
                                          show_memory=True, show_dataflow=True)

    summary_op = tf.summary.merge_all()
    summary_hook = tf.train.SummarySaverHook(save_steps=sum_save_step,
                                             output_dir=logrootpath,
                                             summary_op=summary_op)

    debug_hook = tf_debug.DumpingDebugHook("debug_dir")

    if FLAGS.debug:
        tensors_to_log = [global_step, yolo.total_loss]

        def formatter(curvals):
            print("Global step %d, Loss %f!" % (
                curvals[global_step], curvals[yolo.total_loss]))

        logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                                  every_n_iter=10,
                                                  formatter=formatter)
        hooks = [debug_hook,
                 tf.train.StopAtStepHook(last_step=FLAGS.stop_globalstep),
                 logging_hook, profiler_hook, summary_hook]
    else:
        hooks = [debug_hook,
                 tf.train.StopAtStepHook(last_step=FLAGS.stop_globalstep),
                 profiler_hook, summary_hook]

    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True
    # config.gpu_options.allocator_type = 'BFC'
    # config.gpu_options.per_process_gpu_memory_fraction = 0.8

    proc = start_gpulog(logrootpath, gpulog_name)

    #######################train#####################################
    print('Start training ...')
    with tf.train.MonitoredTrainingSession(config=config,
                                           hooks=hooks,
                                           checkpoint_dir=checkpoint_dir,
                                           save_checkpoint_secs=3600) as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        start_global_step_value = sess.run(global_step)
        timer = Timer(start_global_step_value)

        iters_per_toc = 20
        txtForm = "Training speed: local avg %f fps, global %f fps, loss %f, global step: %d, predict to wait %s"
        local_max_iter = FLAGS.stop_globalstep - start_global_step_value

        timer.tic()
        yolo_loss, global_step_value, _ = sess.run([yolo.total_loss, global_step, train_op])

        n = 1
        while not sess.should_stop():
            n = n + 1
            if n > 0 and n % iters_per_toc == 0:
                local_avg_fps, global_avg_fps = timer.toc(iters_per_toc, global_step_value)
                timetowait = timer.remain(n, local_max_iter)

                txtData = local_avg_fps, global_avg_fps, yolo_loss, global_step_value, timetowait
                print(txtForm % txtData)
                with open(concated_path, 'a+') as log:
                    log.write("%.4f,%.4f,%.4f,%d,%s\n" % txtData)
                timer.tic()

            yolo_loss, global_step_value, _ = sess.run([yolo.total_loss, global_step, train_op])

        coord.request_stop()
        coord.join(threads)

    print('Done training.')

    try:
        proc.terminate()
    except OSError:
        print("Kill subprocess failed. Kill nvidia-smi manually")
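The DumpingDebugHook above writes its dumps under "debug_dir"; once a run has finished, they can be inspected with the tfdbg offline analyzer, as in the earlier snippet:

# Inspect the dumps written by DumpingDebugHook("debug_dir") after a run
# (the run_<...> subdirectory name is a placeholder for the directory the hook creates):
#
#   python -m tensorflow.python.debug.cli.offline_analyzer \
#       --dump_dir="debug_dir/run_<epoch_timestamp_microsec>_<uuid>"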