def main(_):
    """Run training and evaluation, optionally with a TensorBoard debugger."""
    tf.logging.set_verbosity(tf.logging.INFO)

    run_config = tf.estimator.RunConfig()
    hparams = {
        'learning_rate': FLAGS.learning_rate,
        'dropout_rate': 0.4,
        'data_directory': FLAGS.data_directory,
    }
    mnist_classifier = tf.estimator.Estimator(
        model_fn=model.head_model_fn,
        model_dir=FLAGS.model_directory,
        config=run_config,
        params=hparams)

    # Attach a TensorBoard debugger hook only when a debug port was requested.
    hooks = []
    if FLAGS.debug_port is not None:
        hooks.append(tf_debug.TensorBoardDebugHook(
            "localhost:{}".format(FLAGS.debug_port)))

    tf.estimator.train_and_evaluate(
        mnist_classifier, get_train_spec(hooks), get_eval_spec())
def predict(test_file, model_dir):
    """Restore an Estimator from `model_dir`, predict on the test dataset,
    and visualize the predictions for `test_file`.

    Args:
        test_file: Path to a single test TFRecord used for visualization.
        model_dir: Directory holding the trained model checkpoints.
    """
    # Load the model (`model_fn` and `params` are module-level names).
    estimator = tf.estimator.Estimator(model_fn, model_dir=model_dir,
                                       params=params)

    # Debugging hooks; pass `hooks=hooks` to predict() below to enable them.
    hooks = [tf_debug.TensorBoardDebugHook(
        grpc_debug_server_addresses="dev:6064")]

    predictions = estimator.predict(input_fn=lambda: dataset_input_fn('test'))
    # predictions = estimator.predict(input_fn=lambda: dataset_input_fn('test'), hooks=hooks)

    # FIX: removed a dead `predictions_list = []` that was immediately
    # overwritten. predict() returns a generator; materialize it once.
    predictions_list = list(predictions)
    predicted_label = predictions_list[0]
    print('prediction = {}'.format(predicted_label))

    # Print the TensorBoard command for inspecting this run with the debugger.
    print('tensorboard --logdir=' + str(model_dir) +
          ' --port 6006 --debugger_port 6064')

    # Visualize predictions based on single test TFRecord
    visualize_pred(test_file, predictions_list, model_dir)
def main(unused_argv):
    """Evaluate the mel-spectrogram CNN classifier on the test set."""
    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir="./cnn_model_mel")

    # Hooks kept for when the training section is re-enabled: periodic
    # softmax-probability logging and a TensorBoard debugger connection.
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"}, every_n_iter=50)
    hook = tf_debug.TensorBoardDebugHook("sunny-workstation:7000")

    # Build the evaluation input pipeline (single pass over the test data).
    test_solution = data_utility.AudioPrepare()
    test_input_fn = test_solution.tf_input_fn_maker(is_training=False,
                                                    n_epoch=1)

    eval_results = classifier.evaluate(input_fn=test_input_fn, steps=3000)
    print(eval_results)
def add_debug_hooks(hooks):
    """Append a tfdbg hook to `hooks` in place, according to FLAGS.

    --debug_tb attaches a TensorBoard debugger hook; otherwise --debug_cli
    attaches a local CLI debugger with an inf/nan tensor filter.
    """
    if FLAGS.debug_tb:
        hooks.append(tf_debug.TensorBoardDebugHook("pawel-workstation:8080"))
    elif FLAGS.debug_cli:
        cli_hook = tf_debug.LocalCLIDebugHook()
        cli_hook.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        hooks.append(cli_hook)
def main(_):
    """Train, evaluate and predict a small DNN on synthetic Iris-like data,
    optionally attaching a tfdbg CLI or TensorBoard debugger hook."""
    # Random data is fine here: this example demonstrates the debugger,
    # not how to solve the Iris classification problem.
    def training_input_fn():
        features = {"features": tf.random_normal([128, 4])}
        labels = tf.random_uniform([128], minval=0, maxval=3, dtype=tf.int32)
        return features, labels

    def test_input_fn():
        features = {"features": tf.random_normal([32, 4])}
        labels = tf.random_uniform([32], minval=0, maxval=3, dtype=tf.int32)
        return features, labels

    feature_columns = [
        tf.feature_column.numeric_column("features", shape=(4, ))
    ]

    # 3-layer DNN with 10, 20, 10 units respectively.
    model_dir = FLAGS.model_dir or tempfile.mkdtemp(
        prefix="debug_tflearn_iris_")
    classifier = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        hidden_units=[10, 20, 10],
        n_classes=3,
        model_dir=model_dir)

    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")

    hooks = []
    if FLAGS.debug:
        config_file_path = (tempfile.mktemp(".tfdbg_config")
                            if FLAGS.use_random_config_path else None)
        hooks.append(tf_debug.LocalCLIDebugHook(
            ui_type=FLAGS.ui_type,
            dump_root=FLAGS.dump_root,
            config_file_path=config_file_path))
    elif FLAGS.tensorboard_debug_address:
        hooks.append(
            tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address))

    # Train, evaluate and predict, passing the tfdbg hooks each time.
    classifier.train(training_input_fn, steps=FLAGS.train_steps, hooks=hooks)

    accuracy_score = classifier.evaluate(
        test_input_fn, steps=FLAGS.eval_steps, hooks=hooks)["accuracy"]
    print("After training %d steps, Accuracy = %f" %
          (FLAGS.train_steps, accuracy_score))

    predict_results = classifier.predict(test_input_fn, hooks=hooks)
    print("A prediction result: %s" % next(predict_results))
def get_hooks(debug_cli, debug_ui):
    """Return a list of tfdbg session hooks.

    `debug_cli` takes precedence over `debug_ui`; when neither is set the
    returned list is empty.
    """
    if debug_cli:
        cli_hook = tf_debug.LocalCLIDebugHook()
        cli_hook.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        return [cli_hook]
    if debug_ui:
        # Expose the TensorBoard debugger on port 5002 of this machine.
        debug_host = "{}:5002".format(platform.node())
        ui_hook = tf_debug.TensorBoardDebugHook(
            debug_host, send_traceback_and_source_code=False)
        print("Debugger is running on {}".format(debug_host))
        return [ui_hook]
    return []
def main(unused_argv):
    """Evaluate the multispectral CNN novelty classifier.

    Loads "typical" and "novel" difference images, splits them into train
    and eval subsets, and evaluates a previously trained Estimator.
    """
    novel_results = '/home/hannah/src/MastcamCAE/results/DW_udr_12-8-3_7-5-3_nodrop_epochs15'
    typical_results = '/home/hannah/src/MastcamCAE/results/train_udr_12-8-3_7-5-3_nodrop_epochs15'

    # dataset.load_diff_images returns np.array; label typical=0, novel=1.
    typical_data = dataset.load_diff_images(typical_results)
    typical_labels = np.zeros([typical_data.shape[0], 1], dtype=np.int32)
    novel_data = dataset.load_diff_images(novel_results)
    novel_labels = np.ones([novel_data.shape[0], 1], dtype=np.int32)

    # Split: first 98700 typical + first 300 novel for training, rest eval.
    train_data = np.concatenate(
        [typical_data[:98700, :, :, :], novel_data[:300, :, :, :]])
    train_labels = np.concatenate(
        [typical_labels[:98700], novel_labels[:300]])
    eval_data = np.concatenate(
        [typical_data[98700:, :, :, :], novel_data[300:, :, :, :]])
    eval_labels = np.concatenate(
        [typical_labels[98700:], novel_labels[300:]])

    # Restore the previously trained Estimator.
    multispec_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir="/home/hannah/src/MastcamCAE/saved_sessions/multispec_convnet_model_nodrop_eps15_udr_60k_seed42")

    # Hooks kept for when the training section is re-enabled.
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"}, every_n_iter=100)
    debug_hook = tf_debug.TensorBoardDebugHook("ops5.sese.asu.edu:6064")

    # Evaluate the model and print results.
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
    eval_results = multispec_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
def hooks(self, mode):
    """Build the session hooks for this run.

    Adds a TensorBoard debugger hook when debugging is enabled, and a
    profiler hook when a profiling interval was configured.
    """
    result = []
    if self._debug:
        result.append(tfdbg.TensorBoardDebugHook("localhost:6007"))
    if self._profile_secs is not None:
        result.append(TensorboardProfilerHook(
            save_secs=self._log_secs, output_dir=self.path))
    return result
def run(job_dir, train_iters, estimator, model_cls, dataset, train_batch_size,
        eval_batch_size, eval_steps, num_parallel_batches, shuffle_buffer_size,
        prefetch_buffer_size, no_eval, debug, debug_address):
    """Train `estimator` on the train split, optionally running periodic
    evaluation on the test split and attaching a TensorBoard debugger."""
    dataset_train = dataset.read(split='train')

    # Resume from the checkpointed global step when one exists.
    try:
        global_step = estimator.get_variable_value('global_step')
    except ValueError:
        global_step = 1
    tf.logging.info('Start training for %d.', global_step)

    session_hooks = []
    if not no_eval:
        dataset_test = dataset.read(split='test')
        session_hooks.append(EvaluationRunHook(
            estimator,
            build_input_fn(dataset_test, eval_batch_size,
                           map_fn=strip_dict_arg(model_cls.eval_map_fn),
                           shuffle_and_repeat=False),
            eval_steps,
            summary=False))
    if debug:
        session_hooks.append(tfdbg.TensorBoardDebugHook(debug_address))

    # Run training for `train_iters` steps; the evaluation hook (if any)
    # fires every `eval_steps` iterations.
    estimator.train(
        build_input_fn(dataset_train, train_batch_size,
                       map_fn=strip_dict_arg(model_cls.map_fn),
                       num_parallel_batches=num_parallel_batches,
                       shuffle_buffer_size=shuffle_buffer_size,
                       prefetch_buffer_size=prefetch_buffer_size,
                       global_step=global_step,
                       shuffle_and_repeat=True),
        max_steps=train_iters,
        hooks=session_hooks)
def main(unused_argv):
    """Run the GFCC CNN classifier over the test set and dump the predicted
    classes and probabilities to text files."""
    classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                        model_dir="./cnn_model_gfcc")

    # Hooks kept for when the training section is re-enabled.
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"}, every_n_iter=50)
    hook = tf_debug.TensorBoardDebugHook("sunny-workstation:7000")

    # Evaluate the model and print results
    test_solution = data_utility.AudioPrepare()
    test_input_fn = test_solution.tf_input_fn_maker(is_training=False,
                                                    n_epoch=1)

    # BUG FIX: Estimator.predict returns a one-shot generator. The original
    # code iterated it twice, so the second pass (probabilities) saw an
    # exhausted generator and wrote an empty file. Materialize once.
    predictions = list(classifier.predict(input_fn=test_input_fn))

    with open('cnn_gfcc_test.txt', 'w+') as file:
        for var in predictions:
            print(var['classes'])
            file.write(str(var['classes']) + '\n')

    with open('cnn_gfcc_test_pro.txt', 'w+') as file:
        for var in predictions:
            print(var['probabilities'])
            file.write(str(var['probabilities']) + '\n')
def __init__(self, params=None, aux_config=None, run_config=None):
    """Set up run configuration, debug hooks and hyper-parameters.

    aux_config["debug"] selects the debugger: "cli" attaches a local CLI
    hook; any other truthy value is used as a local TensorBoard debugger
    port.
    """
    self._comet_experiment = None
    self._estimator = None
    self.aux_config = aux_config or {}

    debug = self.aux_config.get("debug")
    if not debug:
        self._hooks = []
    elif debug == "cli":
        self._hooks = [tf_debug.LocalCLIDebugHook()]
    else:
        self._hooks = [
            tf_debug.TensorBoardDebugHook("localhost:{}".format(debug))
        ]

    self.run_config = RunConfig(**(run_config or {}))
    self.params = self.set_params()
    if params:
        self.params.update(params)
def create_hooks(use_tfdbg=False,
                 use_dbgprofile=False,
                 dbgprofile_kwargs=None,
                 use_validation_monitor=False,
                 validation_monitor_kwargs=None,
                 use_early_stopping=False,
                 early_stopping_kwargs=None):
    """Create train and eval hooks for Experiment."""
    train_hooks = []
    eval_hooks = []

    if use_tfdbg:
        # TensorBoard debugger (swap in debug.LocalCLIDebugHook() for CLI).
        tfdbg_hook = debug.TensorBoardDebugHook(
            '127.0.0.1:9990', send_traceback_and_source_code=False)
        train_hooks.append(tfdbg_hook)
        eval_hooks.append(tfdbg_hook)

    if use_dbgprofile:
        # Recorded traces can be visualized with chrome://tracing/
        # The memory/tensor lifetime is also profiled.
        tf.logging.info("Using ProfilerHook")
        profiler_opts = dict(save_steps=10, show_dataflow=True,
                             show_memory=True)
        profiler_opts.update(dbgprofile_kwargs)
        train_hooks.append(tf.train.ProfilerHook(**profiler_opts))

    if use_validation_monitor:
        tf.logging.info("Using ValidationMonitor")
        train_hooks.append(
            tf.contrib.learn.monitors.ValidationMonitor(
                hooks=eval_hooks, **validation_monitor_kwargs))

    if use_early_stopping:
        tf.logging.info("Using EarlyStoppingHook")
        stopping_hook = metrics_hook.EarlyStoppingHook(**early_stopping_kwargs)
        # Added to both train and eval so that eval aborts as well.
        train_hooks.append(stopping_hook)
        eval_hooks.append(stopping_hook)

    return train_hooks, eval_hooks
def train(config, data_dir, my_model_fn=model_fn):
    """Train the estimator, wiring up periodic evaluation and tfdbg
    debugging as configured."""
    V, embed_matrix = get_vocab_embedding_matrix(config, data_dir)
    estimator = get_estimator(config, embed_matrix, my_model_fn)

    if config.get('eval.enable', True):
        hooks = [
            get_eval_hook(
                estimator,
                lambda: eval_input_fn(config, data_dir,
                                      vocab.create_vocab_lookup_tables(V)),
                name='eval',
                every_n_steps=config.eval.eval_steps),
        ]
    else:
        hooks = []

    if config.get('eval.debug.tensorboard', False):
        debug_port = config.get('eval.debug.tensorboard_port', 6068)
        hooks += [tf_debug.TensorBoardDebugHook(
            'localhost:%s' % debug_port,
            send_traceback_and_source_code=False)]
    if config.get('eval.debug.cli', False):
        # NOTE: the CLI debugger replaces all previously configured hooks.
        hooks = [tf_debug.LocalCLIDebugHook()]

    return estimator.train(
        input_fn=lambda: train_input_fn(config, data_dir,
                                        vocab.create_vocab_lookup_tables(V)),
        hooks=hooks,
        max_steps=config.optim.max_iters)
def create_estimator_and_specs(run_config):
    """Creates an Experiment configuration based on the estimator and input fn."""
    model_params = tf.contrib.training.HParams(
        num_layers=FLAGS.num_layers,
        num_nodes=FLAGS.num_nodes,
        batch_size=FLAGS.batch_size,
        num_conv=ast.literal_eval(FLAGS.num_conv),
        conv_len=ast.literal_eval(FLAGS.conv_len),
        num_classes=get_num_classes(),
        learning_rate=FLAGS.learning_rate,
        gradient_clipping_norm=FLAGS.gradient_clipping_norm,
        cell_type=FLAGS.cell_type,
        batch_norm=FLAGS.batch_norm,
        dropout=FLAGS.dropout)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn, config=run_config, params=model_params)

    # The TensorBoard debugger hook is attached to training only.
    train_hooks = [
        tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address)
    ]

    train_spec = tf.estimator.TrainSpec(
        input_fn=get_input_fn(mode=tf.estimator.ModeKeys.TRAIN,
                              tfrecord_pattern=FLAGS.training_data,
                              batch_size=FLAGS.batch_size),
        hooks=train_hooks,
        max_steps=FLAGS.steps)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=get_input_fn(mode=tf.estimator.ModeKeys.EVAL,
                              tfrecord_pattern=FLAGS.eval_data,
                              batch_size=FLAGS.batch_size))

    return estimator, train_spec, eval_spec
model_path = curr_path.replace('train.py', 'model_for_CRF.py') # Define the path of your factored out model.py file #model_file = '/some/path/model_for_CRF.py' model_file = './model_for_CRF.py' # Now copy the training script and the model file to # model_dir -- the same directory specified when creating the Estimator # Note: copy over more files if there are other important dependencies. os.mkdir(model_dir) shutil.copy(curr_path, model_dir) shutil.copy(model_path, model_dir) # Create a LocalCLIDebugHooks and use it as a monitor when calling fit() hooks = [tf_debug.TensorBoardDebugHook(grpc_debug_server_addresses="dev:6064")] #hooks = [tf_debug.LocalCLIDebugHook(ui_type="readline")] # Hooks to the manual debugger # Training/Evaluation Loop for e in range(params.train_epochs): print('Epoch: ' + str(e)) #estimator.train(input_fn=lambda: dataset_input_fn('train'), hooks=hooks) # RAN estimator.train(input_fn=lambda: dataset_input_fn('train')) print('### validate ###') estimator.evaluate(input_fn=lambda: dataset_input_fn('valid')) print('tensorboard --logdir=' + str(model_dir)) def make_serving_input_receiver_fn(): inputs = {
def main():
    """Train and evaluate a node classifier over sampled subgraphs."""
    args = parse_arguments()
    graph_data = load_data_graphsage(args.data_dir)

    # Sample subgraphs with the scheme named by args.sampler, then augment
    # each subgraph with vertex labels and features.
    sample_train = make_sample(args.sampler, args)
    input_fn = augment_sample(graph_data, args, sample_train)

    # Predictor class and loss function hyper-parameters.
    vertex_embedding_params = {
        'embedding_dim': args.embedding_dim,
        'embedding_trainable': True,
        'embedding_checkpoint': None,
    }
    params = {
        **vertex_embedding_params,
        'hidden_units': [200, 200],  # Jaan net
        'n_classes': max(graph_data.classes) + 1,
        'num_vertices': graph_data.num_vertices,
        'batch_size': args.batch_size,
    }

    classifier_predictor_and_loss = make_nn_class_predictor(
        label_task_weight=args.label_task_weight,
        regularization=_adjust_regularization(args.global_regularization,
                                              args.batch_size),
        global_optimizer=_make_global_optimizer(args),
        embedding_optimizer=lambda: tf.train.GradientDescentOptimizer(
            _adjust_learning_rate(args.embedding_learning_rate,
                                  args.batch_size)))

    node_classifier = tf.estimator.Estimator(
        model_fn=classifier_predictor_and_loss,
        params=params,
        model_dir=args.train_dir)

    # Extra logging, plus optional profiling / tfdbg hooks.
    hooks = [
        tf.train.LoggingTensorHook(
            {'kappa_edges': 'kappa_edges_in_batch/value'}, every_n_iter=100)
    ]
    if args.profile:
        hooks.append(tf.train.ProfilerHook(save_secs=30))
    if args.debug:
        from tensorflow.python import debug as tfdbg
        hooks.append(tfdbg.TensorBoardDebugHook('localhost:6004'))

    node_classifier.train(input_fn=input_fn,
                          max_steps=args.max_steps,
                          hooks=hooks)

    # Evaluate on a larger sample.
    node_classifier.evaluate(
        input_fn=augment_sample(graph_data, args, sample_train, 2000),
        name="node2vec_eval")
def main():
    """End-to-end Estimator pipeline.

    Builds a run configuration from JSON defaults plus command-line args,
    trains with tf.estimator.train_and_evaluate (with optional early
    stopping, metadata/profiler hooks, and tfdbg TensorBoard/CLI/dumping
    debug hooks), exports the best models, then runs a final evaluation
    (locally or via SageMaker post-processing).
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Path to file specifying all runtime and model parameters and how to process user command line input.
    config_file_path = os.path.join(PROJECT_MODEL_ROOT, "configs/default.json")

    # Argparse namespace combining json defaults and user command line inputs
    args = estimator_utils.init_basic_argument_parser(config_file_path)

    # Transfer all k:v pairs from the Argparse namespace to HParams
    hparams = tf.contrib.training.HParams(**vars(args))

    # Print stats about the current run
    print_run_info(args)

    # Calculate the number of steps needed to complete one epoch for each of the subsets
    steps_in_epoch_train = np.ceil(args.num_samples["train"] /
                                   args.train_batch_size)
    steps_in_epoch_val = np.ceil(args.num_samples["validation"] /
                                 args.validation_batch_size)

    # Number of training steps to perform during train_and_evaluate
    total_train_steps = int(steps_in_epoch_train * args.num_epochs)

    # Minimum number of steps during which no early stopping can occur
    train_steps_without_stopping = steps_in_epoch_train * args.train_epochs_without_stopping

    # Number of steps during which no metric improvement happened that is needed to initiate early stopping
    max_train_steps_without_improvement = int(
        steps_in_epoch_train * args.max_train_epochs_without_improvement)

    # Number of evaluation steps that are performed during each of the calls to evaluation during train_and_evaluate
    eval_steps_during_train = int(steps_in_epoch_val * args.eval_pc_during_train)

    # Number of steps during which evaluation is not performed
    train_steps_without_evaluation = int(steps_in_epoch_train *
                                         args.delay_evaluation_epochs)

    throttle_secs = args.save_checkpoints_secs
    save_checkpoints_steps = None

    # Only one of secs and steps for checkpointing frequency is allowed to be saved
    assert (args.save_checkpoints_secs is not None) ^ (args.checkpoint_freq_epochs is not None)
    if args.checkpoint_freq_epochs is not None:
        save_checkpoints_steps = np.ceil(
            steps_in_epoch_train *
            args.checkpoint_freq_epochs)  # TODO Ensure this is never zero
        throttle_secs = 1

    # Number of towers
    num_shards = args.num_gpu if args.num_gpu > 0 else 1

    # Path object pointing to the location where the checkpoints and results are saved
    # If model path is provided then load a previously instantiated model and train/evaluate
    # using the previous values.
    folder_naming_vars = []
    for x in args.folder_naming_vars:
        folder_naming_vars.append(
            eval(x))  # For some reason list comprehension doesn't work
        # NOTE(review): eval() on config-supplied strings executes arbitrary
        # code — acceptable only if configs are trusted; verify.

    execution_date = time.strftime("%Y%b%d", time.localtime(
    )) if args.execution_date is None else args.execution_date

    # Sagemaker provides model_dir or when running elsewhere creates new model_dir or loads previous run via model_path
    if hparams.model_dir is None:
        model_dir = retrieve_model_dir(args.log_dir_path, args.model_path,
                                       execution_date, *folder_naming_vars)
        hparams.set_hparam("model_dir", model_dir)
        setattr(args, "model_dir", model_dir)

    # Path pointing to the location of the current data set (e.g. .../numpy/lastfm_10_pc)
    data_dir = os.path.join(
        args.data_dir_path if args.data_dir_path else "",
        "" if args.exec_loc == "sagemaker" else args.dataset,
        "tfrecords" if args.input_data_format == "tfrecords" else "",
        "sharded" if args.exec_loc == "sagemaker" else "")

    # Tensorflow device allocation settings
    config_proto = tf.ConfigProto(
        allow_soft_placement=args.allow_soft_placement,
        log_device_placement=args.log_device_placement)
    config_proto.gpu_options.allow_growth = True

    # Object specifying current run settings e.g. logging frequency and num of check points saved.
    run_config = tf.estimator.RunConfig(
        tf_random_seed=args.tf_random_seed,
        model_dir=args.model_dir,
        session_config=config_proto,
        save_summary_steps=20,
        save_checkpoints_steps=save_checkpoints_steps if not args.overwrite else 1,
        save_checkpoints_secs=args.save_checkpoints_secs,
        keep_checkpoint_max=args.keep_checkpoint_max,
        log_step_count_steps=100,
    )

    # Instantiate an Estimator object with the model_fn from this module.
    estimator = estimator_model.create_estimator(run_config, hparams)

    # The degree of shuffling - int. Check tf.Data.dataset.shuffle() for additional documentation.
    shuffle_train = int(args.num_samples["train"] *
                        args.shuffle_train) if args.shuffle_train else 1
    shuffle_val = int(args.num_samples["val"] *
                      args.shuffle_test) if args.shuffle_test else 1
    additional_arrays = ["weights"] if args.use_weights else []

    # https://cloud.google.com/blog/products/gcp/easy-distributed-training-with-tensorflow-using-tfestimatortrain-and-evaluate-on-cloud-ml-engine
    with tf.name_scope("TrainSpec_and_hook"):
        with tf.name_scope("Early_stop_hook"):
            # The early-stopping hook requires the eval dir to exist up front.
            try:
                os.makedirs(estimator.eval_dir())
            except FileExistsError:
                pass
            training_hooks = []
            early_stopping_hook = estimator_utils.make_early_stopping_hook(
                estimator=estimator,
                metric_name=args.key_metrics[0],
                max_train_steps_without_improvement=
                max_train_steps_without_improvement,
                min_steps=train_steps_without_stopping,
                run_every_secs=None,
                run_every_steps=1)
            if args.early_stopping:
                training_hooks.append(early_stopping_hook)
            # from https://stackoverflow.com/questions/45719176/how-to-display-runtime-statistics-in-tensorboard-using-estimator-api-in-a-distri
            if args.metadata_hook_saving_frequency:
                runtime_stats_hook = estimator_utils.MetadataHook(
                    save_secs=args.metadata_hook_saving_frequency,
                    output_dir=str(args.model_dir))
                training_hooks.append(runtime_stats_hook)
            if args.profiler_hook:
                profiler_hook = tf.train.ProfilerHook(
                    save_steps=10,
                    save_secs=None,
                    output_dir=str(os.path.join(args.model_dir, "timelines")),
                    show_memory=True)
                training_hooks.append(profiler_hook)
            # Debugging: TensorBoard debugger, then CLI + dumping debuggers.
            # NOTE(review): args.debug attaches BOTH a LocalCLIDebugHook and a
            # DumpingDebugHook — confirm that double attachment is intended.
            if args.tensorboard_debug_address:
                debug_hook = tf_debug.TensorBoardDebugHook(
                    args.tensorboard_debug_address)
                training_hooks.append(debug_hook)
            if args.debug:
                debug_hook = tf_debug.LocalCLIDebugHook()
                training_hooks.append(debug_hook)
            if args.debug:
                debug_hook = tf_debug.DumpingDebugHook(args.debug_dump_path)
                training_hooks.append(debug_hook)
        with tf.name_scope("TrainSpec"):
            train_spec = tf.estimator.TrainSpec(
                input_fn=lambda: estimator_model.input_fn(
                    data_dir=data_dir,
                    subset="train",
                    num_shards=num_shards,
                    batch_size=args.train_batch_size,
                    X_cols_to_use=args.X_cols_to_use,
                    input_data_format=args.input_data_format,
                    shuffle=shuffle_train,
                    additional_arrays=additional_arrays,
                    delta_t_mean=args.delta_t_mean,
                    delta_t_std=args.delta_t_std),
                max_steps=total_train_steps if not args.overwrite else 10,
                hooks=training_hooks)
    with tf.name_scope("EvalSpec_and_exporter"):
        with tf.name_scope("Exporter"):
            # TODO Define function to process the input e.g. seq for the whole user - this function used to simulate real data
            exporters = []
            for key_metric in args.key_metrics:
                exporters.append(
                    tf.estimator.BestExporter(
                        name=key_metric,
                        serving_input_receiver_fn=estimator_model.
                        serving_input_fn(args),
                        compare_fn=estimator_checkpointing.
                        custom_checkpoint_compare_fn(default_key=key_metric),
                        exports_to_keep=1,
                        as_text=False))
        with tf.name_scope("EvalSpec"):
            eval_spec = tf.estimator.EvalSpec(
                input_fn=lambda: estimator_model.input_fn(
                    data_dir=data_dir,
                    subset="validation",
                    num_shards=num_shards,
                    batch_size=args.validation_batch_size,
                    X_cols_to_use=args.X_cols_to_use,
                    input_data_format=args.input_data_format,
                    shuffle=shuffle_val,
                    additional_arrays=additional_arrays,
                    delta_t_mean=args.delta_t_mean,
                    delta_t_std=args.delta_t_std),
                exporters=exporters if args.use_exporter else None,  #TODO
                steps=eval_steps_during_train if not args.overwrite else 1,
                throttle_secs=throttle_secs,
                start_delay_secs=args.start_delay_secs)
    # Optional warm-up training phase with no evaluation.
    if train_steps_without_evaluation > 0:
        print(
            "Starting preliminary training for {} steps during which no evaluation is performed."
            .format(train_steps_without_evaluation))
        estimator.train(input_fn=lambda: estimator_model.input_fn(
            data_dir=data_dir,
            subset="train",
            num_shards=num_shards,
            batch_size=args.train_batch_size,
            X_cols_to_use=args.X_cols_to_use,
            input_data_format=args.input_data_format,
            shuffle=shuffle_train,
            additional_arrays=additional_arrays,
            delta_t_mean=args.delta_t_mean,
            delta_t_std=args.delta_t_std),
                        max_steps=train_steps_without_evaluation
                        if not args.overwrite else 10,
                        hooks=training_hooks)
        # Export the model for the offchance that the metrics for validation don't improve after the first run
        # when I believe no export is performed
        export_dir = os.path.join(args.model_dir, "export", args.key_metrics[0])
        estimator.export_savedmodel(export_dir,
                                    estimator_model.serving_input_fn(args),
                                    strip_default_attrs=True)
    print(
        "Starting Train and Evaluate for {} training steps with Evaluation every {} second(s) or {} steps for {} evaluation steps."
        .format(total_train_steps, throttle_secs, save_checkpoints_steps,
                eval_steps_during_train))
    with tf.name_scope("Train_and_Evaluate"):
        tf.estimator.train_and_evaluate(estimator=estimator,
                                        train_spec=train_spec,
                                        eval_spec=eval_spec)
    if args.exec_loc == "sagemaker":
        # SageMaker run: post-process and optionally batch-predict from S3.
        updated_model_path = estimator_sagemaker.sagemaker_postprocessing(args)
        predictor_param_names = [
            "predictor_s3_input_path", "predictor_s3_output_path",
            "predictor_batch_size"
        ]
        predictor_params = [getattr(args, x) for x in predictor_param_names]
        if np.all([x is not None for x in predictor_params]):
            estimator_sagemaker.predict_s3_numpy(
                saved_model_path=updated_model_path,
                input_s3_path=args.predictor_s3_input_path,
                output_s3_path=args.predictor_s3_output_path,
                batch_size=args.predictor_batch_size)
    else:
        # Evaluate trained model
        steps_in_epoch_test = np.ceil(args.num_samples["test"] /
                                      args.validation_batch_size)
        shuffle_test = args.num_samples["train"] if args.shuffle_test else 1
        with tf.name_scope("Evaluate_trained_model"):
            train_input_fn = lambda: estimator_model.input_fn(
                data_dir=data_dir,
                subset="train",
                num_shards=
                num_shards,  #Switch to one and adjust bs/num_gpu for single device
                batch_size=args.
                train_batch_size,  #TODO Does that work for serving
                X_cols_to_use=args.X_cols_to_use,
                input_data_format=args.input_data_format,
                shuffle=shuffle_train,
                additional_arrays=additional_arrays,
                delta_t_mean=args.delta_t_mean,
                delta_t_std=args.delta_t_std)
            test_input_fn = lambda: estimator_model.input_fn(
                data_dir=data_dir,
                subset="test",
                num_shards=num_shards,
                batch_size=args.validation_batch_size,
                X_cols_to_use=args.X_cols_to_use,
                input_data_format=args.input_data_format,
                shuffle=shuffle_test,
                additional_arrays=additional_arrays,
                delta_t_mean=args.delta_t_mean,
                delta_t_std=args.delta_t_std)
            if not args.final_eval_multiple_models:
                # Find best checkpoint and its associated metrics
                best_checkpoint_path, best_checkpoint_metrics = estimator_checkpointing.best_checkpoint(
                    model_dir=args.model_dir,
                    eval_dir=estimator.eval_dir(),
                    metric=args.key_metrics[0])
                print("Best checkpoint: {}".format(best_checkpoint_path))
                print("Best metrics: {}".format(best_checkpoint_metrics))
                # Remove model_dir from previous run_config as that causes evaluation to ignore warm_start_from
                eval_run_config = deepcopy(run_config)
                setattr(eval_run_config, "_model_dir", None)
                # New estimator restarted with best result for user-specified metric
                estimator = estimator_model.create_estimator(
                    eval_run_config,
                    hparams,
                    warm_start_from=best_checkpoint_path)
                train_results = estimator.evaluate(input_fn=train_input_fn,
                                                   steps=steps_in_epoch_train)
                print("Final evaluation on train subset: {}".format(
                    train_results))
                test_results = estimator.evaluate(input_fn=test_input_fn,
                                                  steps=steps_in_epoch_test)
                print(
                    "Final evaluation on test subset: {}".format(test_results))
            else:
                estimator_checkpointing.evaluate_multiple_checkpoints(
                    model_dir=args.model_dir,
                    eval_dir=estimator.eval_dir(),
                    num_checkpoints=args.keep_checkpoint_max,
                    metric=args.key_metrics[0],
                    input_fn=test_input_fn,
                    run_config=run_config,
                    hparams=hparams,
                    num_steps_in_eval=steps_in_epoch_test
                    if not args.overwrite else 1)
    if args.clear_checkpoints:
        # NOTE(review): `model_dir` is only bound when hparams.model_dir was
        # None above — verify clear_checkpoints isn't combined with a
        # preset model_dir (would raise NameError here).
        rm_graph_command = "for f in $(find {} -name 'graph.pbtxt'); do rm $f; done".format(
            str(model_dir))
        rm_checkpoints_command = "for f in $(find {} -name 'model.ckpt-*'); do rm $f; done".format(
            str(model_dir))
        process = subprocess.run(rm_graph_command, shell=True, check=True)
        process = subprocess.run(rm_checkpoints_command,
                                 shell=True,
                                 check=True)
        print("Cleared model_dir: {}".format(str(model_dir)))
def main(_):
    """Train, evaluate and predict a DNN on Iris data (downloaded CSVs or
    fake random data), optionally attaching a tfdbg debugger hook."""
    if FLAGS.fake_data:
        def training_input_fn():
            return ({"features": tf.random_normal([128, 4])},
                    tf.random_uniform([128], minval=0, maxval=3,
                                      dtype=tf.int32))

        def test_input_fn():
            return ({"features": tf.random_normal([32, 4])},
                    tf.random_uniform([32], minval=0, maxval=3,
                                      dtype=tf.int32))

        feature_columns = [
            tf.feature_column.numeric_column("features", shape=(4, ))
        ]
    else:
        training_data_path, test_data_path = maybe_download_data(
            FLAGS.data_dir)
        column_names = [
            "sepal_length", "sepal_width", "petal_length", "petal_width",
            "label"
        ]
        batch_size = 32

        def training_input_fn():
            return tf.contrib.data.make_csv_dataset(
                [training_data_path], batch_size,
                column_names=column_names, label_name="label")

        def test_input_fn():
            return tf.contrib.data.make_csv_dataset(
                [test_data_path], batch_size,
                column_names=column_names, label_name="label")

        feature_columns = [
            tf.feature_column.numeric_column(feature)
            for feature in column_names[:-1]
        ]

    # 3-layer DNN with 10, 20, 10 units respectively.
    model_dir = FLAGS.model_dir or tempfile.mkdtemp(
        prefix="debug_tflearn_iris_")
    classifier = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        hidden_units=[10, 20, 10],
        n_classes=3,
        model_dir=model_dir)

    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")

    hooks = []
    if FLAGS.debug:
        hooks.append(tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type,
                                                dump_root=FLAGS.dump_root))
    elif FLAGS.tensorboard_debug_address:
        hooks.append(
            tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address))

    # Train, evaluate and predict, passing the tfdbg hooks each time.
    classifier.train(training_input_fn, steps=FLAGS.train_steps, hooks=hooks)

    accuracy_score = classifier.evaluate(test_input_fn,
                                         steps=FLAGS.eval_steps,
                                         hooks=hooks)["accuracy"]
    print("After training %d steps, Accuracy = %f" %
          (FLAGS.train_steps, accuracy_score))

    predict_results = classifier.predict(test_input_fn, hooks=hooks)
    print("A prediction result: %s" % next(predict_results))
#treshold on what messages are to be logged tf.logging.set_verbosity(tf.logging.INFO) #importing debug library from tensorflow.python import debug as tf_debug # ## Debugger # # ### Uncomment the below line and execute the code to run the debugger. # # ### Go to the link once you start execution http://localhost:6006/ # In[2]: #Uncomment the below line to run the debugger #Add monitor=[hook] as a parameter to the estimators below hook = tf_debug.TensorBoardDebugHook("localhost:6064", send_traceback_and_source_code=False) #hook = tf_debug.LocalCLIDebugHook() # In[3]: def cnn_model_fn(features, labels, mode): """Model function for CNN.""" # Input Layer # Reshape X to 4-D tensor: [batch_size, width, height, channels] # MNIST images are 28x28 pixels, and have one color channel input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) # Convolutional Layer #1 # Computes 32 features using a 5x5 filter with ReLU activation. # Padding is added to preserve width and height.
def train(train_model, eval_model=None, debug_port=None):
  """Run the training loop for ``train_model``, optionally evaluating
  ``eval_model`` every ``eval_steps`` and attaching a TensorBoard debugger.

  Args:
    train_model: model object exposing ``hvd``, ``params``, ``last_step``,
      ``train_op``, ``get_data_layer`` and related attributes.
    eval_model: optional evaluation model; its ``params`` must contain
      ``eval_steps`` when provided.
    debug_port: optional int; when set, a TensorBoardDebugHook is attached
      on ``localhost:<debug_port>``.

  Raises:
    ValueError: if ``eval_model`` is given without an ``eval_steps`` param.
  """
  if eval_model is not None and 'eval_steps' not in eval_model.params:
    raise ValueError("eval_steps parameter has to be specified "
                     "if eval_model is provided")
  hvd = train_model.hvd
  # Under Horovod only rank 0 checkpoints/logs; otherwise this process is it.
  if hvd:
    master_worker = hvd.rank() == 0
  else:
    master_worker = True

  # initializing session parameters
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  # pylint: disable=no-member
  sess_config.gpu_options.allow_growth = True
  if hvd is not None:
    # Pin each Horovod worker to its own local GPU.
    # pylint: disable=no-member
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

  # defining necessary hooks
  hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
  if hvd is not None:
    hooks.append(BroadcastGlobalVariablesHook(0))

  # Only the master worker owns the checkpoint directories.
  if master_worker:
    checkpoint_dir = train_model.params['logdir']
    base_ckpt_dir = train_model.params['load_model']
  else:
    checkpoint_dir = None
    base_ckpt_dir = None

  if eval_model is not None:
    # noinspection PyTypeChecker
    hooks.append(
        RunEvaluationHook(
            every_steps=eval_model.params['eval_steps'],
            model=eval_model,
            last_step=train_model.last_step,
            print_ppl=isinstance(eval_model.get_data_layer(), WKTDataLayer),
        ),
    )

  if master_worker:
    # Checkpointing, loss printing and sample printing are optional,
    # each gated by its corresponding param being non-None.
    if train_model.params['save_checkpoint_steps'] is not None:
      # noinspection PyTypeChecker
      saver = tf.train.Saver(save_relative_paths=True)
      hooks.append(tf.train.CheckpointSaverHook(
          checkpoint_dir,
          saver=saver,
          save_steps=train_model.params['save_checkpoint_steps'],
      ))
    if train_model.params['print_loss_steps'] is not None:
      # noinspection PyTypeChecker
      hooks.append(PrintLossAndTimeHook(
          every_steps=train_model.params['print_loss_steps'],
          model=train_model,
          print_ppl=isinstance(train_model.get_data_layer(), WKTDataLayer),
      ))
    if train_model.params['print_samples_steps'] is not None:
      # noinspection PyTypeChecker
      hooks.append(PrintSamplesHook(
          every_steps=train_model.params['print_samples_steps'],
          model=train_model,
      ))

  total_time = 0.0
  # Steps before `bench_start` are excluded from the throughput benchmark.
  bench_start = train_model.params.get('bench_start', 10)

  if debug_port:
    hooks.append(
        tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port))
    )

  # One iterator per GPU unless running under Horovod (one model per process).
  if train_model.on_horovod:
    init_data_layer = train_model.get_data_layer().iterator.initializer
  else:
    init_data_layer = tf.group(
        [train_model.get_data_layer(i).iterator.initializer
         for i in range(train_model.num_gpus)]
    )

  # NOTE(review): `fine_tuning` is True when there is NO base checkpoint (or a
  # checkpoint already exists in logdir), yet that branch pairs a plain
  # tf.train.Scaffold with TransferMonitoredTrainingSession below while the
  # else-branch pairs TransferScaffold with the regular session. The naming /
  # pairing looks inverted — confirm against the original transfer-learning code.
  fine_tuning = (not base_ckpt_dir) or tf.train.latest_checkpoint(checkpoint_dir)
  if fine_tuning:
    scaffold = tf.train.Scaffold(
        local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer)
    )
  else:
    scaffold = TransferScaffold(
        local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer)
    )

  fetches = [train_model.train_op]
  try:
    total_objects = 0.0
    # on horovod num_gpus is 1
    for worker_id in range(train_model.num_gpus):
      fetches.append(train_model.get_num_objects_per_step(worker_id))
  except NotImplementedError:
    deco_print("WARNING: Can't compute number of objects per step, since "
               "train model does not define get_num_objects_per_step method.")

  # starting training
  if fine_tuning:
    sess = TransferMonitoredTrainingSession(
        scaffold=scaffold,
        checkpoint_dir=checkpoint_dir,
        save_summaries_steps=train_model.params['save_summaries_steps'],
        config=sess_config,
        save_checkpoint_secs=None,
        log_step_count_steps=train_model.params['save_summaries_steps'],
        stop_grace_period_secs=300,
        hooks=hooks,
        base_ckpt_dir=base_ckpt_dir,
        load_fc=train_model.params['load_fc'])
  else:
    sess = tf.train.MonitoredTrainingSession(
        scaffold=scaffold,
        checkpoint_dir=checkpoint_dir,
        save_summaries_steps=train_model.params['save_summaries_steps'],
        config=sess_config,
        save_checkpoint_secs=None,
        log_step_count_steps=train_model.params['save_summaries_steps'],
        stop_grace_period_secs=300,
        hooks=hooks)

  step = 0
  num_bench_updates = 0
  while True:
    if sess.should_stop():
      break
    tm = time.time()
    try:
      feed_dict = {}
      # With iter_size > 1, gradients are accumulated over iter_size steps
      # and the variable update is applied only when step % iter_size == 0.
      iter_size = train_model.params.get('iter_size', 1)
      if iter_size > 1:
        feed_dict[train_model.skip_update_ph] = step % iter_size != 0
      if step % iter_size == 0:
        if step >= bench_start:
          num_bench_updates += 1
        fetches_vals = sess.run(fetches, feed_dict)
      else:
        # necessary to skip "no-update" steps when iter_size > 1
        def run_with_no_hooks(step_context):
          return step_context.session.run(fetches, feed_dict)
        fetches_vals = sess.run_step_fn(run_with_no_hooks)
    except tf.errors.OutOfRangeError:
      break
    if step >= bench_start:
      total_time += time.time() - tm
      if len(fetches) > 1:
        # fetches[1:] hold per-worker object counts (e.g. tokens processed).
        for i in range(train_model.num_gpus):
          total_objects += np.sum(fetches_vals[i + 1])
        if train_model.params['print_bench_info_steps'] is not None:
          if step % train_model.params['print_bench_info_steps'] == 0:
            total_objects_cur = collect_if_horovod(total_objects, hvd,
                                                   mode="sum")
            if master_worker:
              avg_objects = 1.0 * total_objects_cur / total_time
              deco_print("Avg objects per second: {:.3f}".format(avg_objects))
    step += 1
  sess.close()

  # Aggregate benchmark counters across Horovod workers and report.
  if len(fetches) > 1:
    total_objects = collect_if_horovod(total_objects, hvd, mode="sum")
  if master_worker:
    deco_print("Finished training")
    if step > bench_start:
      avg_time = 1.0 * total_time / num_bench_updates
      deco_print("Avg time per step: {:.3f}s".format(avg_time))
      if len(fetches) > 1:
        avg_objects = 1.0 * total_objects / total_time
        deco_print("Avg objects per second: {:.3f}".format(avg_objects))
    else:
      deco_print("Not enough steps for benchmarking")
def main(argv):
    """End-to-end driver: parse CLI arguments, load hyperparameters, build
    feature/label data sets, construct a DNN regressor (premade or custom
    estimator), then either train it or run evaluation/prediction with
    optional plotting and analysis.

    Exits the process via sys.exit() on configuration or I/O errors.
    """
    args = parser.parse_args(argv[1:])
    # handling commandline parameters
    logging.info("Cmdline Input: {}".format(argv))
    TRAINING = args.training
    WITHPLOT = args.plot
    singleData = args.single
    FAKE = args.fake
    numberPrint = args.plotNo
    hyperParamFile = args.hyperparams
    saving = args.save
    loading = args.load
    augment = args.augment
    filterBool = args.filter
    overrideModelPath = args.overrideModel
    overrideInputPath = args.overrideInput
    usingCustomEstimator = args.custom
    displayWeights = args.dispWeights
    DEBUG = args.debug
    tensorboardDebugAddress = args.tensorboard_debug_address
    progressPlot = args.progressPlot
    maximumLossAnalysis = args.lossAna
    cancelThreshold = args.target

    # Commandline parameters sanity checks
    saveLoc = None
    if args.save is not None and args.load is not None:
        raise ValueError(
            "The --load and --save flags are mutually exclusive.")
    # NOTE(review): the message says "or two" but the check accepts 0 or 1
    # values — message and condition disagree; confirm which is intended.
    if args.save is not None and len(args.save) not in (0, 1):
        parser.error('Either give no values for save, or two, not {}.'.format(len(args.save)))
    elif args.save is not None:
        if len(args.save) == 0:
            # save to default location
            saveLoc = None
        elif len(args.save) == 1:
            # custom save location
            saveLoc = args.save[0]

    loadLoc = None
    if args.load is not None and len(args.load) not in (0, 1):
        parser.error('Either give no values for load, or one, not {}.'.format(len(args.load)))
        # NOTE(review): parser.error() already exits the process, so this
        # sys.exit(-1) is unreachable.
        sys.exit(-1)
    elif args.load is not None:
        if len(args.load) == 0:
            # save to default location
            loadLoc = None
        elif len(args.load) == 1:
            # custom save location
            loadLoc = args.load[0]

    if args.separator is not None and FAKE:
        parser.error('No fake data for separator training (yet)')
    if args.separator is not None and len(args.separator) not in (0, 2):
        parser.error('Separator needs 2 Integers representing prediction Close off and separator position: given {}'.format(len(args.separator)))
    elif args.separator is not None:
        separator = True
        if len(args.separator) == 0:
            # Defaults for separator position / prediction cut-off.
            separatorPosition = 1550
            predictionCutOff = 1300
        else:
            separatorPosition = args.separator[0]
            predictionCutOff = args.separator[1]
    else:
        separator = False

    if cancelThreshold is not None and not TRAINING:
        logging.warning("target parameter is not useful when not in training")

    time_stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H.%M.%S')

    # load hyperparameters from hyperparameter file
    try:
        hyper_params = load_params(hyperParamFile)
        STEPS_PER_EPOCH = hyper_params.train.steps_per_epoch
        EPOCHS = hyper_params.train.epochs
        BATCH_SIZE = hyper_params.train.batch_size
        FEATURE_SIZE = hyper_params.arch.feature_size
        ACTIVATION = hyper_params.arch.activation  # "leaky_relu", "relu", "linear", TODO: "sigmoid", "tanh"
        dropout = hyper_params.arch.dropout_rate
        hidden_layers = hyper_params.arch.hidden_layers
        regularization = hyper_params.arch.regularization
        # Map the regularization string onto the two boolean switches.
        if regularization is None or regularization.lower() == "no":
            l1regularization = False
            l2regularization = False
        elif regularization.lower() == "l1":
            l1regularization = True
            l2regularization = False
        elif regularization.lower() == "l2":
            l1regularization = False
            l2regularization = True
        else:
            raise AttributeError('invalid string in hyper_params.arch.regularization')
        if FAKE:
            FAKE_DATA_AMOUNT = hyper_params.data.numberFakeLines
        if augment:
            MIDPOINT = hyper_params.data.augmentMidpoint
            MIRRORRANGE = hyper_params.data.augmentRange
        testSize = hyper_params.data.testSize
        limits = hyper_params.data.limits
        elementsDirection = hyper_params.data.direction
        # True = elements travel along y, False = along x.
        if elementsDirection.lower() == "y":
            elementsDirectionBool = True
        elif elementsDirection.lower() == "x":
            elementsDirectionBool = False
        unitLocDirection = hyper_params.data.unitLoc
        unitTimeDirection = hyper_params.data.unitTime
        units = {'loc': unitLocDirection, 'time': unitTimeDirection}
        optimizer = hyper_params.train.optimizer  # "Adam", "Adagrad"
        learningRate = hyper_params.train.learning_rate
        decaySteps = hyper_params.train.decay_steps
        if overrideInputPath is None:
            dataFolder = hyper_params.problem.data_path
        else:
            dataFolder = overrideInputPath
        baseModelPath = hyper_params.problem.modelBasePath
        baseImagePath = hyper_params.problem.imagePath
        # CLI --separator takes precedence; otherwise fall back to the file.
        if args.separator is None:
            if hyper_params.problem.separator == 1:
                separator = True
                separatorPosition = hyper_params.problem.separatorPosition
                predictionCutOff = hyper_params.problem.predictionCutOff
                thresholdPoint = hyper_params.problem.thresholdPoint
            else:
                separator = False
    except AttributeError as err:
        logging.error("Error in Parameters. Maybe mistake in hyperparameter file?")
        logging.error("AttributeError: {0}".format(err))
        sys.exit(1)
    except Exception as e:
        logging.error("Some kind of error? not sure: {}".format(e))
        sys.exit(1)

    if loading is None:
        # Generate feature-label-pairs from given csv track files based on given parameters
        if not FAKE and not separator:
            (F_train, L_train), (F_test, L_test), (labelMeans, labelStds) = ld.loadRawMeasNextStep(dataFolder, FEATURE_SIZE, testSize)
        elif separator:
            (F_train, L_train), (F_test, L_test), (labelMeans, labelStds) = ld.loadRawMeasSeparation(dataFolder, FEATURE_SIZE, testSize, separatorPosition, predictionCutOff, elementsDirectionBool)
            if filterBool:
                # Keep only tracks that cross the threshold point; realign labels.
                F_train = filterDataForIntersection(F_train, thresholdPoint, elementsDirectionBool)
                F_test = filterDataForIntersection(F_test, thresholdPoint, elementsDirectionBool)
                L_train = L_train.loc[F_train.index]
                L_test = L_test.loc[F_test.index]
        else:
            (F_train, L_train), (F_test, L_test) = ld.loadFakeDataPandas(FEATURE_SIZE, FAKE_DATA_AMOUNT, testSize)

        # TODO: rather ugly — this could still be done better
        # (singleData mode: train and test on the combined set)
        if singleData:
            F_train = pd.concat([F_train, F_test])
            F_test = F_train
            L_train = pd.concat([L_train, L_test])
            L_test = L_train

        # ExTODO: find Augmentation MIDPOINT from data or as argument? - from Argument
        # Applying augmentation to feature-label-pairs
        if augment:
            logging.info("applying augmentation to Training Set...")
            # NOTE(review): both branches call augmentData with identical
            # arguments — the if/else is redundant as written; confirm intent.
            if separator:
                F_train, L_train = augmentData(F_train, L_train, MIDPOINT, MIRRORRANGE, separator, labelMeans, labelStds, direction=elementsDirectionBool)
            else:
                F_train, L_train = augmentData(F_train, L_train, MIDPOINT, MIRRORRANGE, separator, labelMeans, labelStds, direction=elementsDirectionBool)
            # Shuffle features and labels with the same seed to keep pairs aligned.
            state = random.randint(1, 101)
            F_train = F_train.sample(frac=1, random_state=state)
            L_train = L_train.sample(frac=1, random_state=state)
            logging.info("done!")

    # Network Design
    # --------------
    my_feature_columns = []
    columnNames = ld.genColumnNames(FEATURE_SIZE)
    for key in columnNames:
        my_feature_columns.append(tf.feature_column.numeric_column(key=key))

    if not overrideModelPath:
        MODEL_PATH = baseModelPath  # genModelPath(hyper_params, FAKE, usingCustomEstimator, separator)
    else:
        MODEL_PATH = overrideModelPath
    logging.info("time: {}".format(time_stamp))
    logging.info('Saving to %s' % MODEL_PATH)

    # Preparing the initialisation of the estimator
    if optimizer == 'Adagrad':
        opti = tf.train.AdagradOptimizer
    elif optimizer == 'Adam':
        opti = tf.train.AdamOptimizer
    # elif optimizer == 'GradientDescent':
    #     opti = tf.train.GradientDescentOptimizer
    else:
        logging.error("No (or wrong) optimizer given in hyperparameter file")
        sys.exit(-1)

    if ACTIVATION == 'relu':
        acti = tf.nn.relu
    elif ACTIVATION == 'leaky_relu':
        acti = tf.nn.leaky_relu
    elif ACTIVATION == 'linear':
        acti = None
    else:
        logging.error("No (or wrong) activation function given in hyperparameter file")
        sys.exit(-1)

    # File System preparation: check if right folders exist and create them if they dont
    if not os.path.exists(MODEL_PATH):
        os.makedirs(MODEL_PATH)
        logging.info("model folder {} does not exist. Creating folder".format(MODEL_PATH))
    elif os.path.exists(MODEL_PATH) and not os.path.isdir(MODEL_PATH):
        logging.error("There is a file in the place where one would like to save their files..")
        sys.exit(1)
    if not os.path.exists(baseImagePath):
        os.makedirs(baseImagePath)
        logging.info("image folder: {} does not exist. Creating folder".format(MODEL_PATH))
    # Keep a copy of the hyperparameter file next to the model; if one already
    # exists, store a time-stamped version instead of overwriting it.
    if not os.path.exists(MODEL_PATH + '/' + os.path.basename(hyperParamFile)):
        shutil.copy2(hyperParamFile, MODEL_PATH + '/' + os.path.basename(MODEL_PATH + hyperParamFile))
        # print("new hyperParam File written")
    else:
        shutil.copy2(hyperParamFile, MODEL_PATH + '/' + os.path.basename(hyperParamFile)[:-5] + time_stamp + ".json")
        # print("added another version of hyper param file")

    # Saving the generated feature-label-pairs for future use
    if saving is not None:
        # NOTE(review): this logs saveLoc before the default is applied, so it
        # prints "None" when no explicit location was given.
        logging.info("storing data in {}".format(saveLoc))
        if saveLoc is None:
            saveLoc = MODEL_PATH + '/data.h5'
        with pd.HDFStore(saveLoc) as store:
            store['xtrain'] = F_train
            store['ytrain'] = L_train
            store['xtest'] = F_test
            store['ytest'] = L_test
            store['labelMeans'] = labelMeans
            store['labelStds'] = labelStds

    # loading a set of pregenerated feature-label-pairs for usage
    if loading is not None:
        try:
            if loadLoc is None:
                loadLoc = MODEL_PATH + '/data.h5'
            logging.info("loading data from {}.".format(loadLoc))
            with pd.HDFStore(loadLoc) as store:
                F_train = store['xtrain']
                L_train = store['ytrain']
                F_test = store['xtest']
                L_test = store['ytest']
                labelMeans = store['labelMeans']
                labelStds = store['labelStds']
        except Exception as e:
            logging.error("Error while loading from stored data: {}".format(e))
            sys.exit(1)

    # Index duplicates would silently corrupt the .loc-based realignments above.
    assert not F_train.index.duplicated().any()
    assert not L_train.index.duplicated().any()
    assert not F_test.index.duplicated().any()
    assert not L_test.index.duplicated().any()

    # Plot progress Vars - more or less deprecated, but could be updated for current state
    if progressPlot:
        # Epochs at which an intermediate prediction snapshot is taken (every 10%).
        pos = [int(i * EPOCHS/10) for i in range(1, 10)]
        debugVisualizerIndex = random.randint(1, F_test.shape[0])
        featureVals = F_test.iloc[[debugVisualizerIndex]]
        labelVals = L_test.iloc[[debugVisualizerIndex]]
        predictions = []

    if not usingCustomEstimator:
        # Validation and Test Configuration
        logging.info("using premade Estimator")
        test_config = estimator.RunConfig(save_checkpoints_steps=50000,
                                          save_checkpoints_secs=None,
                                          save_summary_steps=100)
        regressor = estimator.DNNRegressor(feature_columns=my_feature_columns,
                                           label_dimension=2,
                                           hidden_units=hidden_layers,
                                           model_dir=MODEL_PATH,
                                           dropout=dropout,
                                           activation_fn=acti,
                                           config=test_config,
                                           optimizer=opti(learning_rate=learningRate)
                                           )
    else:
        logging.info("using custom estimator")
        test_config = estimator.RunConfig(save_checkpoints_steps=100000,
                                          save_checkpoints_secs=None,
                                          save_summary_steps=500)
        useRatioScaling = False  # Todo: consider whether there is still a sensible use for this
        if separator and useRatioScaling:
            # Scale the per-dimension loss by the ratio of the label medians.
            medianDim1 = L_train.iloc[:, 0].median()
            medianDim2 = L_train.iloc[:, 1].median()
            ratio = medianDim1 / medianDim2
            scaleDim1 = 1.0
            scaleDim2 = ratio
            logging.info("scaling loss between different dimensions. ScaleDim2-Ratio: {}".format(ratio))
        else:
            scaleDim1 = 1.0
            scaleDim2 = 1.0
        regressor = estimator.Estimator(
            model_fn=cE.myCustomEstimator,
            config=test_config,
            model_dir=MODEL_PATH,
            params={
                "feature_columns": my_feature_columns,
                "learning_rate": learningRate,
                "optimizer": opti,
                "hidden_units": hidden_layers,
                "dropout": dropout,
                "activation": acti,
                "decaying_learning_rate": True,
                "decay_steps": decaySteps,
                "l1regularization": l1regularization,
                "l2regularization": l2regularization,
                "scaleDim1": scaleDim1,
                "scaleDim2": scaleDim2,
                "regularizationStrength": 5e-08
            })

    hooks = None
    # Debug hooks are handled here
    if DEBUG and tensorboardDebugAddress:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    if DEBUG:
        hooks = [tf_debug.LocalCLIDebugHook()]
    # Start tensorboard with debugger port argument: "tensorboard --logdir=./debug2 --debugger_port 6007"
    elif tensorboardDebugAddress:
        hooks = [tf_debug.TensorBoardDebugHook(tensorboardDebugAddress)]
        # hooks = [debug_hook]

    logging.info("Train: ({}, {})".format(F_train.shape, L_train.shape))
    logging.info("Test: ({}, {})".format(F_test.shape, L_test.shape))
    logging.info("Means: \n{}".format(labelMeans))
    logging.info("Stds: \n{}".format(labelStds))

    # Train it
    if TRAINING:
        # Persist the label normalization constants alongside the model; warn if
        # they differ from a previously stored pair.
        if not os.path.exists(MODEL_PATH + '/meanstd.pkl'):
            with open(MODEL_PATH + "/meanstd.pkl", 'wb') as f:
                pickle.dump([labelMeans, labelStds], f)
        else:
            with open(MODEL_PATH + "/meanstd.pkl", 'rb') as f:
                [labelMeansTemp, labelStdsTemp] = pickle.load(f)
            if not ((labelMeansTemp == labelMeans).all() and (labelStdsTemp == labelStds).all()):
                # does this work with float?
                logging.warning("CAREFUL: LabelMeans or LabelStds do not match existing values! Training with new values")

        logging.info('Train the DNN Regressor...\n')
        # test = tf.train.get_or_create_global_step()
        # logging.info("test: {}".format(test))
        epochInterm = []
        startTimeTraining = timer()
        for epoch in range(EPOCHS):
            # Fit the DNNRegressor
            # regressor.train(input_fn=training_input_fn(batch_size=BATCH_SIZE), steps=STEPS_PER_EPOCH)
            regressor.train(input_fn=lambda: training_input_fn_Slices(F_train, L_train, BATCH_SIZE),
                            steps=STEPS_PER_EPOCH, hooks=hooks)

            # Start Tensorboard in Terminal:
            # tensorboard --logdir='./DNNRegressors/'
            # Now open Browser and visit localhost:6006\

            # Evaluate every 10th epoch and record the running loss history.
            if epoch % 10 == 0:
                logging.info("Progress: epoch " + str(epoch))
                # logging.info("Progress: global step: {}".format(tf.train.get_global_step()))
                eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE))
                logging.info("eval: " + str(eval_dict))
                avgLoss = eval_dict['average_loss']
                epochInterm.append(avgLoss)

                # optional canceling of training upon hitting a specified loss threshold
                if cancelThreshold is not None:
                    if avgLoss < cancelThreshold:
                        logging.info("reached cancel Threshold. finishing training")
                        break

            if progressPlot and epoch in pos:
                # TODO: adapt or remove because of standardize and normalize
                debug_pred = regressor.predict(input_fn=lambda: eval_input_fn(featureVals, labels=None, batch_size=BATCH_SIZE))
                debug_predicted = [p['predictions'] for p in debug_pred]
                predictions.append(debug_predicted)

        eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE))
        logging.info("Training completed. final average loss: {}, best average loss during training: {}".format(
            eval_dict['average_loss'], min(epochInterm)))

        endTimeTraining = timer()
        timeTotal = endTimeTraining - startTimeTraining
        # Split elapsed seconds into h/min/s for the log line.
        hours = timeTotal // 3600
        timeTotal %= 3600
        minutes = timeTotal // 60
        timeTotal %= 60
        logging.info("Total Training time: {}h {}min {}s".format(int(hours), int(minutes), int(timeTotal)))

        if progressPlot:
            if FAKE:
                savePath = '/home/hornberger/testFake'
            else:
                savePath = '/home/hornberger/testReal'
            plotTrainDataPandas(featureVals, labelVals, predictions, savePath, units)

    # Evaluation/Prediction
    else:
        logging.info('No training today, just prediction')
        if not os.path.exists(MODEL_PATH + '/meanstd.pkl'):
            logging.warning("Careful: No prior LabelMeans or LabelStds found!")
        else:
            with open(MODEL_PATH + "/meanstd.pkl", 'rb') as f:
                [labelMeansTemp, labelStdsTemp] = pickle.load(f)
            if not ((labelMeansTemp == labelMeans).all() and (labelStdsTemp == labelStds).all()):
                # does this work with float?
                # Re-normalize the test labels with the stored training-time
                # constants so evaluation is consistent with the saved model.
                logging.warning("evaluation on different dataset. replacing current labelMeans and labelStds")
                L_test = L_test * labelStds + labelMeans
                labelMeans = labelMeansTemp
                labelStds = labelStdsTemp
                logging.info("New labelMeans: \n{}".format(labelMeans))
                logging.info("New labelStds: \n{}".format(labelStds))
                L_test = (L_test - labelMeans) / labelStds

        try:
            # Prediction
            eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE))
            logging.info('Error on whole Test set:\nMSE (tensorflow): {}'.format(eval_dict['average_loss']))
            averageLoss = eval_dict['average_loss']
        except ValueError as err:
            # probably failed to load model
            logging.error("{}".format(err))
            sys.exit(1)
        except Exception as e:
            logging.error("Unknown Error while trying to evaluate: {}".format(e))
            sys.exit(1)

        # Draw a random sample of numberPrint rows for display.
        assert numberPrint < L_test.shape[0]
        sampleIndex = random.randint(0, L_test.shape[0] - numberPrint)
        # x_pred2 = F_test.iloc[[sampleIndex + i for i in range(numberPrint)]]
        # y_vals2 = L_test.iloc[[sampleIndex + i for i in range(numberPrint)]]
        x_pred2 = F_test.sample(n=numberPrint, random_state=sampleIndex)
        y_vals2 = L_test.sample(n=numberPrint, random_state=sampleIndex)
        y_vals2Denormalized = y_vals2.copy()
        for k in L_test.columns:
            y_vals2Denormalized[k] = y_vals2Denormalized[k] * labelStds[k] + labelMeans[k]
        print(x_pred2)
        print(y_vals2 * labelStds + labelMeans)

        startTime = timer()
        y_predGen = regressor.predict(input_fn=lambda: eval_input_fn(x_pred2, labels=None, batch_size=BATCH_SIZE))
        y_predicted = [p['predictions'] for p in y_predGen]
        endTime = timer()
        print("predicted: ")
        # De-normalize each predicted vector component-wise.
        y_predictedCorr = [[x * b + c for x, b, c in zip(x, labelStds, labelMeans)] for x in y_predicted]  # Look, ye mighty, and despair!
        for i in y_predictedCorr:
            print(i)
        print("time: {:.2f}s".format((endTime - startTime)))

        eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(x_pred2, y_vals2, batch_size=BATCH_SIZE))
        print('MSE (tensorflow): {}'.format(eval_dict['average_loss']))

        # Maximum Loss Analysis: display the X worst predictions of the testset
        if maximumLossAnalysis:
            if not separator:
                printDF = prepareMaximumLossAnalysisNextStep(F_test, L_test, numberPrint, regressor, BATCH_SIZE, labelMeans, labelStds)
                plotDataNextStepPandas(numberPrint, printDF[columnNames], printDF[['LabelX', 'LabelY']], printDF[['PredictionX', 'PredictionY']],
                                       baseImagePath, limits, units,
                                       os.path.basename(MODEL_PATH) + '_' + 'highestLoss' + '_' + time_stamp + '.pdf')
            else:
                printDF = prepareMaximumLossAnalysisSeparator(F_test, L_test, numberPrint, regressor, BATCH_SIZE, labelMeans, labelStds)
                # printDF['LabelPosBalken'] = printDF['LabelPosBalken'] * labelStds['LabelPosBalken'] + labelMeans['LabelPosBalken']
                plotDataSeparatorPandas(numberPrint, printDF[columnNames], printDF[['LabelPosBalken']], separatorPosition,
                                        printDF[['PredictionIntersect']], baseImagePath, limits, units, elementsDirectionBool,
                                        os.path.basename(MODEL_PATH) + '_' + 'highestLoss' + '_' + time_stamp + '.pdf')
            # print(printDF)

        # displaying weights in Net - (a bit redundant after implementation of debugger)
        if displayWeights:
            for variable in regressor.get_variable_names():
                logging.info("name: \n{}\nvalue: \n{}\n".format(variable, regressor.get_variable_value(variable)))
            weights = regressor.get_variable_value('dense/kernel')
            plt.imshow(weights, cmap='coolwarm')
            plt.show()

        # Final Plot
        if WITHPLOT:
            L_trainDenormalized = L_train * labelStds + labelMeans
            L_testDenormalized = L_test * labelStds + labelMeans
            if not separator:
                plotDataNextStepPandas(numberPrint, x_pred2, y_vals2Denormalized, y_predictedCorr, baseImagePath, limits, units,
                                       os.path.basename(MODEL_PATH) + '_' + time_stamp + '.pdf')
                totalPredictGen = regressor.predict(input_fn=lambda: eval_input_fn(F_test, labels=None, batch_size=BATCH_SIZE))
                totalPredictions = [p['predictions'] for p in totalPredictGen]
                totalPredictionsCorr = [[x * b + c for x, b, c in zip(x, labelStds, labelMeans)] for x in totalPredictions]  # Look, ye mighty, and despair!
                evaluateResultNextStep(F_test, L_testDenormalized, totalPredictionsCorr, units, baseImagePath)
            else:
                # y_vals2Denormalized = y_vals2['LabelPosBalken'] * labelStds['LabelPosBalken'] + labelMeans['LabelPosBalken']
                # y_predictedCorr = list(map(lambda x: [v * labelStds[k] + labelMeans[k] for k,v in enumerate(x)], y_predicted))
                plotDataSeparatorPandas(numberPrint, x_pred2, y_vals2Denormalized['LabelPosBalken'], separatorPosition, y_predictedCorr,
                                        baseImagePath, limits, units, elementsDirectionBool,
                                        os.path.basename(MODEL_PATH) + '_' + time_stamp + '.pdf')
                totalPredictGen = regressor.predict(input_fn=lambda: eval_input_fn(F_test, labels=None, batch_size=BATCH_SIZE))
                totalPredictions = [p['predictions'] for p in totalPredictGen]
                totalPredictionsCorr = [[x * b + c for x, b, c in zip(x, labelStds, labelMeans)] for x in totalPredictions]  # Look, ye mighty, and despair!
                # Baselines for comparison: median/optimal acceleration and a
                # constant-velocity bias computed from the filtered training set.
                filteredFeatures = filterDataForIntersection(F_train, thresholdPoint, elementsDirectionBool)
                medianAccel = getMedianAccel(filteredFeatures, separator, elementsDirectionBool)
                optimalAccel = getOptimalAccel(filteredFeatures, L_trainDenormalized.loc[filteredFeatures.index], separatorPosition, elementsDirectionBool)
                bias = getCVBias(filteredFeatures, L_trainDenormalized.loc[filteredFeatures.index], separatorPosition, elementsDirectionBool)
                configDict = {'medAc': medianAccel, 'optAc': optimalAccel, 'cvBias': bias}
                evaluateResultSeparator(F_test, L_testDenormalized, totalPredictionsCorr, separatorPosition, thresholdPoint,
                                        configDict, units, baseImagePath, elementsDirectionBool)
def train(args: Namespace, data_params: MoleculeData, experiment: Experiment,
          mol_metrics: GraphMolecularMetrics) -> None:
    """Train the MolGAN estimator.

    Builds the training dataflow, wires generator/discriminator (and optional
    value-network) train-op hooks, configures checkpointing and learning-rate
    schedules, then runs ``estimator.train`` with a per-checkpoint
    predict-and-evaluate listener.

    Args:
        args: parsed command-line arguments (paths, batch size, epochs,
            learning rates, schedules, debug flag, ...).
        data_params: dataset description passed through to the model/input fns.
        experiment: factory for model/input/predict functions.
        mol_metrics: molecular metrics used for reward computation and eval.
    """
    ds_train = create_dataflow(args.data_dir, 'train', args.batch_size)
    ds_train_repeat = PrefetchDataZMQ(ds_train, nr_proc=1)
    # times 2, because we consume 2 batches per step
    ds_train_repeat = RepeatedData(ds_train_repeat, 2 * args.epochs)

    train_input_fn = experiment.make_train_fn(ds_train_repeat, args.batch_size,
                                              args.num_latent, data_params)

    def hooks_fn(train_ops: MolGANTrainOps,
                 train_steps: tfgan.GANTrainSteps) -> EstimatorTrainHooks:
        # With a value network, the discriminator hook also trains it and
        # needs reward metrics; otherwise both sides use plain feedable hooks.
        if train_ops.valuenet_train_op is not None:
            generator_hook = FeedableTrainOpsHook(
                train_ops.generator_train_op,
                train_steps.generator_train_steps,
                train_input_fn,
                return_feed_dict=False)
            discriminator_hook = WithRewardTrainOpsHook(
                [train_ops.discriminator_train_op, train_ops.valuenet_train_op],
                train_steps.discriminator_train_steps,
                train_input_fn,
                mol_metrics)
        else:
            generator_hook = FeedableTrainOpsHook(
                train_ops.generator_train_op,
                train_steps.generator_train_steps,
                train_input_fn,
                return_feed_dict=True)
            discriminator_hook = FeedableTrainOpsHook(
                train_ops.discriminator_train_op,
                train_steps.discriminator_train_steps,
                train_input_fn)
        return [generator_hook, discriminator_hook]

    model = experiment.make_model_fn(args, data_params, hooks_fn)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    # enable XLA JIT
    # sess_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config = tf.estimator.RunConfig(model_dir=str(args.model_dir),
                                    session_config=sess_config,
                                    save_summary_steps=ds_train.size(),
                                    save_checkpoints_secs=None,
                                    save_checkpoints_steps=4 * ds_train.size(),
                                    keep_checkpoint_max=2)
    estimator = tf.estimator.Estimator(model.model_fn, config=config)

    train_hooks = [PrintParameterSummary()]
    if args.restore_from_checkpoint is not None:
        train_hooks.append(
            RestoreFromCheckpointHook(str(args.restore_from_checkpoint)))
    if args.debug:
        from tensorflow.python import debug as tf_debug
        train_hooks.append(tf_debug.TensorBoardDebugHook("localhost:6064"))

    predict_fn = experiment.make_predict_fn(args.data_dir,
                                            args.num_latent,
                                            n_samples=1000,
                                            batch_size=1000)
    ckpt_listener = PredictAndEvalMolecule(estimator, predict_fn, mol_metrics,
                                           str(args.model_dir))

    # Step-wise learning-rate decay (epochs converted via steps_per_epoch).
    hparams_setter = [
        ScheduledHyperParamSetter('generator_learning_rate:0',
                                  args.generator_learning_rate,
                                  [(80, 0.5 * args.generator_learning_rate),
                                   (150, 0.1 * args.generator_learning_rate),
                                   (200, 0.01 * args.generator_learning_rate)],
                                  steps_per_epoch=ds_train.size()),
        ScheduledHyperParamSetter(
            'discriminator_learning_rate:0',
            args.discriminator_learning_rate,
            [(80, 0.5 * args.discriminator_learning_rate),
             (150, 0.1 * args.discriminator_learning_rate),
             (200, 0.01 * args.discriminator_learning_rate)],
            steps_per_epoch=ds_train.size())
    ]
    train_hooks.extend(hparams_setter)

    if args.weight_reward_loss > 0:
        if args.weight_reward_loss_schedule == 'linear':
            lambda_setter = ScheduledHyperParamSetter(
                model.params, 'lam',
                [(args.reward_loss_delay, 1.0),
                 (args.epochs, 1.0 - args.weight_reward_loss)], True)
        elif args.weight_reward_loss_schedule == 'const':
            lambda_setter = ScheduledHyperParamSetter(
                model.params, 'lam',
                [(args.reward_loss_delay + 1, 1.0 - args.weight_reward_loss)],
                False)
        else:
            raise ValueError('unknown schedule: {!r}'.format(
                args.weight_reward_loss_schedule))
        # BUG FIX: this used to be `hparams_setter.append(lambda_setter)`,
        # which had no effect because `hparams_setter` had already been copied
        # into `train_hooks` by extend() above — the lambda schedule never ran.
        # Append the hook directly to the list actually passed to train().
        train_hooks.append(lambda_setter)

    train_start = time.time()
    estimator.train(train_input_fn,
                    hooks=train_hooks,
                    saving_listeners=[ckpt_listener])
    train_end = time.time()
    time_d = datetime.timedelta(seconds=int(train_end - train_start))
    LOG.info('Training for %d epochs finished in %s', args.epochs, time_d)
def train():
    """Trains the model."""
    if args.verbose:
        tf.logging.set_verbosity(tf.logging.INFO)

    files = pc_io.get_files(args.train_glob)
    points = pc_io.load_points(files)
    # Category = name of each file's immediate parent directory; the glob is
    # expected to yield only 'train' and 'eval' subdirectories.
    files_cat = np.array(
        [os.path.split(os.path.split(x)[0])[1] for x in files])
    for cat in files_cat:
        assert (cat == 'train') or (cat == 'eval')
    TRAIN_DATASET = points[files_cat == 'train']
    EVAL_DATASET = points[files_cat == 'eval']
    # Sanity check: the split covers every loaded point cloud exactly once.
    assert (len(TRAIN_DATASET) + len(EVAL_DATASET) == len(points))

    config = tf.estimator.RunConfig(
        keep_checkpoint_every_n_hours=1,
        save_checkpoints_secs=args.save_checkpoints_secs,  # 600
        keep_checkpoint_max=args.keep_checkpoint_max,  # 50
        log_step_count_steps=args.log_step_count_steps,  # 100
        save_summary_steps=args.save_summary_steps,  # 100
        tf_random_seed=42)
    estimator = tf.estimator.Estimator(
        model_fn=compression_model_2048.model_fn,
        model_dir=args.checkpoint_dir,
        config=config,
        params={
            'num_points': args.num_point,
            'batch_size': args.batch_size,
            'knn': args.knn,
            'alpha': args.alpha,
            'gamma': args.gamma,
            'lmbda': args.lmbda,
            'additional_metrics': not args.no_additional_metrics,
            'checkpoint_dir': args.checkpoint_dir,
            'data_format': DATA_FORMAT  # channels_first
        })

    # Optional TensorBoard debugger, shared by train and eval specs.
    hooks = None
    if args.debug_address is not None:
        hooks = [tf_debug.TensorBoardDebugHook(args.debug_address)]

    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: compression_model_2048.input_fn(
            TRAIN_DATASET,
            args.batch_size,
            args.preprocess_threads,
            prefetch_size=args.prefetch_size),
        max_steps=args.max_steps,
        hooks=hooks)
    val_spec = tf.estimator.EvalSpec(
        input_fn=lambda: compression_model_2048.input_fn(
            EVAL_DATASET,
            args.batch_size,
            args.preprocess_threads,
            repeat=False,
            prefetch_size=args.prefetch_size),
        steps=None,
        hooks=hooks)

    # NOTE(review): the actual training launch is commented out below, so as
    # written this function only builds the estimator and specs and returns
    # without training — confirm whether this is intentional (e.g. leftover
    # from debugging) before relying on it.
    # tf.estimator.train_and_evaluate(estimator, train_spec, val_spec)
def main(_):
    """Entry point: trains/evaluates, exports for TF Serving, or predicts.

    All behavior is driven by the ``configDir`` dict (and ``model_config``),
    both defined at module level.  The three mutually exclusive phases are:
    exporting a SavedModel (``save_predict_model_for_tfServing``), training
    with periodic evaluation (``do_train``), and batch prediction with
    optional conv-feature dumps and Grad-CAM visualization (``do_test``).
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    tf_run_config = tf.estimator.RunConfig(
        model_dir=configDir["model_dir"],
        tf_random_seed=None,
        save_summary_steps=configDir["save_summary_steps"],
        save_checkpoints_steps=configDir["save_checkpoints_steps"],
        session_config=None,
        keep_checkpoint_max=configDir["keep_checkpoint_max"],
        log_step_count_steps=configDir["print_loss_steps"],
        train_distribute=None,
        device_fn=None)
    num_train_steps = None
    num_warmup_steps = None
    if configDir["do_train"]:
        # NOTE(review): assumes each file under train_input holds ~1000
        # examples — confirm against the data-generation pipeline.
        train_examples = len(os.listdir(configDir["train_input"])) * 1000
        num_train_steps = int(train_examples / configDir["train_batch_size"] *
                              configDir["num_train_epochs"])
        num_warmup_steps = int(num_train_steps *
                               configDir["warmup_proportion"])
    model_fn = model_build.model_fn_builder(
        config=configDir,
        model_config=model_config,
        learning_rate=configDir["learning_rate"],
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=tf_run_config,
        params={
            "train_batch_size": configDir["train_batch_size"],
            "eval_batch_size": configDir["val_batch_size"],
            "predict_batch_size": configDir["test_batch_size"]
        },  # params are passed through to model_fn and input_fn
        warm_start_from=None,
    )
    # Optionally export an inference SavedModel for TF Serving, then stop.
    if configDir["save_predict_model_for_tfServing"] == 1:
        serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
            {
                "unid": tf.FixedLenFeature([], tf.int64),
                "image/encoded": tf.FixedLenFeature([], tf.string),
                "label": tf.FixedLenFeature([], tf.int64),
            })
        estimator.export_savedmodel(configDir["TFServing_model_path"],
                                    serving_input_receiver_fn,
                                    strip_default_attrs=True)
        return 0
    if configDir["do_train"]:
        trainHookLt = []
        evalHookLt = []
        if configDir["debug"]:
            debug_config = configDir["debug_config"]
            if debug_config["tfdbg"]:
                trainHookLt.append(tfdbg.LocalCLIDebugHook())
            # NOTE(review): this branch reads configDir["tfdbgtensorboard"]
            # while the previous one reads debug_config["tfdbg"] — possibly
            # an inconsistency; confirm which dict the flag lives in.
            elif configDir["tfdbgtensorboard"]:
                trainHookLt.append(
                    tfdbg.TensorBoardDebugHook(
                        grpc_debug_server_addresses="localhost:11111"))
        # Two input pipelines: TFRecord-file based, or Python generator
        # based (with CIFAR10 auto-augment on the training split only).
        if configDir["file_base"]:
            train_input_fn = model_input.file_based_input_fn_builder(
                input_file=configDir["train_input"],
                is_training=True,
                drop_remainder=True,
                batch="train_batch_size")
            val_input_fn = model_input.file_based_input_fn_builder(
                input_file=configDir["val_input"],
                is_training=False,
                drop_remainder=True,
                batch="eval_batch_size")
        else:
            augment_fn = CIFAR10Policy()
            train_genter_fn = model_input.get_generator_fn(
                configDir, configDir["train_input"], True, augment_fn)
            train_input_fn = model_input.input_fn_builder(
                configDir, train_genter_fn, True, True, "train_batch_size")
            # input_files = os.listdir(os.path.join(configDir["DP"], "test"))
            val_genter_fn = model_input.get_generator_fn(
                configDir, configDir["val_input"], False)
            val_input_fn = model_input.input_fn_builder(
                configDir, val_genter_fn, False, True, "eval_batch_size")
        trainSpec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                           max_steps=num_train_steps,
                                           hooks=trainHookLt)
        valSpec = tf.estimator.EvalSpec(
            input_fn=val_input_fn,
            steps=configDir["trainStepVal"],
            throttle_secs=configDir["throttle_secs"],
            hooks=evalHookLt)
        tf.estimator.train_and_evaluate(estimator=estimator,
                                        train_spec=trainSpec,
                                        eval_spec=valSpec)
    if configDir["do_test"]:
        tf.logging.info("***** Running predictions *****")
        tf.logging.info(" Batch size = %d", configDir["test_batch_size"])
        # input_files = os.listdir(os.path.join(configDir["DP"], "test"))
        if configDir["file_base"]:
            predict_input_fn = model_input.file_based_input_fn_builder(
                input_file=configDir["predict_input"],
                is_training=False,
                drop_remainder=True,
                batch="predict_batch_size")
        else:
            predict_genter_fn = model_input.get_generator_fn(
                configDir, configDir["predict_input"], False)
            predict_input_fn = model_input.input_fn_builder(
                configDir, predict_genter_fn, False, True,
                "predict_batch_size")
        # One TSV row per example: id, prediction, label, probabilities, path.
        wf = open(configDir["test_res_output"], "w", encoding="utf-8")
        for mm, result in enumerate(
                estimator.predict(
                    predict_input_fn,
                    yield_single_examples=True,
                    hooks=[
                        # tfdbg.LocalCLIDebugHook(),
                        # tfdbg.TensorBoardDebugHook(grpc_debug_server_addresses="localhost:11111"),
                    ])):
            tf.logging.info("Processing example: %d" % (mm))
            # ---------- temporary code ----------
            # Debug cap: stop after 10 examples.
            if mm == 10:
                break
            # ---------- temporary code ----------
            example_id = result["unique_ids"]
            predict = result["predict"]
            label = result["label"]
            category_probility = "_".join(
                [str(i) for i in result["category_probility"].tolist()])
            path = result["path"].decode('utf-8')
            wf.write("{}\t{}\t{}\t{}\t{}\n".format(example_id, predict, label,
                                                   category_probility, path))
            # Optionally dump intermediate conv feature maps as .npz files.
            if configDir["do_save_conv_image"]:
                conv_image = filter_conv_image(result)
                if not os.path.exists(configDir["conv_image_path"]):
                    os.makedirs(configDir["conv_image_path"])
                numpy_path = os.path.join(configDir["conv_image_path"],
                                          os.path.basename(path)[:-4])
                np.savez(numpy_path, **conv_image)
            # Optionally render a Grad-CAM heat map over the input image.
            if configDir["grad_cam"]:
                cam = result["cam"]
                image = result["image"]
                cam = cam / np.max(cam)
                cam = cv2.resize(cam,
                                 (configDir["resize"], configDir["resize"]))
                image = image / 255
                # Superimposing the visualization with the image.
                show_cam_on_image(image, cam)
        wf.close()
def train(train_model, eval_model=None, hvd=None, debug_port=None):
    """Runs the training loop for ``train_model`` inside a monitored session.

    Args:
        train_model: model object exposing ``params``, ``last_step``,
            ``train_op`` and a ``data_layer`` with ``iterate_forever()``.
        eval_model: optional model to evaluate every
            ``eval_model.params['eval_steps']`` steps; that key is required
            when this argument is given.
        hvd: optional Horovod module for distributed training; rank 0 is the
            master worker that checkpoints and logs.
        debug_port: if set, attaches a TensorBoard debugger hook on
            ``localhost:<debug_port>``.
    """
    if eval_model is not None and 'eval_steps' not in eval_model.params:
        raise ValueError("eval_steps parameter has to be specified "
                         "if eval_model is provided")
    if hvd:
        master_worker = hvd.rank() == 0
    else:
        master_worker = True
    # initializing session parameters
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    if hvd is not None:
        # Pin each Horovod worker to its own local GPU.
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
    # defining necessary hooks
    hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
    if hvd is not None:
        hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    # Only the master worker writes checkpoints.
    if master_worker:
        checkpoint_dir = train_model.params['logdir']
    else:
        checkpoint_dir = None
    if master_worker:
        if train_model.params['save_checkpoint_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir,
                    save_steps=train_model.params['save_checkpoint_steps']))
        if train_model.params['print_loss_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintLossAndTimeHook(
                    every_steps=train_model.params['print_loss_steps'],
                    model=train_model,
                ))
        if train_model.params['print_samples_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintSamplesHook(
                    every_steps=train_model.params['print_samples_steps'],
                    model=train_model,
                ))
        if eval_model is not None:
            # noinspection PyTypeChecker
            hooks.append(
                RunEvaluationHook(
                    every_steps=eval_model.params['eval_steps'],
                    model=eval_model,
                    last_step=train_model.last_step,
                ),
            )
    total_time = 0.0
    # Steps before bench_start are treated as warm-up and excluded from the
    # per-step timing average reported at the end.
    bench_start = train_model.params.get('bench_start', 10)
    if debug_port:
        hooks.append(
            tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port)))
    # starting training
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=checkpoint_dir,
            save_summaries_steps=train_model.params['save_summaries_steps'],
            config=sess_config,
            save_checkpoint_secs=None,
            log_step_count_steps=train_model.params['save_summaries_steps'],
            stop_grace_period_secs=300,
            hooks=hooks,
    ) as sess:
        # NOTE(review): if the data layer yields nothing, `step` below is
        # never bound and the post-loop reporting would raise NameError —
        # presumably iterate_forever() always yields; confirm.
        for step, feed_dict in enumerate(
                train_model.data_layer.iterate_forever()):
            if sess.should_stop():
                break
            tm = time.time()
            sess.run(fetches=train_model.train_op, feed_dict=feed_dict)
            if step >= bench_start:
                total_time += time.time() - tm
    if hvd is not None:
        deco_print("Finished training on rank {}".format(hvd.rank()))
    else:
        deco_print("Finished training")
    if step > bench_start:
        deco_print("Avg time per step: {:.3}s".format(1.0 * total_time /
                                                      (step - bench_start)))
    else:
        deco_print("Not enough steps for benchmarking")
def main(argv=None):
    """Multi-GPU RetinaNet training with optional validation and IC15 eval.

    Builds one training tower per visible GPU, averages losses, all-reduces
    gradients, and runs a MonitoredTrainingSession loop.  Periodically:
    prints losses, re-syncs tower variables, evaluates on the validation
    split, and (for ICDAR2015) launches an asynchronous ``test.py`` process
    whose h-mean score is tracked to keep the best checkpoints.

    Side effects: writes checkpoints/summaries under ``FLAGS.output`` and
    spawns evaluation subprocesses.
    """
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    available_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
    num_gpus = len(available_gpus)
    print("num_gpus : ", num_gpus, available_gpus)
    with tf.Graph().as_default():
        # Get Network class and Optimizer
        global_step = tf.train.get_or_create_global_step()
        # Learning rate decay: longer schedule when pre-training on SynthText.
        if "SynthText" in FLAGS.train_path:
            boundaries = [40000, 60000]
        else:
            boundaries = [4000, 8000]
        values = [FLAGS.learning_rate / pow(10, i) for i in range(3)]
        learning_rate = tf.train.piecewise_constant(global_step, boundaries,
                                                    values)
        tf.summary.scalar('learning_rate', learning_rate)
        # Non-trainable variable so the external eval h-mean shows up in
        # TensorBoard.
        Hmean = tf.Variable(0.0, trainable=False, name='hmean')
        tf.summary.scalar("Hmean", Hmean)
        optimizers = []
        net = RetinaNet(FLAGS.backbone)
        # Multi gpu training code (Define graph)
        tower_grads = []
        tower_extra_update_ops = []
        tower_loc_losses = []
        tower_cls_losses = []
        input_features = net.get_input(is_train=True, num_gpus=num_gpus)
        for gpu_indx in range(num_gpus):
            tower_output = _single_tower(net, gpu_indx,
                                         input_features[gpu_indx],
                                         learning_rate)
            tower_grads.append(
                [x for x in tower_output.grads if x[0] is not None])
            tower_extra_update_ops.append(tower_output.extra_update_ops)
            tower_loc_losses.append(tower_output.loc_loss)
            tower_cls_losses.append(tower_output.cls_loss)
            optimizers.append(tower_output.optimizer)
        if FLAGS.use_validation:
            # single gpu validation
            valid_input_feature = net.get_input(is_train=False, num_gpus=1)
            valid_tower_output = _single_tower(net, FLAGS.valid_device,
                                               valid_input_feature[0],
                                               name='valid')
            tf.summary.scalar("valid_loc_losses", valid_tower_output.loc_loss)
            tf.summary.scalar("valid_cls_losses", valid_tower_output.cls_loss)
        # Merge results across towers.
        loc_losses = tf.reduce_mean(tower_loc_losses)
        cls_losses = tf.reduce_mean(tower_cls_losses)
        grads = allreduce_grads(tower_grads)
        train_ops = []
        tf.summary.scalar("train_loc_losses", loc_losses)
        tf.summary.scalar("train_cls_losses", cls_losses)
        tf.summary.image("train_img", input_features[0].image)
        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())
        train_ops.append(variables_averages_op)
        # Apply the gradients on each GPU; only tower 0 advances global_step.
        for idx, grad_and_vars in enumerate(grads):
            with tf.name_scope('apply_gradients'), tf.device(
                    tf.DeviceSpec(device_type="GPU", device_index=idx)):
                # apply_gradients may create variables. Make them LOCAL_VARIABLES
                from tensorpack.graph_builder.utils import override_to_local_variable
                with override_to_local_variable(enable=idx > 0):
                    train_ops.append(optimizers[idx].apply_gradients(
                        grad_and_vars,
                        name='apply_grad_{}'.format(idx),
                        global_step=(global_step if idx == 0 else None)))
        with tf.control_dependencies(tower_extra_update_ops[-1]):
            train_op = tf.group(*train_ops, name='train_op')
        # Summary: split train vs. validation summaries by name prefix.
        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(
            [s for s in summaries if 'valid_' not in s.name])
        if FLAGS.use_validation:
            valid_summary_op = tf.summary.merge(
                [s for s in summaries if 'valid_' in s.name])
            valid_summary_writer = tf.summary.FileWriter(
                os.path.join(FLAGS.output, FLAGS.valid_dataset))
        '''
        # Print network structure
        if not os.path.exists(FLAGS.output):
            os.makedirs(os.path.join(FLAGS.output,'best_models'), exist_ok=True)
            param_stats = tf.profiler.profile(tf.get_default_graph())
            sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
            train_info = open(os.path.join(FLAGS.output, 'train_info.txt'),'w')
            train_info.write('total_params: %d\n' % param_stats.total_parameters)
            train_info.write(str(FLAGS.flag_values_dict()))
            train_info.close()
        '''
        # Print configuration
        pprint(FLAGS.flag_values_dict())
        # Define config, init_op, scaffold
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        pretrain_op = load_pytorch_weight(FLAGS.use_bn, net.use_se_block)
        sync_op = _get_post_init_ops()
        # only save global variables
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
        scaffold = tf.train.Scaffold(saver=saver,
                                     init_op=init_op,
                                     summary_op=summary_op,
                                     init_fn=_get_init_pretrained())
        valid_saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
        best_valid_loss = 1e9
        best_valid_acc = -1
        # Define several hooks
        hooks = []
        if FLAGS.use_profile:
            profiler_hook = tf.train.ProfilerHook(save_steps=FLAGS.valid_steps,
                                                  output_dir=FLAGS.output)
            hooks.append(profiler_hook)
        if FLAGS.use_debug:
            from tensorflow.python import debug as tf_debug
            # CLI Debugger
            # cli_debug_hook = tf_debug.LocalCLIDebugHook()
            # hooks.append(cli_debug_hook)
            # Tensorboard Debugger
            tfb_debug_hook = tf_debug.TensorBoardDebugHook("127.0.0.1:9900")
            #tfb_debug_hook = tf_debug.TensorBoardDebugHook("a476cc765f91:6007")
            hooks.append(tfb_debug_hook)
        hooks = None if len(hooks) == 0 else hooks
        reset_global_step = tf.assign(global_step, 0)
        pEval = None
        print("---------- session start")
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.output,
                scaffold=scaffold,
                hooks=hooks,
                config=session_config,
                save_checkpoint_steps=FLAGS.valid_steps,
                save_checkpoint_secs=None,
                save_summaries_steps=FLAGS.summary_steps,
                save_summaries_secs=None,
        ) as sess:
            print("---------- open MonitoredTrainingSession")
            # Fine-tuning on ICDAR2015 restarts the step counter.
            if "ICDAR2015" in FLAGS.train_path:
                sess.run(reset_global_step)
            _step = sess.run(global_step)
            if "SynthText" in FLAGS.train_path:
                print("---------- run pretrain op")
                sess.run(pretrain_op)
            print("---------- run sync op")
            sess.run(sync_op)
            print("---------- start training, step=", _step)
            while _step < FLAGS.max_num_steps:
                if sess.should_stop():
                    print("Done! ", _step)
                    break
                # Training
                [step_loc_loss, step_cls_loss, _, _step
                 ] = sess.run([loc_losses, cls_losses, train_op, global_step])
                print(
                    'STEP : %d\tTRAIN_TOTAL_LOSS : %.8f\tTRAIN_LOC_LOSS : %.8f\tTRAIN_CLS_LOSS : %.5f'
                    % (_step, step_loc_loss + step_cls_loss, step_loc_loss,
                       step_cls_loss),
                    end='\r')
                if _step % 50 == 0:
                    print(
                        'STEP : %d\tTRAIN_TOTAL_LOSS : %.8f\tTRAIN_LOC_LOSS : %.8f\tTRAIN_CLS_LOSS : %.5f'
                        % (_step, step_loc_loss + step_cls_loss, step_loc_loss,
                           step_cls_loss))
                # Periodic synchronization
                if _step % 1000 == 0:
                    sess.run(sync_op)
                # Validation Err
                if FLAGS.use_validation:
                    [valid_step_loc_loss, valid_step_cls_loss,
                     valid_summary] = sess.run([
                         valid_tower_output.loc_loss,
                         valid_tower_output.cls_loss, valid_summary_op
                     ])
                    if valid_summary_writer is not None:
                        valid_summary_writer.add_summary(valid_summary, _step)
                    # BUG FIX: this print previously referenced an undefined
                    # name `valid_step_loss` (NameError at runtime); the total
                    # is the sum of the two component losses.
                    print(
                        'STEP : %d\tVALID_TOTAL_LOSS : %.8f\tVALID_LOC_LOSS : %.8f\tVALID_CLS_LOSS : %.5f'
                        % (_step, valid_step_loc_loss + valid_step_cls_loss,
                           valid_step_loc_loss, valid_step_cls_loss))
                    print('=' * 70)
                # Evaluation on ICDAR2015 via an asynchronous test.py run.
                if FLAGS.use_evaluation and _step % FLAGS.valid_steps == 0:
                    if "ICDAR2015" in FLAGS.train_path:
                        # reset global step -> scaffold auto save is not working!
                        saver.save(_get_session(sess),
                                   os.path.join(FLAGS.output, 'model.ckpt'),
                                   global_step=_step)
                        try:
                            if pEval is None:
                                print(
                                    "Evaluation started at iteration {} on IC15..."
                                    .format(_step))
                                eval_cmd = "CUDA_VISIBLE_DEVICES=" + str(FLAGS.valid_device) + \
                                           " python test.py" + \
                                           " --tune_from=" + os.path.join(FLAGS.output, 'model.ckpt-') + str(_step) + \
                                           " --input_size=1024" + \
                                           " --output_zip=result_" + FLAGS.test + \
                                           " --test=" + FLAGS.test + \
                                           " --nms_thresh=0.25"
                                print(eval_cmd)
                                pEval = Popen(eval_cmd,
                                              shell=True,
                                              stdout=PIPE,
                                              stderr=PIPE)
                            elif pEval.poll() is not None:
                                # Previous eval finished: parse its h-mean and
                                # keep the checkpoint if it is the best so far.
                                (scorestring, stderrdata) = pEval.communicate()
                                hmean = float(
                                    str(scorestring).strip().split(":")[3].split(
                                        ",")[0].split("}")[0].strip())
                                if hmean > best_valid_acc:
                                    best_valid_acc = hmean
                                    best_model_dir = os.path.join(
                                        FLAGS.output, 'best_models')
                                    valid_saver.save(
                                        _get_session(sess),
                                        os.path.join(best_model_dir,
                                                     'model_%.2f' % (hmean * 100)),
                                        global_step=_step)
                                print("test_hmean for {}-th iter : {:.4f}".format(
                                    _step, hmean))
                                sess.run(tf.assign(Hmean, hmean))
                                if pEval is not None:
                                    pEval.kill()
                                    pEval = None
                        except Exception as e:
                            print("exception happened in evaluation ", e)
                            if pEval is not None:
                                pEval.kill()
                                pEval = None
def main():
    """Trains and evaluates a multilabel logistic regression on graph vertices.

    Uses pretrained (frozen) vertex embeddings from ``args.embedding_dir``,
    trains an estimator-based classifier, computes per-vertex label
    predictions, and reports macro/micro F1 on a randomly censored test set.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    args = parse_arguments()
    # graph_data = load_data_node2vec()
    graph_data = load_data_node2vec(args.data_dir)
    # Embeddings are loaded from a checkpoint and kept frozen during training.
    vertex_embedding_params = {
        'embedding_dim': args.embedding_dim,
        'embedding_trainable': False,
        'embedding_checkpoint':
            tf.train.latest_checkpoint(args.embedding_dir),
    }
    model = make_multilabel_logistic_regression(
        label_task_weight=1.0,
        regularization=args.global_regularization,
        global_optimizer=make_optimizer(args),
        polyak=False)
    hooks = [
        tf.train.LoggingTensorHook(
            {
                'kappa_insample': 'kappa_insample_batch/value',
                'kappa_outsample': 'kappa_outsample_batch/value'
            },
            every_n_secs=30)
    ]
    node_classifier = tf.estimator.Estimator(
        model_fn=model,
        params={
            **vertex_embedding_params,
            'num_vertices': graph_data.num_vertices,
            'n_labels': graph_data.num_labels,
            'batch_size': args.batch_size
        },
        model_dir=args.train_dir)
    if args.profile:
        hooks.append(tf.train.ProfilerHook(save_secs=30))
    if args.debug:
        from tensorflow.python import debug as tfdbg
        hooks.append(tfdbg.TensorBoardDebugHook('localhost:6004'))
    # train model
    dataset_fn_train = get_dataset_fn(args.sampler, args)
    node_classifier.train(input_fn=make_input_fn(graph_data, args,
                                                 dataset_fn_train),
                          max_steps=args.max_steps_logistic,
                          hooks=hooks)
    # Predict probabilities for every vertex in the graph.
    pred_features = {
        'vertex_index':
            np.expand_dims(np.array(range(graph_data.num_vertices)), 1)
    }

    def make_pred_dataset():
        dataset = tf.data.Dataset.from_tensor_slices(pred_features)
        return dataset

    print('======= Computing Predictions for logistic regression ========')
    predictions = node_classifier.predict(input_fn=make_pred_dataset,
                                          yield_single_examples=False)
    # get test set: each vertex is censored (held out) with probability
    # args.proportion_censored.
    rng = np.random.RandomState(args.seed)
    in_train = rng.binomial(1, 1 - args.proportion_censored,
                            size=graph_data.num_vertices).astype(np.int32)
    in_test = np.logical_not(in_train)
    pred_prob_list = []
    for prediction in predictions:
        pred_prob_list += [prediction['probabilities']]
    pred_probs = np.concatenate(pred_prob_list)
    num_labels = graph_data.labels.shape[1]
    classes = np.array(range(num_labels))
    # Standard multilabel protocol: keep the top-k predicted labels per
    # vertex, where k is that vertex's true label count.
    # BUG FIX: `np.int` is a deprecated alias of the builtin `int` (removed
    # in NumPy 1.24); using `int` is behavior-identical and future-proof.
    top_k_list = list(np.sum(graph_data.labels[in_test], 1).astype(int))
    pred_labels = predict(pred_probs[in_test], classes, top_k_list)
    mlb = MultiLabelBinarizer(classes)
    pred_labels = mlb.fit_transform(pred_labels)
    print('======= Result for logistic regression ========')
    f1_macro = f1_score(graph_data.labels[in_test],
                        pred_labels,
                        average='macro')
    f1_micro = f1_score(graph_data.labels[in_test],
                        pred_labels,
                        average='micro')
    print("f1_macro: {}".format(f1_macro))
    print("f1_micro: {}".format(f1_micro))
    # test model
    dataset_fn_test = get_dataset_fn(
        args.sampler_test if args.sampler_test is not None else args.sampler,
        args)
    node_classifier.evaluate(input_fn=make_input_fn(graph_data, args,
                                                    dataset_fn_test, 1000),
                             hooks=hooks)
def train():
    """Trains the voxel-grid compression model.

    Loads point clouds matching ``args.train_glob``, splits them into
    train/test by parent-directory name, and runs
    ``tf.estimator.train_and_evaluate`` with
    ``compression_model.model_fn``.  Checkpoints and summaries go to
    ``args.checkpoint_dir``.
    """
    if args.verbose:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Resolution determines the bounding box and dense voxel tensor shape.
    p_min, p_max, dense_tensor_shape = pc_io.get_shape_data(args.resolution)
    files = pc_io.get_files(args.train_glob)
    points = pc_io.load_points(files, p_min, p_max)

    # Each file's split is the name of its parent directory.
    files_cat = np.array(
        [os.path.split(os.path.split(x)[0])[1] for x in files])
    for cat in files_cat:
        assert (cat == 'train') or (cat == 'test')
    train_mask = files_cat == 'train'
    test_mask = files_cat == 'test'
    points_train = points[train_mask]
    points_test = points[test_mask]
    # The two splits must cover the whole dataset.
    assert (len(points_train) + len(points_test) == len(points))

    run_config = tf.estimator.RunConfig(
        keep_checkpoint_every_n_hours=1,
        save_checkpoints_secs=args.save_checkpoints_secs,
        keep_checkpoint_max=args.keep_checkpoint_max,
        log_step_count_steps=args.log_step_count_steps,
        save_summary_steps=args.save_summary_steps,
        tf_random_seed=42)
    model_params = {
        'num_filters': args.num_filters,
        'alpha': args.alpha,
        'gamma': args.gamma,
        'lmbda': args.lmbda,
        'additional_metrics': not args.no_additional_metrics,
        'checkpoint_dir': args.checkpoint_dir,
        'data_format': DATA_FORMAT,
    }
    estimator = tf.estimator.Estimator(model_fn=compression_model.model_fn,
                                       model_dir=args.checkpoint_dir,
                                       config=run_config,
                                       params=model_params)

    # Attach the TensorBoard debugger to both phases when requested.
    if args.debug_address is not None:
        hooks = [tf_debug.TensorBoardDebugHook(args.debug_address)]
    else:
        hooks = None

    def make_input_fn(data, repeat):
        """Builds an input_fn over `data`; eval does a single pass."""
        if repeat:
            return lambda: compression_model.input_fn(
                data,
                args.batch_size,
                dense_tensor_shape,
                args.preprocess_threads,
                prefetch_size=args.prefetch_size)
        return lambda: compression_model.input_fn(
            data,
            args.batch_size,
            dense_tensor_shape,
            args.preprocess_threads,
            repeat=False,
            prefetch_size=args.prefetch_size)

    train_spec = tf.estimator.TrainSpec(input_fn=make_input_fn(points_train,
                                                               True),
                                        max_steps=args.max_steps,
                                        hooks=hooks)
    val_spec = tf.estimator.EvalSpec(input_fn=make_input_fn(points_test,
                                                            False),
                                     steps=None,
                                     hooks=hooks)
    tf.estimator.train_and_evaluate(estimator, train_spec, val_spec)
def train(train_model, eval_model=None, debug_port=None, custom_hooks=None):
    """Runs the full training loop with optional eval, restore, and iter_size.

    Args:
        train_model: model exposing ``params``, ``last_step``, ``train_op``,
            ``hvd``, ``on_horovod``, ``num_gpus``, ``get_data_layer`` and
            (optionally) ``get_num_objects_per_step``/``skip_update_ph``.
        eval_model: optional model evaluated every
            ``eval_model.params['eval_steps']`` steps (that key is required).
        debug_port: if set, attaches a TensorBoard debugger hook on
            ``localhost:<debug_port>``.
        custom_hooks: optional iterable of callables, each invoked as
            ``hook(train_model=..., eval_model=...)`` to build a SessionRunHook.
    """
    if eval_model is not None and 'eval_steps' not in eval_model.params:
        raise ValueError("eval_steps parameter has to be specified "
                         "if eval_model is provided")
    hvd = train_model.hvd
    if hvd:
        master_worker = hvd.rank() == 0
    else:
        master_worker = True
    # initializing session parameters
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    # pylint: disable=no-member
    sess_config.gpu_options.allow_growth = True
    if hvd is not None:
        # pylint: disable=no-member
        # Pin each Horovod worker to its own local GPU.
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
    if train_model.params.get('use_xla_jit', False):
        sess_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_1)
    # defining necessary hooks
    hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
    if custom_hooks:
        for custom_hook in custom_hooks:
            hooks.append(custom_hook(train_model=train_model,
                                     eval_model=eval_model))
    if hvd is not None:
        hooks.append(BroadcastGlobalVariablesHook(0))
    # Only the master worker checkpoints and restores.
    if master_worker:
        checkpoint_dir = train_model.params['logdir']
        load_model_dir = train_model.params['load_model']
    else:
        checkpoint_dir = None
        load_model_dir = None
    if eval_model is not None:
        # noinspection PyTypeChecker
        hooks.append(
            RunEvaluationHook(
                every_steps=eval_model.params['eval_steps'],
                model=eval_model,
                last_step=train_model.last_step,
                # Perplexity is only meaningful for the LM data layer.
                print_ppl=isinstance(eval_model.get_data_layer(),
                                     WKTDataLayer),
            ),
        )
    if master_worker:
        if train_model.params['save_checkpoint_steps'] is not None:
            # noinspection PyTypeChecker
            saver = tf.train.Saver(
                save_relative_paths=True,
                max_to_keep=train_model.params['num_checkpoints']
            )
            hooks.append(tf.train.CheckpointSaverHook(
                checkpoint_dir,
                saver=saver,
                save_steps=train_model.params['save_checkpoint_steps'],
            ))
        if train_model.params['print_loss_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(PrintLossAndTimeHook(
                every_steps=train_model.params['print_loss_steps'],
                model=train_model,
                print_ppl=isinstance(train_model.get_data_layer(),
                                     WKTDataLayer),
            ))
        if train_model.params['print_samples_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(PrintSamplesHook(
                every_steps=train_model.params['print_samples_steps'],
                model=train_model,
            ))
    total_time = 0.0
    # Steps before bench_start are warm-up and excluded from benchmarking.
    bench_start = train_model.params.get('bench_start', 10)
    if debug_port:
        hooks.append(
            tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port))
        )
    if train_model.on_horovod:
        init_data_layer = train_model.get_data_layer().iterator.initializer
    else:
        init_data_layer = tf.group(
            [train_model.get_data_layer(i).iterator.initializer
             for i in range(train_model.num_gpus)]
        )
    # We restore only if the user provides load_model_dir. load_model_dir is the
    # directory containing the checkpoint we want to load partial or all weights
    # from.. Useful for transer learning or if we do not want to overwrite our
    # checkpoint.
    restoring = load_model_dir and not tf.train.latest_checkpoint(checkpoint_dir)
    if restoring:
        # Load only variables whose name and shape match the checkpoint,
        # excluding global_step so training restarts its own counter.
        vars_in_checkpoint = {}
        for var_name, var_shape in tf.train.list_variables(load_model_dir):
            vars_in_checkpoint[var_name] = var_shape
        print('VARS_IN_CHECKPOINT:')
        print(vars_in_checkpoint)
        vars_to_load = []
        for var in tf.global_variables():
            var_name = var.name.split(':')[0]
            if var_name in vars_in_checkpoint:
                if var.shape == vars_in_checkpoint[var_name] and 'global_step' not in var_name:
                    vars_to_load.append(var)
        print('VARS_TO_LOAD:')
        for var in vars_to_load:
            print(var)
        load_model_fn = tf.contrib.framework.assign_from_checkpoint_fn(
            tf.train.latest_checkpoint(load_model_dir), vars_to_load
        )
        scaffold = tf.train.Scaffold(
            local_init_op=tf.group(tf.local_variables_initializer(),
                                   init_data_layer),
            init_fn = lambda scaffold_self, sess: load_model_fn(sess)
        )
    else:
        scaffold = tf.train.Scaffold(
            local_init_op=tf.group(tf.local_variables_initializer(),
                                   init_data_layer)
        )
    fetches = [train_model.train_op]
    try:
        total_objects = 0.0
        # on horovod num_gpus is 1
        for worker_id in range(train_model.num_gpus):
            fetches.append(train_model.get_num_objects_per_step(worker_id))
    except NotImplementedError:
        deco_print("WARNING: Can't compute number of objects per step, since "
                   "train model does not define get_num_objects_per_step method.")
    # starting training
    sess = tf.train.MonitoredTrainingSession(
        scaffold=scaffold,
        checkpoint_dir=checkpoint_dir,
        save_summaries_steps=train_model.params['save_summaries_steps'],
        config=sess_config,
        save_checkpoint_secs=None,
        log_step_count_steps=train_model.params['save_summaries_steps'],
        stop_grace_period_secs=300,
        hooks=hooks)
    step = 0
    num_bench_updates = 0
    while True:
        if sess.should_stop():
            break
        tm = time.time()
        try:
            feed_dict = {}
            # With iter_size > 1 gradients are accumulated over iter_size
            # steps and only every iter_size-th step applies an update.
            iter_size = train_model.params.get('iter_size', 1)
            if iter_size > 1:
                feed_dict[train_model.skip_update_ph] = step % iter_size != 0
            if step % iter_size == 0:
                if step >= bench_start:
                    num_bench_updates += 1
                fetches_vals = sess.run(fetches, feed_dict)
            else:
                # necessary to skip "no-update" steps when iter_size > 1
                def run_with_no_hooks(step_context):
                    return step_context.session.run(fetches, feed_dict)
                fetches_vals = sess.run_step_fn(run_with_no_hooks)
        except tf.errors.OutOfRangeError:
            break
        if step >= bench_start:
            total_time += time.time() - tm
            if len(fetches) > 1:
                for i in range(train_model.num_gpus):
                    total_objects += np.sum(fetches_vals[i + 1])
            if train_model.params['print_bench_info_steps'] is not None:
                if step % train_model.params['print_bench_info_steps'] == 0:
                    total_objects_cur = collect_if_horovod(total_objects,
                                                           hvd, mode="sum")
                    if master_worker:
                        avg_objects = 1.0 * total_objects_cur / total_time
                        deco_print("Avg objects per second: {:.3f}".format(avg_objects))
        step += 1
    sess.close()
    if len(fetches) > 1:
        total_objects = collect_if_horovod(total_objects, hvd, mode="sum")
    if master_worker:
        deco_print("Finished training")
        if step > bench_start:
            avg_time = 1.0 * total_time / num_bench_updates
            deco_print("Avg time per step: {:.3f}s".format(avg_time))
            if len(fetches) > 1:
                avg_objects = 1.0 * total_objects / total_time
                deco_print("Avg objects per second: {:.3f}".format(avg_objects))
        else:
            deco_print("Not enough steps for benchmarking")
def train(train_model, eval_model=None, debug_port=None):
    """Runs the training loop for ``train_model`` under a monitored session.

    Args:
        train_model: model exposing ``params``, ``last_step``, ``train_op``,
            ``hvd``, ``on_horovod``, ``num_gpus``, ``get_data_layer`` and
            (optionally) ``get_num_objects_per_step``.
        eval_model: optional model evaluated every
            ``eval_model.params['eval_steps']`` steps (that key is required).
        debug_port: if set, attaches a TensorBoard debugger hook on
            ``localhost:<debug_port>``.
    """
    if eval_model is not None and 'eval_steps' not in eval_model.params:
        raise ValueError("eval_steps parameter has to be specified "
                         "if eval_model is provided")
    hvd = train_model.hvd
    if hvd:
        master_worker = hvd.rank() == 0
    else:
        master_worker = True
    # initializing session parameters
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    if hvd is not None:
        # Pin each Horovod worker to its own local GPU.
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
    # defining necessary hooks
    hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
    if hvd is not None:
        hooks.append(BroadcastGlobalVariablesHook(0))
    # Only the master worker writes checkpoints.
    if master_worker:
        checkpoint_dir = train_model.params['logdir']
    else:
        checkpoint_dir = None
    if eval_model is not None:
        # noinspection PyTypeChecker
        hooks.append(
            RunEvaluationHook(
                every_steps=eval_model.params['eval_steps'],
                model=eval_model,
                last_step=train_model.last_step,
            ),
        )
    if master_worker:
        if train_model.params['save_checkpoint_steps'] is not None:
            # noinspection PyTypeChecker
            saver = tf.train.Saver(save_relative_paths=True)
            hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir,
                    saver=saver,
                    save_steps=train_model.params['save_checkpoint_steps']),
            )
        if train_model.params['print_loss_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintLossAndTimeHook(
                    every_steps=train_model.params['print_loss_steps'],
                    model=train_model,
                ))
        if train_model.params['print_samples_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintSamplesHook(
                    every_steps=train_model.params['print_samples_steps'],
                    model=train_model,
                ))
    total_time = 0.0
    # Steps before bench_start are warm-up and excluded from benchmarking.
    bench_start = train_model.params.get('bench_start', 10)
    if debug_port:
        hooks.append(
            tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port)))
    if train_model.on_horovod:
        init_data_layer = train_model.get_data_layer().iterator.initializer
    else:
        init_data_layer = tf.group([
            train_model.get_data_layer(i).iterator.initializer
            for i in range(train_model.num_gpus)
        ])
    scaffold = tf.train.Scaffold(local_init_op=tf.group(
        tf.local_variables_initializer(), init_data_layer))
    fetches = [train_model.train_op]
    try:
        total_objects = 0.0
        # on horovod num_gpus is 1
        for worker_id in range(train_model.num_gpus):
            fetches.append(train_model.get_num_objects_per_step(worker_id))
    except NotImplementedError:
        deco_print(
            "WARNING: Can't compute number of objects per step, since "
            "train model does not define get_num_objects_per_step method.")
    # starting training
    with tf.train.MonitoredTrainingSession(
            scaffold=scaffold,
            checkpoint_dir=checkpoint_dir,
            save_summaries_steps=train_model.params['save_summaries_steps'],
            config=sess_config,
            save_checkpoint_secs=None,
            log_step_count_steps=train_model.params['save_summaries_steps'],
            stop_grace_period_secs=300,
            hooks=hooks,
    ) as sess:
        step = 0
        while True:
            if sess.should_stop():
                break
            tm = time.time()
            try:
                fetches_vals = sess.run(fetches)
            except tf.errors.OutOfRangeError:
                break
            if step >= bench_start:
                total_time += time.time() - tm
                if len(fetches) > 1:
                    # fetches[1:] hold per-GPU object counts.
                    for i in range(train_model.num_gpus):
                        total_objects += np.sum(fetches_vals[i + 1])
            step += 1
    if hvd is not None:
        deco_print("Finished training on rank {}".format(hvd.rank()))
    else:
        deco_print("Finished training")
    if train_model.on_horovod:
        ending = " on worker {}".format(hvd.rank())
    else:
        ending = ""
    if step > bench_start:
        deco_print("Avg time per step{}: {:.3f}s".format(
            ending, 1.0 * total_time / (step - bench_start)))
        if len(fetches) > 1:
            deco_print("Avg objects per second{}: {:.3f}".format(
                ending, 1.0 * total_objects / total_time))
    else:
        deco_print("Not enough steps for benchmarking{}".format(ending))