Example #1
def main(_):
    """Run training and evaluation
    """

    tf.logging.set_verbosity(tf.logging.INFO)

    run_config = tf.estimator.RunConfig()

    hparams = {
        'learning_rate': FLAGS.learning_rate,
        'dropout_rate': 0.4,
        'data_directory': FLAGS.data_directory
    }

    mnist_classifier = tf.estimator.Estimator(model_fn=model.head_model_fn,
                                              model_dir=FLAGS.model_directory,
                                              config=run_config,
                                              params=hparams)

    hooks = []

    if FLAGS.debug_port is not None:
        debug_hook = tf_debug.TensorBoardDebugHook("localhost:{}".format(
            FLAGS.debug_port))
        hooks.append(debug_hook)

    tf.estimator.train_and_evaluate(mnist_classifier, get_train_spec(hooks),
                                    get_eval_spec())
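
The get_train_spec and get_eval_spec helpers are not shown in this snippet; a minimal sketch of the shape get_train_spec would need for the debug hook to take effect (train_input_fn and FLAGS.train_steps are illustrative assumptions, not taken from the snippet):

def get_train_spec(hooks):
    # Hypothetical helper: the hook list must end up on the TrainSpec for
    # tf.estimator.train_and_evaluate to run it during training.
    return tf.estimator.TrainSpec(input_fn=train_input_fn,
                                  max_steps=FLAGS.train_steps,
                                  hooks=hooks)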
Example #2
def predict(test_file, model_dir):

    # Load the model
    estimator = tf.estimator.Estimator(model_fn, model_dir=model_dir, params=params)

    # Create the input_fn
    #input_fn = tf.estimator.inputs.numpy_input_fn(x={'image' : inputs}, num_epochs=1, shuffle=False)

    # Prepare hooks for debugging
    hooks = [tf_debug.TensorBoardDebugHook(grpc_debug_server_addresses="dev:6064")]

    predictions = estimator.predict(input_fn=lambda: dataset_input_fn('test'))
    #predictions = estimator.predict(input_fn=lambda: dataset_input_fn('test'), hooks = hooks)

    # Predict!
    predictions_list = list(predictions)

    predicted_label = predictions_list[0]
    print('prediction = {}'.format(predicted_label))
    #print('max = {}'.format(predictions_list[np.argmax(predictions_list)]))

    # Print tensorboard data
    print('tensorboard --logdir=' + str(model_dir) + ' --port 6006 --debugger_port 6064')

    # Visualize predictions based on single test TFrecord
    visualize_pred(test_file, predictions_list, model_dir)
Example #3
def main(unused_argv):
    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir="./cnn_model_mel")

    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=50)

    hook = tf_debug.TensorBoardDebugHook("sunny-workstation:7000")

    # test_solution = data_utility.AudioPrepare()
    # train_input_fn = test_solution.tf_input_fn_maker(is_training=True, n_epoch=100)
    # Evaluate the model and print results
    test_solution = data_utility.AudioPrepare()
    test_input_fn = test_solution.tf_input_fn_maker(is_training=False, n_epoch=1)


    # classifier.train(
    #     input_fn=train_input_fn,
    #     steps=20000,
    #     hooks=[logging_hook])

    # eval_results = classifier.evaluate(input_fn=test_input_fn, steps=100)
    # print(eval_results)
    eval_results = classifier.evaluate(input_fn=test_input_fn, steps=3000)
    print(eval_results)
Example #4
def add_debug_hooks(hooks):
    if FLAGS.debug_tb:
        debug_hook = tf_debug.TensorBoardDebugHook("pawel-workstation:8080")
        hooks.append(debug_hook)
    elif FLAGS.debug_cli:
        debug_hook = tf_debug.LocalCLIDebugHook()
        debug_hook.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        hooks.append(debug_hook)
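
A minimal usage sketch for add_debug_hooks above (the estimator and train_input_fn names are illustrative assumptions, not part of the snippet): the list is mutated in place and then handed to the training call.

hooks = []
add_debug_hooks(hooks)
estimator.train(input_fn=train_input_fn, hooks=hooks)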
Example #5
def main(_):
    # Generate some fake Iris data.
    # It is okay for this example because this example is about how to use the
    # debugger, not how to use machine learning to solve the Iris classification
    # problem.
    def training_input_fn():
        return ({
            "features": tf.random_normal([128, 4])
        }, tf.random_uniform([128], minval=0, maxval=3, dtype=tf.int32))

    def test_input_fn():
        return ({
            "features": tf.random_normal([32, 4])
        }, tf.random_uniform([32], minval=0, maxval=3, dtype=tf.int32))

    feature_columns = [
        tf.feature_column.numeric_column("features", shape=(4, ))
    ]

    # Build 3 layer DNN with 10, 20, 10 units respectively.
    model_dir = FLAGS.model_dir or tempfile.mkdtemp(
        prefix="debug_tflearn_iris_")

    classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            model_dir=model_dir)

    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    hooks = []
    if FLAGS.debug:
        config_file_path = (tempfile.mktemp(".tfdbg_config")
                            if FLAGS.use_random_config_path else None)
        hooks.append(
            tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type,
                                       dump_root=FLAGS.dump_root,
                                       config_file_path=config_file_path))
    elif FLAGS.tensorboard_debug_address:
        hooks.append(
            tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address))

    # Train model, using tfdbg hook.
    classifier.train(training_input_fn, steps=FLAGS.train_steps, hooks=hooks)

    # Evaluate accuracy, using tfdbg hook.
    accuracy_score = classifier.evaluate(test_input_fn,
                                         steps=FLAGS.eval_steps,
                                         hooks=hooks)["accuracy"]

    print("After training %d steps, Accuracy = %f" %
          (FLAGS.train_steps, accuracy_score))

    # Make predictions, using tfdbg hook.
    predict_results = classifier.predict(test_input_fn, hooks=hooks)
    print("A prediction result: %s" % next(predict_results))
Example #6
def get_hooks(debug_cli, debug_ui):
    hooks = []
    if debug_cli:
        cli_debug_hook = tf_debug.LocalCLIDebugHook()
        cli_debug_hook.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        hooks.append(cli_debug_hook)
    elif debug_ui:
        debug_host = "{}:5002".format(platform.node())
        hooks.append(tf_debug.TensorBoardDebugHook(debug_host, send_traceback_and_source_code=False))
        print("Debugger is running on {}".format(debug_host))
    return hooks
Example #7
def main(unused_argv):
  # Load typical and novel datasets
  novel_results = '/home/hannah/src/MastcamCAE/results/DW_udr_12-8-3_7-5-3_nodrop_epochs15'
  typical_results = '/home/hannah/src/MastcamCAE/results/train_udr_12-8-3_7-5-3_nodrop_epochs15'

  typical_data = dataset.load_diff_images(typical_results) # Returns np.array
  typical_labels = np.zeros([typical_data.shape[0],1], dtype=np.int32)
  novel_data = dataset.load_diff_images(novel_results) # Returns np.array
  novel_labels = np.ones([novel_data.shape[0],1], dtype=np.int32)

  # Convert to training and eval sets
  train_data = np.concatenate([typical_data[:98700,:,:,:], novel_data[:300,:,:,:]])
  train_labels = np.concatenate([typical_labels[:98700], novel_labels[:300]])
  eval_data = np.concatenate([typical_data[98700:,:,:,:], novel_data[300:,:,:,:]])
  eval_labels = np.concatenate([typical_labels[98700:], novel_labels[300:]])

  # train_data = np.concatenate([typical_data[:4500,:,:,:], novel_data[:300,:,:,:]])
  # train_labels = np.concatenate([typical_labels[:4500], novel_labels[:300]])
  # eval_data = np.concatenate([typical_data[4500:,:,:,:], novel_data[300:,:,:,:]])
  # eval_labels = np.concatenate([typical_labels[4500:], novel_labels[300:]])

  # Create the Estimator
  multispec_classifier = tf.estimator.Estimator(
      model_fn=cnn_model_fn, model_dir="/home/hannah/src/MastcamCAE/saved_sessions/multispec_convnet_model_nodrop_eps15_udr_60k_seed42")

  # Set up logging for predictions
  # Log the values in the "Softmax" tensor with label "probabilities"
  tensors_to_log = {"probabilities": "softmax_tensor"}
  logging_hook = tf.train.LoggingTensorHook(
      tensors=tensors_to_log, every_n_iter=100)
  debug_hook = tf_debug.TensorBoardDebugHook("ops5.sese.asu.edu:6064")

  # Train the model
  # train_input_fn = tf.estimator.inputs.numpy_input_fn(
  #     x={"x": train_data},
  #     y=train_labels,
  #     batch_size=50,
  #     num_epochs=None,
  #     shuffle=True)
  # multispec_classifier.train(
  #     input_fn=train_input_fn,
  #     steps=20000,
  #     hooks=[logging_hook])

  # Evaluate the model and print results
  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
      x={"x": eval_data},
      y=eval_labels,
      num_epochs=1,
      shuffle=False)
  eval_results = multispec_classifier.evaluate(input_fn=eval_input_fn)
  print(eval_results)
Example #8
    def hooks(self, mode):
        hooks = []

        if self._debug:
            hooks.append(
                tfdbg.TensorBoardDebugHook("localhost:6007")
            )

        if self._profile_secs is not None:
            hooks.append(
                TensorboardProfilerHook(
                    save_secs=self._log_secs,
                    output_dir=self.path)
            )

        return hooks
Example #9
def run(job_dir, train_iters, estimator, model_cls, dataset, train_batch_size,
        eval_batch_size, eval_steps, num_parallel_batches, shuffle_buffer_size,
        prefetch_buffer_size, no_eval, debug, debug_address):
    ############
    # Datasets #
    ############
    dataset_train = dataset.read(split='train')

    #######
    # Run #
    #######
    try:
        global_step = estimator.get_variable_value('global_step')
    except ValueError:
        global_step = 1

    tf.logging.info('Starting training from global step %d.', global_step)

    hooks = []

    if not no_eval:
        dataset_test = dataset.read(split='test')
        hooks.append(
            EvaluationRunHook(estimator,
                              build_input_fn(dataset_test,
                                             eval_batch_size,
                                             map_fn=strip_dict_arg(
                                                 model_cls.eval_map_fn),
                                             shuffle_and_repeat=False),
                              eval_steps,
                              summary=False))
    if debug:
        hooks.append(tfdbg.TensorBoardDebugHook(debug_address))

    # Run training for `train_iters` times
    estimator.train(
        build_input_fn(dataset_train,
                       train_batch_size,
                       map_fn=strip_dict_arg(model_cls.map_fn),
                       num_parallel_batches=num_parallel_batches,
                       shuffle_buffer_size=shuffle_buffer_size,
                       prefetch_buffer_size=prefetch_buffer_size,
                       global_step=global_step,
                       shuffle_and_repeat=True),
        max_steps=train_iters,
        # Run evaluation every `eval_steps` iterations
        hooks=hooks)
Example #10
def main(unused_argv):
    classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                        model_dir="./cnn_model_gfcc")

    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=50)

    hook = tf_debug.TensorBoardDebugHook("sunny-workstation:7000")

    test_solution = data_utility.AudioPrepare()
    # train_input_fn = test_solution.tf_input_fn_maker(is_training=True, n_epoch=100)

    # classifier.train(
    #     input_fn=train_input_fn,
    #     steps=20000,
    #     hooks=[logging_hook])

    # Evaluate the model and print results
    test_solution = data_utility.AudioPrepare()
    test_input_fn = test_solution.tf_input_fn_maker(is_training=False,
                                                    n_epoch=1)
    #
    # eval_results = classifier.evaluate(input_fn=test_input_fn, steps=100)
    # print(eval_results)

    # predict_input_fn=test_solution.tf_input_fn_maker_eval()

    # predictions=classifier.predict(input_fn=predict_input_fn)
    # Materialize the generator so it can be iterated more than once below
    predictions = list(classifier.predict(input_fn=test_input_fn))

    i = 0
    with open('cnn_gfcc_test.txt', 'w+') as file:
        for var in predictions:
            print(var['classes'])
            file.write(str(var['classes']) + '\n')
            i = i + 1
            # if i==100:
            #     break
    with open('cnn_gfcc_test_pro.txt', 'w+') as file:
        for var in predictions:
            print(var['probabilities'])
            file.write(str(var['probabilities']) + '\n')
            i = i + 1
Example #11
    def __init__(self, params=None, aux_config=None, run_config=None):
        self._comet_experiment = None
        self._estimator = None
        self.aux_config = aux_config or {}
        # Pick debug hooks based on the "debug" setting: falsy -> none,
        # "cli" -> local CLI debugger, anything else is treated as a
        # TensorBoard debugger port on localhost.
        if not self.aux_config.get("debug"):
            self._hooks = []
        elif self.aux_config.get("debug") == "cli":
            self._hooks = [tf_debug.LocalCLIDebugHook()]
        else:
            self._hooks = [
                tf_debug.TensorBoardDebugHook(
                    "localhost:{}".format(self.aux_config.get("debug"))
                )
            ]
        self.run_config = RunConfig(**(run_config or {}))
        self.params = self.set_params()
        if params:
            self.params.update(params)
Example #12
def create_hooks(use_tfdbg=False,
                 use_dbgprofile=False,
                 dbgprofile_kwargs=None,
                 use_validation_monitor=False,
                 validation_monitor_kwargs=None,
                 use_early_stopping=False,
                 early_stopping_kwargs=None):
    """Create train and eval hooks for Experiment."""
    train_hooks = []
    eval_hooks = []

    if use_tfdbg:
        #hook = debug.LocalCLIDebugHook()
        hook = debug.TensorBoardDebugHook('127.0.0.1:9990',
                                          send_traceback_and_source_code=False)
        train_hooks.append(hook)
        eval_hooks.append(hook)

    if use_dbgprofile:
        # Recorded traces can be visualized with chrome://tracing/
        # The memory/tensor lifetime is also profiled
        tf.logging.info("Using ProfilerHook")
        defaults = dict(save_steps=10, show_dataflow=True, show_memory=True)
        defaults.update(dbgprofile_kwargs or {})
        train_hooks.append(tf.train.ProfilerHook(**defaults))

    if use_validation_monitor:
        tf.logging.info("Using ValidationMonitor")
        train_hooks.append(
            tf.contrib.learn.monitors.ValidationMonitor(
                hooks=eval_hooks, **validation_monitor_kwargs))

    if use_early_stopping:
        tf.logging.info("Using EarlyStoppingHook")
        hook = metrics_hook.EarlyStoppingHook(**early_stopping_kwargs)
        # Adding to both training and eval so that eval aborts as well
        train_hooks.append(hook)
        eval_hooks.append(hook)

    return train_hooks, eval_hooks
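
A hedged usage sketch for create_hooks above (estimator and the input functions are illustrative assumptions, not part of the snippet): train hooks go to train(), eval hooks to evaluate().

train_hooks, eval_hooks = create_hooks(use_tfdbg=True)
estimator.train(input_fn=train_input_fn, hooks=train_hooks)
estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)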
Example #13
def train(config, data_dir, my_model_fn=model_fn):
    V, embed_matrix = get_vocab_embedding_matrix(config, data_dir)
    estimator = get_estimator(config, embed_matrix, my_model_fn)

    if config.get('eval.enable', True):
        hooks = [
            get_eval_hook(estimator,
                          lambda: eval_input_fn(config, data_dir, vocab.create_vocab_lookup_tables(V)),
                          name='eval',
                          every_n_steps=config.eval.eval_steps),

            # get_eval_hook(estimator,
            #               lambda: eval_input_fn(config, data_dir, vocab.create_vocab_lookup_tables(V),
            #                                     num_examples=config.eval.big_num_examples),
            #               name='eval_big',
            #               every_n_steps=config.eval.big_eval_steps),
            #
            # get_eval_hook(estimator,
            #               lambda: eval_input_fn(config, data_dir, vocab.create_vocab_lookup_tables(V),
            #                                     file_name='train.tsv', num_examples=config.eval.big_num_examples),
            #               name='train_big',
            #               every_n_steps=config.eval.big_eval_steps),
        ]
    else:
        hooks = []

    if config.get('eval.debug.tensorboard', False):
        hooks += [tf_debug.TensorBoardDebugHook('localhost:%s' % config.get('eval.debug.tensorboard_port', 6068),
                                                send_traceback_and_source_code=False)]

    if config.get('eval.debug.cli', False):
        hooks = [tf_debug.LocalCLIDebugHook()]

    return estimator.train(
        input_fn=lambda: train_input_fn(config, data_dir, vocab.create_vocab_lookup_tables(V)),
        hooks=hooks,
        max_steps=config.optim.max_iters
    )
Example #14
def create_estimator_and_specs(run_config):
  """Creates an Experiment configuration based on the estimator and input fn."""
  model_params = tf.contrib.training.HParams(
      num_layers=FLAGS.num_layers,
      num_nodes=FLAGS.num_nodes,
      batch_size=FLAGS.batch_size,
      num_conv=ast.literal_eval(FLAGS.num_conv),
      conv_len=ast.literal_eval(FLAGS.conv_len),
      num_classes=get_num_classes(),
      learning_rate=FLAGS.learning_rate,
      gradient_clipping_norm=FLAGS.gradient_clipping_norm,
      cell_type=FLAGS.cell_type,
      batch_norm=FLAGS.batch_norm,
      dropout=FLAGS.dropout)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params=model_params)

  debug_hook = tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address)
  hooks = [debug_hook]

  train_spec = tf.estimator.TrainSpec(input_fn=get_input_fn(
      mode=tf.estimator.ModeKeys.TRAIN,
      tfrecord_pattern=FLAGS.training_data,
      batch_size=FLAGS.batch_size),
      hooks=hooks,
      max_steps=FLAGS.steps)

  eval_spec = tf.estimator.EvalSpec(input_fn=get_input_fn(
      mode=tf.estimator.ModeKeys.EVAL,
      tfrecord_pattern=FLAGS.eval_data,
      batch_size=FLAGS.batch_size))

  return estimator, train_spec, eval_spec
Example #15
model_path = curr_path.replace('train.py', 'model_for_CRF.py')

# Define the path of your factored out model.py file
#model_file = '/some/path/model_for_CRF.py'
model_file = './model_for_CRF.py'

# Now copy the training script and the model file to
#   model_dir -- the same directory specified when creating the Estimator

# Note: copy over more files if there are other important dependencies.
os.mkdir(model_dir)
shutil.copy(curr_path, model_dir)
shutil.copy(model_path, model_dir)

# Create a TensorBoardDebugHook and pass it via hooks= when calling train()
hooks = [tf_debug.TensorBoardDebugHook(grpc_debug_server_addresses="dev:6064")]
#hooks = [tf_debug.LocalCLIDebugHook(ui_type="readline")] # Hooks to the manual debugger

# Training/Evaluation Loop
for e in range(params.train_epochs):
    print('Epoch: ' + str(e))
    #estimator.train(input_fn=lambda: dataset_input_fn('train'), hooks=hooks) # RAN
    estimator.train(input_fn=lambda: dataset_input_fn('train'))
    print('### validate ###')
    estimator.evaluate(input_fn=lambda: dataset_input_fn('valid'))

print('tensorboard --logdir=' + str(model_dir))


def make_serving_input_receiver_fn():
    inputs = {
Example #16
def main():
    # tf.enable_eager_execution()
    #
    # # create fake args for debugging
    # sys.argv = ['']
    # args = parse_arguments()
    # args.batch_size = 10
    # args.max_steps = 5000

    args = parse_arguments()

    graph_data = load_data_graphsage(args.data_dir)


    """
    Sample defined as a custom tf dataset
    """
    sample_train = make_sample(args.sampler, args)  # sample subgraph according to graph sampling scheme args.sampler
    input_fn = augment_sample(graph_data, args, sample_train)  # augment subgraph with vertex labels and features

    """
    Predictor class and loss function
    """
    # hyperparams
    vertex_embedding_params = {
        'embedding_dim': args.embedding_dim,
        'embedding_trainable': True,
        'embedding_checkpoint': None
    }

    params={
        **vertex_embedding_params,
        'hidden_units' : [200, 200], # Jaan net
        'n_classes':  max(graph_data.classes)+1,
        'num_vertices': graph_data.num_vertices,
        'batch_size': args.batch_size
    }

    classifier_predictor_and_loss = make_nn_class_predictor(
        label_task_weight=args.label_task_weight,
        regularization=_adjust_regularization(args.global_regularization, args.batch_size),
        global_optimizer=_make_global_optimizer(args),
        embedding_optimizer=lambda: tf.train.GradientDescentOptimizer(
            _adjust_learning_rate(args.embedding_learning_rate, args.batch_size))
    )

    node_classifier = tf.estimator.Estimator(
        model_fn=classifier_predictor_and_loss,
        params=params,
        model_dir=args.train_dir)


    """
    Put it together for the optimization
    """
    # some extra logging
    hooks = [
        tf.train.LoggingTensorHook({
            'kappa_edges': 'kappa_edges_in_batch/value'},
            every_n_iter=100)
    ]

    if args.profile:
        hooks.append(tf.train.ProfilerHook(save_secs=30))

    if args.debug:
        from tensorflow.python import debug as tfdbg
        hooks.append(tfdbg.TensorBoardDebugHook('localhost:6004'))

    node_classifier.train(
        input_fn=input_fn,
        max_steps=args.max_steps,
        hooks=hooks)


    """
    Evaluate
    """
    node_classifier.evaluate(input_fn=augment_sample(graph_data, args, sample_train, 2000),
                             name="node2vec_eval")
Example #17
def main():
    tf.logging.set_verbosity(tf.logging.INFO)

    # Path to file specifying all runtime and model parameters and how to process user command line input.
    config_file_path = os.path.join(PROJECT_MODEL_ROOT, "configs/default.json")

    # Argparse namespace combining json defaults and user command line inputs
    args = estimator_utils.init_basic_argument_parser(config_file_path)
    # Transfer all k:v pairs from the Argparse namespace to HParams
    hparams = tf.contrib.training.HParams(**vars(args))
    # Print stats about the current run
    print_run_info(args)

    # Calculate the number of steps needed to complete one epoch for each of the subsets
    steps_in_epoch_train = np.ceil(args.num_samples["train"] /
                                   args.train_batch_size)
    steps_in_epoch_val = np.ceil(args.num_samples["validation"] /
                                 args.validation_batch_size)

    # Number of training steps to perform during train_and_evaluate
    total_train_steps = int(steps_in_epoch_train * args.num_epochs)
    # Minimum number of steps during which no early stopping can occur
    train_steps_without_stopping = steps_in_epoch_train * args.train_epochs_without_stopping
    # Number of steps during which no metric improvement happened that is needed to initiate early stopping
    max_train_steps_without_improvement = int(
        steps_in_epoch_train * args.max_train_epochs_without_improvement)
    # Number of evaluation steps that are performed during each of the calls to evaluation during train_and_evaluate
    eval_steps_during_train = int(steps_in_epoch_val *
                                  args.eval_pc_during_train)
    # Number of steps during which evaluation is not performed
    train_steps_without_evaluation = int(steps_in_epoch_train *
                                         args.delay_evaluation_epochs)

    throttle_secs = args.save_checkpoints_secs
    save_checkpoints_steps = None
    # Only one of secs and steps for checkpointing frequency is allowed to be saved
    assert (args.save_checkpoints_secs
            is not None) ^ (args.checkpoint_freq_epochs is not None)
    if args.checkpoint_freq_epochs is not None:
        save_checkpoints_steps = np.ceil(
            steps_in_epoch_train *
            args.checkpoint_freq_epochs)  # TODO Ensure this is never zero
        throttle_secs = 1

    # Number of towers
    num_shards = args.num_gpu if args.num_gpu > 0 else 1

    # Path object pointing to the location where the checkpoints and results are saved
    # If model path is provided then load a previously instantiated model and train/evaluate
    # using the previous values.

    folder_naming_vars = []
    for x in args.folder_naming_vars:
        folder_naming_vars.append(
            eval(x))  # For some reason list comprehension doesn't work

    execution_date = time.strftime("%Y%b%d", time.localtime(
    )) if args.execution_date is None else args.execution_date

    # Sagemaker provides model_dir or when running elsewhere creates new model_dir or loads previous run via model_path
    if hparams.model_dir is None:
        model_dir = retrieve_model_dir(args.log_dir_path, args.model_path,
                                       execution_date, *folder_naming_vars)
        hparams.set_hparam("model_dir", model_dir)
        setattr(args, "model_dir", model_dir)

    # Path pointing to the location of the current data set (e.g. .../numpy/lastfm_10_pc)
    data_dir = os.path.join(
        args.data_dir_path if args.data_dir_path else "",
        "" if args.exec_loc == "sagemaker" else args.dataset,
        "tfrecords" if args.input_data_format == "tfrecords" else "",
        "sharded" if args.exec_loc == "sagemaker" else "")

    # Tensorflow device allocation settings
    config_proto = tf.ConfigProto(
        allow_soft_placement=args.allow_soft_placement,
        log_device_placement=args.log_device_placement)
    config_proto.gpu_options.allow_growth = True

    # Object specifying current run settings e.g. logging frequency and num of check points saved.
    run_config = tf.estimator.RunConfig(
        tf_random_seed=args.tf_random_seed,
        model_dir=args.model_dir,
        session_config=config_proto,
        save_summary_steps=20,
        save_checkpoints_steps=save_checkpoints_steps
        if not args.overwrite else 1,
        save_checkpoints_secs=args.save_checkpoints_secs,
        keep_checkpoint_max=args.keep_checkpoint_max,
        log_step_count_steps=100,
    )

    # Instantiate an Estimator object with the model_fn from this module.
    estimator = estimator_model.create_estimator(run_config, hparams)

    # The degree of shuffling - int. Check tf.Data.dataset.shuffle() for additional documentation.
    shuffle_train = int(args.num_samples["train"] *
                        args.shuffle_train) if args.shuffle_train else 1
    shuffle_val = int(args.num_samples["val"] *
                      args.shuffle_test) if args.shuffle_test else 1

    additional_arrays = ["weights"] if args.use_weights else []

    # https://cloud.google.com/blog/products/gcp/easy-distributed-training-with-tensorflow-using-tfestimatortrain-and-evaluate-on-cloud-ml-engine
    with tf.name_scope("TrainSpec_and_hook"):
        with tf.name_scope("Early_stop_hook"):
            try:
                os.makedirs(estimator.eval_dir())
            except FileExistsError:
                pass

            training_hooks = []

            early_stopping_hook = estimator_utils.make_early_stopping_hook(
                estimator=estimator,
                metric_name=args.key_metrics[0],
                max_train_steps_without_improvement=
                max_train_steps_without_improvement,
                min_steps=train_steps_without_stopping,
                run_every_secs=None,
                run_every_steps=1)
            if args.early_stopping:
                training_hooks.append(early_stopping_hook)

            # from https://stackoverflow.com/questions/45719176/how-to-display-runtime-statistics-in-tensorboard-using-estimator-api-in-a-distri
            if args.metadata_hook_saving_frequency:
                runtime_stats_hook = estimator_utils.MetadataHook(
                    save_secs=args.metadata_hook_saving_frequency,
                    output_dir=str(args.model_dir))
                training_hooks.append(runtime_stats_hook)

            if args.profiler_hook:
                profiler_hook = tf.train.ProfilerHook(
                    save_steps=10,
                    save_secs=None,
                    output_dir=str(os.path.join(args.model_dir, "timelines")),
                    show_memory=True)
                training_hooks.append(profiler_hook)

            # Debugging
            if args.tensorboard_debug_address:
                debug_hook = tf_debug.TensorBoardDebugHook(
                    args.tensorboard_debug_address)
                training_hooks.append(debug_hook)
            if args.debug:
                debug_hook = tf_debug.LocalCLIDebugHook()
                training_hooks.append(debug_hook)
            if args.debug:
                debug_hook = tf_debug.DumpingDebugHook(args.debug_dump_path)
                training_hooks.append(debug_hook)

        with tf.name_scope("TrainSpec"):
            train_spec = tf.estimator.TrainSpec(
                input_fn=lambda: estimator_model.input_fn(
                    data_dir=data_dir,
                    subset="train",
                    num_shards=num_shards,
                    batch_size=args.train_batch_size,
                    X_cols_to_use=args.X_cols_to_use,
                    input_data_format=args.input_data_format,
                    shuffle=shuffle_train,
                    additional_arrays=additional_arrays,
                    delta_t_mean=args.delta_t_mean,
                    delta_t_std=args.delta_t_std),
                max_steps=total_train_steps if not args.overwrite else 10,
                hooks=training_hooks)

    with tf.name_scope("EvalSpec_and_exporter"):
        with tf.name_scope("Exporter"):
            # TODO Define function to process the input e.g. seq for the whole user - this function used to simulate real data
            exporters = []
            for key_metric in args.key_metrics:
                exporters.append(
                    tf.estimator.BestExporter(
                        name=key_metric,
                        serving_input_receiver_fn=estimator_model.
                        serving_input_fn(args),
                        compare_fn=estimator_checkpointing.
                        custom_checkpoint_compare_fn(default_key=key_metric),
                        exports_to_keep=1,
                        as_text=False))

        with tf.name_scope("EvalSpec"):
            eval_spec = tf.estimator.EvalSpec(
                input_fn=lambda: estimator_model.input_fn(
                    data_dir=data_dir,
                    subset="validation",
                    num_shards=num_shards,
                    batch_size=args.validation_batch_size,
                    X_cols_to_use=args.X_cols_to_use,
                    input_data_format=args.input_data_format,
                    shuffle=shuffle_val,
                    additional_arrays=additional_arrays,
                    delta_t_mean=args.delta_t_mean,
                    delta_t_std=args.delta_t_std),
                exporters=exporters if args.use_exporter else None,  #TODO
                steps=eval_steps_during_train if not args.overwrite else 1,
                throttle_secs=throttle_secs,
                start_delay_secs=args.start_delay_secs)

    if train_steps_without_evaluation > 0:
        print(
            "Starting preliminary training for {} steps during which no evaluation is performed."
            .format(train_steps_without_evaluation))
        estimator.train(input_fn=lambda: estimator_model.input_fn(
            data_dir=data_dir,
            subset="train",
            num_shards=num_shards,
            batch_size=args.train_batch_size,
            X_cols_to_use=args.X_cols_to_use,
            input_data_format=args.input_data_format,
            shuffle=shuffle_train,
            additional_arrays=additional_arrays,
            delta_t_mean=args.delta_t_mean,
            delta_t_std=args.delta_t_std),
                        max_steps=train_steps_without_evaluation
                        if not args.overwrite else 10,
                        hooks=training_hooks)
        # Export the model for the offchance that the metrics for validation don't improve after the first run
        # when I believe no export is performed
        export_dir = os.path.join(args.model_dir, "export",
                                  args.key_metrics[0])
        estimator.export_savedmodel(export_dir,
                                    estimator_model.serving_input_fn(args),
                                    strip_default_attrs=True)

    print(
        "Starting Train and Evaluate for {} training steps with Evaluation every {} second(s) or {} steps for {} evaluation steps."
        .format(total_train_steps, throttle_secs, save_checkpoints_steps,
                eval_steps_during_train))

    with tf.name_scope("Train_and_Evaluate"):
        tf.estimator.train_and_evaluate(estimator=estimator,
                                        train_spec=train_spec,
                                        eval_spec=eval_spec)
    if args.exec_loc == "sagemaker":
        updated_model_path = estimator_sagemaker.sagemaker_postprocessing(args)
        predictor_param_names = [
            "predictor_s3_input_path", "predictor_s3_output_path",
            "predictor_batch_size"
        ]
        predictor_params = [getattr(args, x) for x in predictor_param_names]
        if np.all([x is not None for x in predictor_params]):
            estimator_sagemaker.predict_s3_numpy(
                saved_model_path=updated_model_path,
                input_s3_path=args.predictor_s3_input_path,
                output_s3_path=args.predictor_s3_output_path,
                batch_size=args.predictor_batch_size)
    else:

        # Evaluate trained model
        steps_in_epoch_test = np.ceil(args.num_samples["test"] /
                                      args.validation_batch_size)
        shuffle_test = args.num_samples["train"] if args.shuffle_test else 1

        with tf.name_scope("Evaluate_trained_model"):

            train_input_fn = lambda: estimator_model.input_fn(
                data_dir=data_dir,
                subset="train",
                num_shards=
                num_shards,  #Switch to one and adjust bs/num_gpu for single device
                batch_size=args.
                train_batch_size,  #TODO Does that work for serving
                X_cols_to_use=args.X_cols_to_use,
                input_data_format=args.input_data_format,
                shuffle=shuffle_train,
                additional_arrays=additional_arrays,
                delta_t_mean=args.delta_t_mean,
                delta_t_std=args.delta_t_std)

            test_input_fn = lambda: estimator_model.input_fn(
                data_dir=data_dir,
                subset="test",
                num_shards=num_shards,
                batch_size=args.validation_batch_size,
                X_cols_to_use=args.X_cols_to_use,
                input_data_format=args.input_data_format,
                shuffle=shuffle_test,
                additional_arrays=additional_arrays,
                delta_t_mean=args.delta_t_mean,
                delta_t_std=args.delta_t_std)

            if not args.final_eval_multiple_models:

                # Find best checkpoint and its associated metrics
                best_checkpoint_path, best_checkpoint_metrics = estimator_checkpointing.best_checkpoint(
                    model_dir=args.model_dir,
                    eval_dir=estimator.eval_dir(),
                    metric=args.key_metrics[0])
                print("Best checkpoint: {}".format(best_checkpoint_path))
                print("Best metrics: {}".format(best_checkpoint_metrics))

                # Remove model_dir from previous run_config as that causes evaluation to ignore warm_start_from
                eval_run_config = deepcopy(run_config)
                setattr(eval_run_config, "_model_dir", None)

                # New estimator restarted with best result for user-specified metric
                estimator = estimator_model.create_estimator(
                    eval_run_config,
                    hparams,
                    warm_start_from=best_checkpoint_path)

                train_results = estimator.evaluate(input_fn=train_input_fn,
                                                   steps=steps_in_epoch_train)
                print("Final evaluation on train subset: {}".format(
                    train_results))

                test_results = estimator.evaluate(input_fn=test_input_fn,
                                                  steps=steps_in_epoch_test)
                print(
                    "Final evaluation on test subset: {}".format(test_results))

            else:
                estimator_checkpointing.evaluate_multiple_checkpoints(
                    model_dir=args.model_dir,
                    eval_dir=estimator.eval_dir(),
                    num_checkpoints=args.keep_checkpoint_max,
                    metric=args.key_metrics[0],
                    input_fn=test_input_fn,
                    run_config=run_config,
                    hparams=hparams,
                    num_steps_in_eval=steps_in_epoch_test
                    if not args.overwrite else 1)

        if args.clear_checkpoints:

            rm_graph_command = "for f in $(find {} -name 'graph.pbtxt'); do rm $f; done".format(
                str(model_dir))
            rm_checkpoints_command = "for f in $(find {} -name 'model.ckpt-*'); do rm $f; done".format(
                str(model_dir))

            process = subprocess.run(rm_graph_command, shell=True, check=True)
            process = subprocess.run(rm_checkpoints_command,
                                     shell=True,
                                     check=True)

            print("Cleared model_dir: {}".format(str(model_dir)))
Example #18
def main(_):
    # Load datasets.
    if FLAGS.fake_data:

        def training_input_fn():
            return ({
                "features": tf.random_normal([128, 4])
            }, tf.random_uniform([128], minval=0, maxval=3, dtype=tf.int32))

        def test_input_fn():
            return ({
                "features": tf.random_normal([32, 4])
            }, tf.random_uniform([32], minval=0, maxval=3, dtype=tf.int32))

        feature_columns = [
            tf.feature_column.numeric_column("features", shape=(4, ))
        ]
    else:
        training_data_path, test_data_path = maybe_download_data(
            FLAGS.data_dir)
        column_names = [
            "sepal_length", "sepal_width", "petal_length", "petal_width",
            "label"
        ]
        batch_size = 32

        def training_input_fn():
            return tf.contrib.data.make_csv_dataset([training_data_path],
                                                    batch_size,
                                                    column_names=column_names,
                                                    label_name="label")

        def test_input_fn():
            return tf.contrib.data.make_csv_dataset([test_data_path],
                                                    batch_size,
                                                    column_names=column_names,
                                                    label_name="label")

        feature_columns = [
            tf.feature_column.numeric_column(feature)
            for feature in column_names[:-1]
        ]

    # Build 3 layer DNN with 10, 20, 10 units respectively.
    model_dir = FLAGS.model_dir or tempfile.mkdtemp(
        prefix="debug_tflearn_iris_")

    classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            model_dir=model_dir)

    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    hooks = []
    if FLAGS.debug:
        hooks.append(
            tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type,
                                       dump_root=FLAGS.dump_root))
    elif FLAGS.tensorboard_debug_address:
        hooks.append(
            tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address))

    # Train model, using tfdbg hook.
    classifier.train(training_input_fn, steps=FLAGS.train_steps, hooks=hooks)

    # Evaluate accuracy, using tfdbg hook.
    accuracy_score = classifier.evaluate(test_input_fn,
                                         steps=FLAGS.eval_steps,
                                         hooks=hooks)["accuracy"]

    print("After training %d steps, Accuracy = %f" %
          (FLAGS.train_steps, accuracy_score))

    # Make predictions, using tfdbg hook.
    predict_results = classifier.predict(test_input_fn, hooks=hooks)
    print("A prediction result: %s" % next(predict_results))
Example #19
# Threshold on what messages are to be logged
tf.logging.set_verbosity(tf.logging.INFO)
#importing debug library
from tensorflow.python import debug as tf_debug

# ## Debugger
#
# ### Uncomment the below line and execute the code to run the debugger.
#
# ### Go to the link once you start execution: http://localhost:6006/

# In[2]:

# Uncomment the below line to run the debugger
# Add hooks=[hook] as a parameter to the estimator calls below (see the sketch below)
hook = tf_debug.TensorBoardDebugHook("localhost:6064",
                                     send_traceback_and_source_code=False)

#hook = tf_debug.LocalCLIDebugHook()
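
# A minimal sketch of how the hook above could be attached (the estimator and
# input_fn names are hypothetical; they are not defined at this point in the
# script):
#
#   mnist_classifier.train(input_fn=train_input_fn, steps=20000, hooks=[hook])
#
# and how TensorBoard would be started against the same debugger port:
#
#   tensorboard --logdir=<model_dir> --port 6006 --debugger_port 6064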
# In[3]:


def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
Example #20
def train(train_model, eval_model=None, debug_port=None):
  if eval_model is not None and 'eval_steps' not in eval_model.params:
    raise ValueError("eval_steps parameter has to be specified "
                     "if eval_model is provided")
  hvd = train_model.hvd
  if hvd:
    master_worker = hvd.rank() == 0
  else:
    master_worker = True

  # initializing session parameters
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  # pylint: disable=no-member
  sess_config.gpu_options.allow_growth = True
  if hvd is not None:
    # pylint: disable=no-member
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

  # defining necessary hooks
  hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
  if hvd is not None:
    hooks.append(BroadcastGlobalVariablesHook(0))

  if master_worker:
    checkpoint_dir = train_model.params['logdir']
    base_ckpt_dir = train_model.params['load_model']
  else:
    checkpoint_dir = None
    base_ckpt_dir = None

  if eval_model is not None:
    # noinspection PyTypeChecker
    hooks.append(
        RunEvaluationHook(
            every_steps=eval_model.params['eval_steps'],
            model=eval_model,
            last_step=train_model.last_step,
            print_ppl=isinstance(eval_model.get_data_layer(), WKTDataLayer),
        ),
    )

  if master_worker:
    if train_model.params['save_checkpoint_steps'] is not None:
      # noinspection PyTypeChecker
      saver = tf.train.Saver(save_relative_paths=True)
      hooks.append(tf.train.CheckpointSaverHook(
          checkpoint_dir,
          saver=saver,
          save_steps=train_model.params['save_checkpoint_steps'],
      ))
    if train_model.params['print_loss_steps'] is not None:
      # noinspection PyTypeChecker
      hooks.append(PrintLossAndTimeHook(
          every_steps=train_model.params['print_loss_steps'],
          model=train_model,
          print_ppl=isinstance(train_model.get_data_layer(), WKTDataLayer),
      ))
    if train_model.params['print_samples_steps'] is not None:
      # noinspection PyTypeChecker
      hooks.append(PrintSamplesHook(
          every_steps=train_model.params['print_samples_steps'],
          model=train_model,
      ))

  total_time = 0.0
  bench_start = train_model.params.get('bench_start', 10)

  if debug_port:
    hooks.append(
        tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port))
    )

  if train_model.on_horovod:
    init_data_layer = train_model.get_data_layer().iterator.initializer
  else:
    init_data_layer = tf.group(
        [train_model.get_data_layer(i).iterator.initializer
         for i in range(train_model.num_gpus)]
    )
  
  fine_tuning = (not base_ckpt_dir) or tf.train.latest_checkpoint(checkpoint_dir)
  if fine_tuning:   
    scaffold = tf.train.Scaffold(
        local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer)
    )
  else:
    scaffold = TransferScaffold(
        local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer)
    )
  fetches = [train_model.train_op]
  try:
    total_objects = 0.0
    # on horovod num_gpus is 1
    for worker_id in range(train_model.num_gpus):
      fetches.append(train_model.get_num_objects_per_step(worker_id))
  except NotImplementedError:
    deco_print("WARNING: Can't compute number of objects per step, since "
               "train model does not define get_num_objects_per_step method.")

  # starting training
  if fine_tuning:
    sess = TransferMonitoredTrainingSession(
      scaffold=scaffold,
      checkpoint_dir=checkpoint_dir,
      save_summaries_steps=train_model.params['save_summaries_steps'],
      config=sess_config,
      save_checkpoint_secs=None,
      log_step_count_steps=train_model.params['save_summaries_steps'],
      stop_grace_period_secs=300,
      hooks=hooks,
      base_ckpt_dir=base_ckpt_dir,
      load_fc=train_model.params['load_fc'])
  else:
    sess = tf.train.MonitoredTrainingSession(
      scaffold=scaffold,
      checkpoint_dir=checkpoint_dir,
      save_summaries_steps=train_model.params['save_summaries_steps'],
      config=sess_config,
      save_checkpoint_secs=None,
      log_step_count_steps=train_model.params['save_summaries_steps'],
      stop_grace_period_secs=300,
      hooks=hooks)
  step = 0
  num_bench_updates = 0
  while True:
    if sess.should_stop():
      break
    tm = time.time()
    try:
      feed_dict = {}
      iter_size = train_model.params.get('iter_size', 1)
      if iter_size > 1:
        feed_dict[train_model.skip_update_ph] = step % iter_size != 0
      if step % iter_size == 0:
        if step >= bench_start:
          num_bench_updates += 1
        fetches_vals = sess.run(fetches, feed_dict)
      else:
        # necessary to skip "no-update" steps when iter_size > 1
        def run_with_no_hooks(step_context):
          return step_context.session.run(fetches, feed_dict)
        fetches_vals = sess.run_step_fn(run_with_no_hooks)
    except tf.errors.OutOfRangeError:
      break
    if step >= bench_start:
      total_time += time.time() - tm
      if len(fetches) > 1:
        for i in range(train_model.num_gpus):
          total_objects += np.sum(fetches_vals[i + 1])
        if train_model.params['print_bench_info_steps'] is not None:
          if step % train_model.params['print_bench_info_steps'] == 0:
            total_objects_cur = collect_if_horovod(total_objects, hvd,
                                                   mode="sum")
            if master_worker:
              avg_objects = 1.0 * total_objects_cur / total_time
              deco_print("Avg objects per second: {:.3f}".format(avg_objects))

    step += 1
  sess.close()

  if len(fetches) > 1:
    total_objects = collect_if_horovod(total_objects, hvd, mode="sum")

  if master_worker:
    deco_print("Finished training")
    if step > bench_start:
      avg_time = 1.0 * total_time / num_bench_updates
      deco_print("Avg time per step: {:.3f}s".format(avg_time))
      if len(fetches) > 1:
        avg_objects = 1.0 * total_objects / total_time
        deco_print("Avg objects per second: {:.3f}".format(avg_objects))
    else:
      deco_print("Not enough steps for benchmarking")
Example #21
def main(argv):
	args = parser.parse_args(argv[1:])

	# handling commandline parameters
	logging.info("Cmdline Input: {}".format(argv))
	TRAINING = args.training
	WITHPLOT = args.plot
	singleData = args.single
	FAKE = args.fake
	numberPrint = args.plotNo
	hyperParamFile = args.hyperparams
	saving = args.save
	loading = args.load
	augment = args.augment
	filterBool = args.filter
	overrideModelPath = args.overrideModel
	overrideInputPath = args.overrideInput
	usingCustomEstimator = args.custom

	displayWeights = args.dispWeights
	DEBUG = args.debug
	tensorboardDebugAddress = args.tensorboard_debug_address
	progressPlot = args.progressPlot

	maximumLossAnalysis = args.lossAna
	cancelThreshold = args.target

	# Commandline parameters sanity checks
	saveLoc = None
	if args.save is not None and args.load is not None:
		raise ValueError(
			"The --load and --save flags are mutually exclusive.")

	if args.save is not None and len(args.save) not in (0, 1):
		parser.error('Either give no value for save, or one, not {}.'.format(len(args.save)))
	elif args.save is not None:
		if len(args.save) == 0:
			# save to default location
			saveLoc = None
		elif len(args.save) == 1:
			# custom save location
			saveLoc = args.save[0]

	loadLoc = None
	if args.load is not None and len(args.load) not in (0, 1):
		parser.error('Either give no values for load, or one, not {}.'.format(len(args.load)))
		sys.exit(-1)
	elif args.load is not None:
		if len(args.load) == 0:
			# load from default location
			loadLoc = None
		elif len(args.load) == 1:
			# custom load location
			loadLoc = args.load[0]

	if args.separator is not None and FAKE:
		parser.error('No fake data for separator training (yet)')

	if args.separator is not None and len(args.separator) not in (0, 2):
		parser.error('Separator needs 2 Integers representing prediction Close off and separator position: given {}'.format(len(args.separator)))
	elif args.separator is not None:
		separator = True
		if len(args.separator) == 0:
			separatorPosition = 1550
			predictionCutOff = 1300
		else:
			separatorPosition = args.separator[0]
			predictionCutOff = args.separator[1]
	else:
		separator = False

	if cancelThreshold is not None and not TRAINING:
		logging.warning("target parameter is not useful when not in training")


	time_stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H.%M.%S')

	# load hyperparameters from hyperparameter file
	try:
		hyper_params = load_params(hyperParamFile)

		STEPS_PER_EPOCH = hyper_params.train.steps_per_epoch
		EPOCHS = hyper_params.train.epochs
		BATCH_SIZE = hyper_params.train.batch_size

		FEATURE_SIZE = hyper_params.arch.feature_size
		ACTIVATION = hyper_params.arch.activation # "leaky_relu", "relu", "linear", TODO: "sigmoid", "tanh"
		dropout = hyper_params.arch.dropout_rate
		hidden_layers = hyper_params.arch.hidden_layers
		regularization = hyper_params.arch.regularization

		if regularization is None or regularization.lower() == "no":
			l1regularization = False
			l2regularization = False
		elif regularization.lower() == "l1":
			l1regularization = True
			l2regularization = False
		elif regularization.lower() == "l2":
			l1regularization = False
			l2regularization = True
		else:
			raise AttributeError('invalid string in hyper_params.arch.regularization')

		if FAKE:
			FAKE_DATA_AMOUNT = hyper_params.data.numberFakeLines
		if augment:
			MIDPOINT = hyper_params.data.augmentMidpoint
			MIRRORRANGE = hyper_params.data.augmentRange
		testSize = hyper_params.data.testSize
		limits = hyper_params.data.limits

		elementsDirection = hyper_params.data.direction
		if elementsDirection.lower() == "y":
			elementsDirectionBool = True
		elif elementsDirection.lower() == "x":
			elementsDirectionBool = False

		unitLocDirection = hyper_params.data.unitLoc
		unitTimeDirection = hyper_params.data.unitTime
		units = {'loc': unitLocDirection, 'time':unitTimeDirection}

		optimizer = hyper_params.train.optimizer # "Adam", "Adagrad"
		learningRate = hyper_params.train.learning_rate
		decaySteps = hyper_params.train.decay_steps

		if overrideInputPath is None:
			dataFolder = hyper_params.problem.data_path
		else:
			dataFolder = overrideInputPath

		baseModelPath = hyper_params.problem.modelBasePath
		baseImagePath = hyper_params.problem.imagePath
		if args.separator is None:
			if hyper_params.problem.separator == 1:
				separator = True
				separatorPosition = hyper_params.problem.separatorPosition
				predictionCutOff = hyper_params.problem.predictionCutOff
				thresholdPoint = hyper_params.problem.thresholdPoint
			else:
				separator = False

	except AttributeError as err:
		logging.error("Error in Parameters. Maybe mistake in hyperparameter file?")
		logging.error("AttributeError: {0}".format(err))
		sys.exit(1)
	except Exception as e:
		logging.error("Some kind of error? not sure: {}".format(e))
		sys.exit(1)



	if loading is None:
		# Generate feature-label-pairs from given csv track files based on given parameters
		if not FAKE and not separator:
			(F_train, L_train), (F_test, L_test), (labelMeans, labelStds) = ld.loadRawMeasNextStep(dataFolder, FEATURE_SIZE, testSize)
		elif separator:
			(F_train, L_train), (F_test, L_test), (labelMeans, labelStds) = ld.loadRawMeasSeparation(dataFolder, FEATURE_SIZE, testSize,
																			separatorPosition, predictionCutOff,
																			elementsDirectionBool)
			if filterBool:
				F_train = filterDataForIntersection(F_train, thresholdPoint, elementsDirectionBool)
				F_test = filterDataForIntersection(F_test, thresholdPoint, elementsDirectionBool)
				L_train = L_train.loc[F_train.index]
				L_test = L_test.loc[F_test.index]

		else:
			(F_train, L_train), (F_test, L_test) = ld.loadFakeDataPandas(FEATURE_SIZE, FAKE_DATA_AMOUNT, testSize)

		# TODO: rather ugly - this could be done better
		if singleData:
			F_train = pd.concat([F_train, F_test])
			F_test = F_train
			L_train = pd.concat([L_train, L_test])
			L_test = L_train

		# ExTODO: find Augmentation MIDPOINT from data or as argument? - from Argument
		# Applying augmentation to feature-label-pairs
		if augment:
			logging.info("applying augmentation to Training Set...")
			if separator:
				F_train, L_train = augmentData(F_train, L_train, MIDPOINT, MIRRORRANGE, separator, labelMeans, labelStds, direction=elementsDirectionBool)
			else:
				F_train, L_train = augmentData(F_train, L_train, MIDPOINT, MIRRORRANGE, separator, labelMeans, labelStds, direction=elementsDirectionBool)
			state = random.randint(1, 101)
			F_train = F_train.sample(frac=1, random_state=state)
			L_train = L_train.sample(frac=1, random_state=state)
			logging.info("done!")

	# Network Design
	# --------------

	my_feature_columns = []
	columnNames = ld.genColumnNames(FEATURE_SIZE)
	for key in columnNames:
		my_feature_columns.append(tf.feature_column.numeric_column(key=key))

	if not overrideModelPath:
		MODEL_PATH = baseModelPath  # genModelPath(hyper_params, FAKE, usingCustomEstimator, separator)
	else:
		MODEL_PATH = overrideModelPath

	logging.info("time: {}".format(time_stamp))
	logging.info('Saving to %s' % MODEL_PATH)

	# Preparing the initialisation of the estimator
	if optimizer == 'Adagrad':
		opti = tf.train.AdagradOptimizer
	elif optimizer == 'Adam':
		opti = tf.train.AdamOptimizer
	# elif optimizer == 'GradientDescent':
	# 	opti = tf.train.GradientDescentOptimizer
	else:
		logging.error("No (or wrong) optimizer given in hyperparameter file")
		sys.exit(-1)

	if ACTIVATION == 'relu':
		acti = tf.nn.relu
	elif ACTIVATION == 'leaky_relu':
		acti = tf.nn.leaky_relu
	elif ACTIVATION == 'linear':
		acti = None
	else:
		logging.error("No (or wrong) activation function given in hyperparameter file")
		sys.exit(-1)

	# File System preparation: check if right folders exist and create them if they dont
	if not os.path.exists(MODEL_PATH):
		os.makedirs(MODEL_PATH)
		logging.info("model folder {} does not exist. Creating folder".format(MODEL_PATH))
	elif os.path.exists(MODEL_PATH) and not os.path.isdir(MODEL_PATH):
		logging.error("There is a file in the place where one would like to save their files..")
		sys.exit(1)

	if not os.path.exists(baseImagePath):
		os.makedirs(baseImagePath)
		logging.info("image folder: {} does not exist. Creating folder".format(MODEL_PATH))

	if not os.path.exists(MODEL_PATH + '/' + os.path.basename(hyperParamFile)):
		shutil.copy2(hyperParamFile, MODEL_PATH + '/' + os.path.basename(hyperParamFile))
		# print("new hyperParam File written")
	else:
		shutil.copy2(hyperParamFile, MODEL_PATH + '/' + os.path.basename(hyperParamFile)[:-5] + time_stamp + ".json")
		# print("added another version of hyper param file")

	# Saving the generated feature-label-pairs for future use
	if saving is not None:
		if saveLoc is None:
			saveLoc = MODEL_PATH + '/data.h5'

		logging.info("storing data in {}".format(saveLoc))

		with pd.HDFStore(saveLoc) as store:
			store['xtrain'] = F_train
			store['ytrain'] = L_train

			store['xtest'] = F_test
			store['ytest'] = L_test

			store['labelMeans'] = labelMeans
			store['labelStds'] = labelStds

	# loading a set of pregenerated feature-label-pairs for usage
	if loading is not None:
		try:
			if loadLoc is None:
				loadLoc = MODEL_PATH + '/data.h5'

			logging.info("loading data from {}.".format(loadLoc))

			with pd.HDFStore(loadLoc) as store:
				F_train = store['xtrain']
				L_train = store['ytrain']

				F_test = store['xtest']
				L_test = store['ytest']

				labelMeans = store['labelMeans']
				labelStds = store['labelStds']

		except Exception as e:
			logging.error("Error while loading from stored data: {}".format(e))
			sys.exit(1)

	assert not F_train.index.duplicated().any()
	assert not L_train.index.duplicated().any()
	assert not F_test.index.duplicated().any()
	assert not L_test.index.duplicated().any()

	# Plot progress Vars - more or less deprecated, but could be updated for current state
	if progressPlot:
		pos = [int(i * EPOCHS/10) for i in range(1, 10)]
		debugVisualizerIndex = random.randint(0, F_test.shape[0] - 1)
		featureVals = F_test.iloc[[debugVisualizerIndex]]
		labelVals = L_test.iloc[[debugVisualizerIndex]]
		predictions = []

	if not usingCustomEstimator:
		# Validation and Test Configuration
		logging.info("using premade Estimator")
		test_config = estimator.RunConfig(save_checkpoints_steps=50000,
										  save_checkpoints_secs=None, save_summary_steps=100)

		regressor = estimator.DNNRegressor(feature_columns=my_feature_columns,
										   label_dimension=2,
										   hidden_units=hidden_layers,
										   model_dir=MODEL_PATH,
										   dropout=dropout,
										   activation_fn=acti,
										   config=test_config,
										   optimizer=opti(learning_rate=learningRate)
										   )
	else:
		logging.info("using custom estimator")
		test_config = estimator.RunConfig(save_checkpoints_steps=100000,
										  save_checkpoints_secs=None,
										  save_summary_steps=500)

		useRatioScaling = False  # TODO: consider whether there is still a sensible use for this

		if separator and useRatioScaling:
			medianDim1 = L_train.iloc[:,0].median()
			medianDim2 = L_train.iloc[:,1].median()
			ratio = medianDim1 / medianDim2

			scaleDim1 = 1.0
			scaleDim2 = ratio
			logging.info("scaling loss between different dimensions. ScaleDim2-Ratio: {}".format(ratio))

		else:
			scaleDim1 = 1.0
			scaleDim2 = 1.0
		regressor = estimator.Estimator(
			model_fn=cE.myCustomEstimator,
			config=test_config,
			model_dir=MODEL_PATH,
			params={
				"feature_columns": my_feature_columns,
				"learning_rate": learningRate,
				"optimizer": opti,
				"hidden_units": hidden_layers,
				"dropout": dropout,
				"activation": acti,
				"decaying_learning_rate": True,
				"decay_steps": decaySteps,
				"l1regularization": l1regularization,
				"l2regularization": l2regularization,
				"scaleDim1": scaleDim1,
				"scaleDim2": scaleDim2,
				"regularizationStrength": 5e-08
			})

	hooks = None

	# Debug hooks are handled here
	if DEBUG and tensorboardDebugAddress:
		raise ValueError(
			"The --debug and --tensorboard_debug_address flags are mutually "
			"exclusive.")
	if DEBUG:
		hooks = [tf_debug.LocalCLIDebugHook()]

	# Start tensorboard with debugger port argument: "tensorboard --logdir=./debug2 --debugger_port 6007"
	elif tensorboardDebugAddress:
		hooks = [tf_debug.TensorBoardDebugHook(tensorboardDebugAddress)]
	# hooks = [debug_hook]

	logging.info("Train: ({}, {})".format(F_train.shape, L_train.shape))
	logging.info("Test: ({}, {})".format(F_test.shape, L_test.shape))
	logging.info("Means: \n{}".format(labelMeans))
	logging.info("Stds: \n{}".format(labelStds))


	# Train it
	if TRAINING:

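		# Persist the label means/stds next to the model so that later evaluation
		# runs can denormalize predictions with the same statistics.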
		if not os.path.exists(MODEL_PATH + '/meanstd.pkl'):
			with open(MODEL_PATH + "/meanstd.pkl", 'wb') as f:
				pickle.dump([labelMeans, labelStds], f)
		else:
			with open(MODEL_PATH + "/meanstd.pkl", 'rb') as f:
				[labelMeansTemp, labelStdsTemp] = pickle.load(f)

				if not ((labelMeansTemp == labelMeans).all() and (labelStdsTemp == labelStds).all()): # does this work with float?
					logging.warning("CAREFUL: LabelMeans or LabelStds do not match existing values! Training with new values")

		logging.info('Train the DNN Regressor...\n')
		# test = tf.train.get_or_create_global_step()
		# logging.info("test: {}".format(test))

		epochInterm = []
		startTimeTraining = timer()

		for epoch in range(EPOCHS):

			# Fit the DNNRegressor
			# regressor.train(input_fn=training_input_fn(batch_size=BATCH_SIZE), steps=STEPS_PER_EPOCH)
			regressor.train(input_fn=lambda: training_input_fn_Slices(F_train, L_train, BATCH_SIZE),
			                steps=STEPS_PER_EPOCH, hooks=hooks)


			# Start Tensorboard in Terminal:
			# 	tensorboard --logdir='./DNNRegressors/'
			# Now open a browser and visit localhost:6006

			if epoch % 10 == 0:
				logging.info("Progress: epoch " + str(epoch))
				# logging.info("Progress: global step: {}".format(tf.train.get_global_step()))

				eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE))
				logging.info("eval: " + str(eval_dict))

				avgLoss = eval_dict['average_loss']
				epochInterm.append(avgLoss)

				# optional canceling of training upon hitting a specified loss threshold
				if cancelThreshold is not None:
					if avgLoss < cancelThreshold:
						logging.info("reached cancel Threshold. finishing training")
						break

			if progressPlot and epoch in pos:
				# TODO: adapt or remove because of standardize and normalize
				debug_pred = regressor.predict(input_fn=lambda: eval_input_fn(featureVals, labels=None, batch_size=BATCH_SIZE))
				debug_predicted = [p['predictions'] for p in debug_pred]
				predictions.append(debug_predicted)

		eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE))

		logging.info("Training completed. final average loss: {}, best average loss during training: {}".format(
						eval_dict['average_loss'], min(epochInterm)))

		endTimeTraining = timer()
		timeTotal = endTimeTraining - startTimeTraining
		hours = timeTotal // 3600
		timeTotal %= 3600
		minutes = timeTotal // 60
		timeTotal %= 60
		logging.info("Total Training time: {}h {}min {}s".format(int(hours), int(minutes), int(timeTotal)))

		if progressPlot:
			if FAKE:
				savePath = '/home/hornberger/testFake'
			else:
				savePath = '/home/hornberger/testReal'
			plotTrainDataPandas(featureVals, labelVals, predictions, savePath, units)

	# Evaluation/Prediction
	else:
		logging.info('No training today, just prediction')

		if not os.path.exists(MODEL_PATH + '/meanstd.pkl'):
			logging.warning("Careful: No prior LabelMeans or LabelStds found!")
		else:
			with open(MODEL_PATH + "/meanstd.pkl", 'rb') as f:
				[labelMeansTemp, labelStdsTemp] = pickle.load(f)

				if not ((labelMeansTemp == labelMeans).all() and (labelStdsTemp == labelStds).all()): # does this work with float?
					logging.warning("evaluation on different dataset. replacing current labelMeans and labelStds")

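					# Undo the normalization with the current statistics, then re-apply it
					# with the statistics that were stored when the model was trained.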
					L_test = L_test * labelStds + labelMeans

					labelMeans = labelMeansTemp
					labelStds = labelStdsTemp

					logging.info("New labelMeans: \n{}".format(labelMeans))
					logging.info("New labelStds: \n{}".format(labelStds))

					L_test = (L_test - labelMeans) / labelStds

		try:
			# Prediction
			eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(F_test, L_test, BATCH_SIZE))
			logging.info('Error on whole Test set:\nMSE (tensorflow): {}'.format(eval_dict['average_loss']))
			averageLoss = eval_dict['average_loss']

		except ValueError as err:
			# probably failed to load model
			logging.error("{}".format(err))
			sys.exit(1)

		except Exception as e:
			logging.error("Unknown Error while trying to evaluate: {}".format(e))
			sys.exit(1)

		assert numberPrint < L_test.shape[0]

		sampleIndex = random.randint(0, L_test.shape[0] - numberPrint)

		# x_pred2 = F_test.iloc[[sampleIndex + i for i in range(numberPrint)]]
		# y_vals2 = L_test.iloc[[sampleIndex + i for i in range(numberPrint)]]

		x_pred2 = F_test.sample(n=numberPrint, random_state=sampleIndex)
		y_vals2 = L_test.sample(n=numberPrint, random_state=sampleIndex)
		y_vals2Denormalized = y_vals2.copy()
		for k in L_test.columns:
			y_vals2Denormalized[k] = y_vals2Denormalized[k] * labelStds[k] + labelMeans[k]

		print(x_pred2)
		print(y_vals2 * labelStds + labelMeans)

		startTime = timer()
		y_predGen = regressor.predict(input_fn=lambda: eval_input_fn(x_pred2, labels=None, batch_size=BATCH_SIZE))
		y_predicted = [p['predictions'] for p in y_predGen]
		endTime = timer()
		print("predicted: ")
		y_predictedCorr = [[v * s + m for v, s, m in zip(pred, labelStds, labelMeans)] for pred in y_predicted]  # Look, ye mighty, and despair!
		for i in y_predictedCorr:
			print(i)
		print("time: {:.2f}s".format((endTime - startTime)))

		eval_dict = regressor.evaluate(input_fn=lambda: eval_input_fn(x_pred2, y_vals2, batch_size=BATCH_SIZE))
		print('MSE (tensorflow): {}'.format(eval_dict['average_loss']))

		# Maximum Loss Analysis: display the X worst predictions of the testset
		if maximumLossAnalysis:
			if not separator:
				printDF = prepareMaximumLossAnalysisNextStep(F_test, L_test, numberPrint, regressor, BATCH_SIZE, labelMeans, labelStds)
				plotDataNextStepPandas(numberPrint, printDF[columnNames], printDF[['LabelX', 'LabelY']],
								   printDF[['PredictionX', 'PredictionY']], baseImagePath, limits, units,
								   os.path.basename(MODEL_PATH) + '_' + 'highestLoss' + '_' + time_stamp + '.pdf')
			else:
				printDF = prepareMaximumLossAnalysisSeparator(F_test, L_test, numberPrint, regressor, BATCH_SIZE, labelMeans, labelStds)
				# printDF['LabelPosBalken'] = printDF['LabelPosBalken'] * labelStds['LabelPosBalken'] + labelMeans['LabelPosBalken']
				plotDataSeparatorPandas(numberPrint, printDF[columnNames], printDF[['LabelPosBalken']],
										separatorPosition, printDF[['PredictionIntersect']], baseImagePath, limits, units, elementsDirectionBool,
										os.path.basename(MODEL_PATH) + '_' + 'highestLoss' + '_' + time_stamp + '.pdf')
			# print(printDF)

		# displaying weights in Net - (a bit redundant after implementation of debugger)
		if displayWeights:
			for variable in regressor.get_variable_names():
				logging.info("name: \n{}\nvalue: \n{}\n".format(variable, regressor.get_variable_value(variable)))

			weights = regressor.get_variable_value('dense/kernel')
			plt.imshow(weights, cmap='coolwarm')
			plt.show()

		# # Final Plot
		if WITHPLOT:
			L_trainDenormalized = L_train * labelStds + labelMeans
			L_testDenormalized = L_test * labelStds + labelMeans
			if not separator:
				plotDataNextStepPandas(numberPrint, x_pred2, y_vals2Denormalized, y_predictedCorr, baseImagePath, limits, units,
								   os.path.basename(MODEL_PATH) + '_' + time_stamp + '.pdf')

				totalPredictGen = regressor.predict(input_fn=lambda: eval_input_fn(F_test, labels=None, batch_size=BATCH_SIZE))
				totalPredictions = [p['predictions'] for p in totalPredictGen]
				totalPredictionsCorr = [[v * s + m for v, s, m in zip(pred, labelStds, labelMeans)] for pred in totalPredictions]  # Look, ye mighty, and despair!
				evaluateResultNextStep(F_test, L_testDenormalized, totalPredictionsCorr, units, baseImagePath)

			else:
				# y_vals2Denormalized = y_vals2['LabelPosBalken'] * labelStds['LabelPosBalken'] + labelMeans['LabelPosBalken']
				# y_predictedCorr = list(map(lambda x: [v * labelStds[k] + labelMeans[k] for k,v in enumerate(x)], y_predicted))

				plotDataSeparatorPandas(numberPrint, x_pred2, y_vals2Denormalized['LabelPosBalken'], separatorPosition,
										y_predictedCorr, baseImagePath, limits, units,  elementsDirectionBool,
										os.path.basename(MODEL_PATH) + '_' + time_stamp + '.pdf')
				totalPredictGen = regressor.predict(input_fn=lambda: eval_input_fn(F_test, labels=None, batch_size=BATCH_SIZE))
				totalPredictions = [p['predictions'] for p in totalPredictGen]
				totalPredictionsCorr = [[v * s + m for v, s, m in zip(pred, labelStds, labelMeans)] for pred in totalPredictions]  # Look, ye mighty, and despair!

				filteredFeatures = filterDataForIntersection(F_train, thresholdPoint, elementsDirectionBool)
				medianAccel = getMedianAccel(filteredFeatures, separator, elementsDirectionBool)
				optimalAccel = getOptimalAccel(filteredFeatures, L_trainDenormalized.loc[filteredFeatures.index], separatorPosition, elementsDirectionBool)
				bias = getCVBias(filteredFeatures, L_trainDenormalized.loc[filteredFeatures.index], separatorPosition, elementsDirectionBool)

				configDict = {'medAc': medianAccel, 'optAc': optimalAccel, 'cvBias': bias}

				evaluateResultSeparator(F_test, L_testDenormalized, totalPredictionsCorr, separatorPosition, thresholdPoint,
										configDict, units, baseImagePath, elementsDirectionBool)
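# The input functions used above (training_input_fn_Slices, eval_input_fn) are
# defined elsewhere in the project and not shown here. A minimal sketch of what
# they might look like, assuming pandas DataFrames whose columns match the
# feature columns, in the style of the tf.estimator input-function tutorials:

def training_input_fn_Slices(features, labels, batch_size):
	"""Shuffled, repeating tf.data pipeline for training."""
	dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
	return dataset.shuffle(buffer_size=10000).repeat().batch(batch_size)

def eval_input_fn(features, labels, batch_size):
	"""Single-pass pipeline for evaluation; pass labels=None for prediction."""
	features = dict(features)
	inputs = features if labels is None else (features, labels)
	dataset = tf.data.Dataset.from_tensor_slices(inputs)
	return dataset.batch(batch_size)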
Exemple #22
0
def train(args: Namespace, data_params: MoleculeData, experiment: Experiment,
          mol_metrics: GraphMolecularMetrics) -> None:
    ds_train = create_dataflow(args.data_dir, 'train', args.batch_size)

    ds_train_repeat = PrefetchDataZMQ(ds_train, nr_proc=1)
    # times 2, because we consume 2 batches per step
    ds_train_repeat = RepeatedData(ds_train_repeat, 2 * args.epochs)

    train_input_fn = experiment.make_train_fn(ds_train_repeat, args.batch_size,
                                              args.num_latent, data_params)

    def hooks_fn(train_ops: MolGANTrainOps,
                 train_steps: tfgan.GANTrainSteps) -> EstimatorTrainHooks:
        if train_ops.valuenet_train_op is not None:
            generator_hook = FeedableTrainOpsHook(
                train_ops.generator_train_op,
                train_steps.generator_train_steps,
                train_input_fn,
                return_feed_dict=False)

            discriminator_hook = WithRewardTrainOpsHook([
                train_ops.discriminator_train_op, train_ops.valuenet_train_op
            ], train_steps.discriminator_train_steps, train_input_fn,
                                                        mol_metrics)
        else:
            generator_hook = FeedableTrainOpsHook(
                train_ops.generator_train_op,
                train_steps.generator_train_steps,
                train_input_fn,
                return_feed_dict=True)

            discriminator_hook = FeedableTrainOpsHook(
                train_ops.discriminator_train_op,
                train_steps.discriminator_train_steps, train_input_fn)
        return [generator_hook, discriminator_hook]

    model = experiment.make_model_fn(args, data_params, hooks_fn)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    # enable XLA JIT
    # sess_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config = tf.estimator.RunConfig(model_dir=str(args.model_dir),
                                    session_config=sess_config,
                                    save_summary_steps=ds_train.size(),
                                    save_checkpoints_secs=None,
                                    save_checkpoints_steps=4 * ds_train.size(),
                                    keep_checkpoint_max=2)

    estimator = tf.estimator.Estimator(model.model_fn, config=config)

    train_hooks = [PrintParameterSummary()]
    if args.restore_from_checkpoint is not None:
        train_hooks.append(
            RestoreFromCheckpointHook(str(args.restore_from_checkpoint)))

    if args.debug:
        from tensorflow.python import debug as tf_debug

        train_hooks.append(tf_debug.TensorBoardDebugHook("localhost:6064"))

    predict_fn = experiment.make_predict_fn(args.data_dir,
                                            args.num_latent,
                                            n_samples=1000,
                                            batch_size=1000)
    ckpt_listener = PredictAndEvalMolecule(estimator, predict_fn, mol_metrics,
                                           str(args.model_dir))

    hparams_setter = [
        ScheduledHyperParamSetter('generator_learning_rate:0',
                                  args.generator_learning_rate,
                                  [(80, 0.5 * args.generator_learning_rate),
                                   (150, 0.1 * args.generator_learning_rate),
                                   (200, 0.01 * args.generator_learning_rate)],
                                  steps_per_epoch=ds_train.size()),
        ScheduledHyperParamSetter(
            'discriminator_learning_rate:0',
            args.discriminator_learning_rate,
            [(80, 0.5 * args.discriminator_learning_rate),
             (150, 0.1 * args.discriminator_learning_rate),
             (200, 0.01 * args.discriminator_learning_rate)],
            steps_per_epoch=ds_train.size())
    ]
    train_hooks.extend(hparams_setter)

    if args.weight_reward_loss > 0:
        if args.weight_reward_loss_schedule == 'linear':
            lambda_setter = ScheduledHyperParamSetter(
                model.params, 'lam',
                [(args.reward_loss_delay, 1.0),
                 (args.epochs, 1.0 - args.weight_reward_loss)], True)
        elif args.weight_reward_loss_schedule == 'const':
            lambda_setter = ScheduledHyperParamSetter(
                model.params, 'lam',
                [(args.reward_loss_delay + 1, 1.0 - args.weight_reward_loss)],
                False)
        else:
            raise ValueError('unknown schedule: {!r}'.format(
                args.weight_reward_loss_schedule))

        hparams_setter.append(lambda_setter)

    train_start = time.time()
    estimator.train(train_input_fn,
                    hooks=train_hooks,
                    saving_listeners=[ckpt_listener])
    train_end = time.time()

    time_d = datetime.timedelta(seconds=int(train_end - train_start))
    LOG.info('Training for %d epochs finished in %s', args.epochs, time_d)
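# FeedableTrainOpsHook and WithRewardTrainOpsHook above are project-specific
# hooks and are not shown here. A minimal sketch of the underlying idea (a
# SessionRunHook that runs a given train op a fixed number of times per
# training step, similar in spirit to tf.contrib.gan's RunTrainOpsHook):

class SimpleTrainOpsHook(tf.train.SessionRunHook):
    """Runs `train_op` `train_steps` times before each regular session run."""

    def __init__(self, train_op, train_steps):
        self._train_op = train_op
        self._train_steps = train_steps

    def before_run(self, run_context):
        for _ in range(self._train_steps):
            run_context.session.run(self._train_op)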
Exemple #23
0
def train():
    """Trains the model."""
    if args.verbose:
        tf.logging.set_verbosity(tf.logging.INFO)

    files = pc_io.get_files(args.train_glob)
    points = pc_io.load_points(files)
    files_cat = np.array(
        [os.path.split(os.path.split(x)[0])[1] for x in files])
    for cat in files_cat:
        assert (cat == 'train') or (cat == 'eval')
    TRAIN_DATASET = points[files_cat == 'train']
    EVAL_DATASET = points[files_cat == 'eval']
    assert (len(TRAIN_DATASET) + len(EVAL_DATASET) == len(points))

    config = tf.estimator.RunConfig(
        keep_checkpoint_every_n_hours=1,
        save_checkpoints_secs=args.save_checkpoints_secs,  # 600
        keep_checkpoint_max=args.keep_checkpoint_max,  # 50
        log_step_count_steps=args.log_step_count_steps,  # 100
        save_summary_steps=args.save_summary_steps,  # 100
        tf_random_seed=42)

    estimator = tf.estimator.Estimator(
        model_fn=compression_model_2048.model_fn,
        model_dir=args.checkpoint_dir,
        config=config,
        params={
            'num_points': args.num_point,
            'batch_size': args.batch_size,
            'knn': args.knn,
            'alpha': args.alpha,
            'gamma': args.gamma,
            'lmbda': args.lmbda,
            'additional_metrics': not args.no_additional_metrics,
            'checkpoint_dir': args.checkpoint_dir,
            'data_format': DATA_FORMAT  # channels_first
        })

    hooks = None
    if args.debug_address is not None:
        hooks = [tf_debug.TensorBoardDebugHook(args.debug_address)]

    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: compression_model_2048.input_fn(
            TRAIN_DATASET,
            args.batch_size,
            args.preprocess_threads,
            prefetch_size=args.prefetch_size),
        max_steps=args.max_steps,
        hooks=hooks)

    val_spec = tf.estimator.EvalSpec(
        input_fn=lambda: compression_model_2048.input_fn(
            EVAL_DATASET,
            args.batch_size,
            args.preprocess_threads,
            repeat=False,
            prefetch_size=args.prefetch_size),
        steps=None,
        hooks=hooks)
    #
    tf.estimator.train_and_evaluate(estimator, train_spec, val_spec)
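# The args.* values above come from the script's argument parser, which is not
# shown. A hypothetical sketch of how the debugger flag used above might be
# declared (flag name taken from args.debug_address):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--debug_address', default=None,
    help='host:port of a TensorBoard Debugger instance, e.g. localhost:6064; '
         'when set, a TensorBoardDebugHook is attached to training and eval')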
Exemple #24
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tf_run_config = tf.estimator.RunConfig(
        model_dir=configDir["model_dir"],
        tf_random_seed=None,
        save_summary_steps=configDir["save_summary_steps"],
        save_checkpoints_steps=configDir["save_checkpoints_steps"],
        session_config=None,
        keep_checkpoint_max=configDir["keep_checkpoint_max"],
        log_step_count_steps=configDir["print_loss_steps"],
        train_distribute=None,
        device_fn=None)

    num_train_steps = None
    num_warmup_steps = None
    if configDir["do_train"]:
        train_examples = len(os.listdir(configDir["train_input"])) * 1000
        num_train_steps = int(train_examples / configDir["train_batch_size"] *
                              configDir["num_train_epochs"])
        num_warmup_steps = int(num_train_steps *
                               configDir["warmup_proportion"])

    model_fn = model_build.model_fn_builder(
        config=configDir,
        model_config=model_config,
        learning_rate=configDir["learning_rate"],
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=tf_run_config,
        params={
            "train_batch_size": configDir["train_batch_size"],
            "eval_batch_size": configDir["val_batch_size"],
            "predict_batch_size": configDir["test_batch_size"]
        },  # params are forwarded to both model_fn and input_fn
        warm_start_from=None,
    )

    # Whether to export an inference SavedModel for TF Serving.
    if configDir["save_predict_model_for_tfServing"] == 1:
        serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
            {
                "unid": tf.FixedLenFeature([], tf.int64),
                "image/encoded": tf.FixedLenFeature([], tf.string),
                "label": tf.FixedLenFeature([], tf.int64),
            })
        estimator.export_savedmodel(configDir["TFServing_model_path"],
                                    serving_input_receiver_fn,
                                    strip_default_attrs=True)
        return 0

    if configDir["do_train"]:
        trainHookLt = []
        evalHookLt = []
        if configDir["debug"]:
            debug_config = configDir["debug_config"]
            if debug_config["tfdbg"]:
                trainHookLt.append(tfdbg.LocalCLIDebugHook())
            elif configDir["tfdbgtensorboard"]:
                trainHookLt.append(
                    tfdbg.TensorBoardDebugHook(
                        grpc_debug_server_addresses="localhost:11111"))

        if configDir["file_base"]:
            train_input_fn = model_input.file_based_input_fn_builder(
                input_file=configDir["train_input"],
                is_training=True,
                drop_remainder=True,
                batch="train_batch_size")

            val_input_fn = model_input.file_based_input_fn_builder(
                input_file=configDir["val_input"],
                is_training=False,
                drop_remainder=True,
                batch="eval_batch_size")
        else:
            augment_fn = CIFAR10Policy()
            train_genter_fn = model_input.get_generator_fn(
                configDir, configDir["train_input"], True, augment_fn)
            train_input_fn = model_input.input_fn_builder(
                configDir, train_genter_fn, True, True, "train_batch_size")

            # input_files = os.listdir(os.path.join(configDir["DP"], "test"))
            val_genter_fn = model_input.get_generator_fn(
                configDir, configDir["val_input"], False)
            val_input_fn = model_input.input_fn_builder(
                configDir, val_genter_fn, False, True, "eval_batch_size")

        trainSpec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                           max_steps=num_train_steps,
                                           hooks=trainHookLt)
        valSpec = tf.estimator.EvalSpec(
            input_fn=val_input_fn,
            steps=configDir["trainStepVal"],
            throttle_secs=configDir["throttle_secs"],
            hooks=evalHookLt)
        tf.estimator.train_and_evaluate(estimator=estimator,
                                        train_spec=trainSpec,
                                        eval_spec=valSpec)

    if configDir["do_test"]:

        tf.logging.info("***** Running predictions *****")
        tf.logging.info("  Batch size = %d", configDir["test_batch_size"])

        # input_files = os.listdir(os.path.join(configDir["DP"], "test"))
        if configDir["file_base"]:
            predict_input_fn = model_input.file_based_input_fn_builder(
                input_file=configDir["predict_input"],
                is_training=False,
                drop_remainder=True,
                batch="predict_batch_size")
        else:
            predict_genter_fn = model_input.get_generator_fn(
                configDir, configDir["predict_input"], False)
            predict_input_fn = model_input.input_fn_builder(
                configDir, predict_genter_fn, False, True,
                "predict_batch_size")

        wf = open(configDir["test_res_output"], "w", encoding="utf-8")
        for mm, result in enumerate(
                estimator.predict(
                    predict_input_fn,
                    yield_single_examples=True,
                    hooks=[
                        # tfdbg.LocalCLIDebugHook(),
                        # tfdbg.TensorBoardDebugHook(grpc_debug_server_addresses="localhost:11111"),
                    ])):

            tf.logging.info("Processing example: %d" % (mm))
            #------------ temporary code ------------#
            if mm == 10:
                break
            #------------ temporary code ------------#
            example_id = result["unique_ids"]
            predict = result["predict"]
            label = result["label"]
            category_probility = "_".join(
                [str(i) for i in result["category_probility"].tolist()])
            path = result["path"].decode('utf-8')
            wf.write("{}\t{}\t{}\t{}\t{}\n".format(example_id, predict, label,
                                                   category_probility, path))
            if configDir["do_save_conv_image"]:
                conv_image = filter_conv_image(result)
                if not os.path.exists(configDir["conv_image_path"]):
                    os.makedirs(configDir["conv_image_path"])
                numpy_path = os.path.join(configDir["conv_image_path"],
                                          os.path.basename(path)[:-4])
                np.savez(numpy_path, **conv_image)
            if configDir["grad_cam"]:
                cam = result["cam"]
                image = result["image"]

                cam = cam / np.max(cam)
                cam = cv2.resize(cam,
                                 (configDir["resize"], configDir["resize"]))
                image = image / 255

                # Superimposing the visualization with the image.
                show_cam_on_image(image, cam)

        wf.close()
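# The SavedModel exported above was built with build_parsing_serving_input_receiver_fn,
# so it expects serialized tf.Example protos. A minimal sketch of loading it locally
# with TF 1.x for a quick sanity check; export_dir and raw_image_bytes are
# placeholders used only for illustration:

def load_exported_model_for_sanity_check(export_dir, raw_image_bytes):
    """Load the exported SavedModel and run a single serialized example through it."""
    from tensorflow.contrib import predictor

    predict_fn = predictor.from_saved_model(export_dir)
    # For a parsing serving input receiver the input key is typically 'examples';
    # predict_fn.feed_tensors shows the exact expected inputs.
    example = tf.train.Example(features=tf.train.Features(feature={
        'unid': tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
        'image/encoded': tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_image_bytes])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
    }))
    return predict_fn({'examples': [example.SerializeToString()]})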
Exemple #25
0
def train(train_model, eval_model=None, hvd=None, debug_port=None):
    if eval_model is not None and 'eval_steps' not in eval_model.params:
        raise ValueError("eval_steps parameter has to be specified "
                         "if eval_model is provided")

    if hvd:
        master_worker = hvd.rank() == 0
    else:
        master_worker = True

    # initializing session parameters
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    if hvd is not None:
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    # defining necessary hooks
    hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
    if hvd is not None:
        hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if master_worker:
        checkpoint_dir = train_model.params['logdir']
    else:
        checkpoint_dir = None

    if master_worker:
        if train_model.params['save_checkpoint_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir,
                    save_steps=train_model.params['save_checkpoint_steps']))
        if train_model.params['print_loss_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintLossAndTimeHook(
                    every_steps=train_model.params['print_loss_steps'],
                    model=train_model,
                ))
        if train_model.params['print_samples_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintSamplesHook(
                    every_steps=train_model.params['print_samples_steps'],
                    model=train_model,
                ))

    if eval_model is not None:
        # noinspection PyTypeChecker
        hooks.append(
            RunEvaluationHook(
                every_steps=eval_model.params['eval_steps'],
                model=eval_model,
                last_step=train_model.last_step,
            ), )
    total_time = 0.0
    bench_start = train_model.params.get('bench_start', 10)

    if debug_port:
        hooks.append(
            tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port)))

    # starting training
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=checkpoint_dir,
            save_summaries_steps=train_model.params['save_summaries_steps'],
            config=sess_config,
            save_checkpoint_secs=None,
            log_step_count_steps=train_model.params['save_summaries_steps'],
            stop_grace_period_secs=300,
            hooks=hooks,
    ) as sess:
        for step, feed_dict in enumerate(
                train_model.data_layer.iterate_forever()):
            if sess.should_stop():
                break
            tm = time.time()
            sess.run(fetches=train_model.train_op, feed_dict=feed_dict)
            if step >= bench_start:
                total_time += time.time() - tm

    if hvd is not None:
        deco_print("Finished training on rank {}".format(hvd.rank()))
    else:
        deco_print("Finished training")

    if step > bench_start:
        deco_print("Avg time per step: {:.3}s".format(1.0 * total_time /
                                                      (step - bench_start)))
    else:
        deco_print("Not enough steps for benchmarking")
Exemple #26
0
def main(argv=None):

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    available_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
    num_gpus = len(available_gpus)
    print("num_gpus : ", num_gpus, available_gpus)

    with tf.Graph().as_default():

        # Get Network class and Optimizer
        global_step = tf.train.get_or_create_global_step()

        # Learning rate decay
        if "SynthText" in FLAGS.train_path:
            boundaries = [40000, 60000]
        else:
            boundaries = [4000, 8000]

        values = [FLAGS.learning_rate / pow(10, i) for i in range(3)]
        learning_rate = tf.train.piecewise_constant(global_step, boundaries,
                                                    values)
        tf.summary.scalar('learning_rate', learning_rate)

        Hmean = tf.Variable(0.0, trainable=False, name='hmean')
        tf.summary.scalar("Hmean", Hmean)

        optimizers = []
        net = RetinaNet(FLAGS.backbone)

        # Multi gpu training code (Define graph)
        tower_grads = []
        tower_extra_update_ops = []
        #tower_train_errs = []
        tower_loc_losses = []
        tower_cls_losses = []
        input_features = net.get_input(is_train=True, num_gpus=num_gpus)

        for gpu_indx in range(num_gpus):
            tower_output = _single_tower(net, gpu_indx,
                                         input_features[gpu_indx],
                                         learning_rate)
            tower_grads.append(
                [x for x in tower_output.grads if x[0] is not None])
            tower_extra_update_ops.append(tower_output.extra_update_ops)
            # tower_train_errs.append(tower_output.error)
            tower_loc_losses.append(tower_output.loc_loss)
            tower_cls_losses.append(tower_output.cls_loss)
            optimizers.append(tower_output.optimizer)

        if FLAGS.use_validation:
            valid_input_feature = net.get_input(is_train=False, num_gpus=1)

            # single gpu validation
            valid_tower_output = _single_tower(net,
                                               FLAGS.valid_device,
                                               valid_input_feature[0],
                                               name='valid')
            tf.summary.scalar("valid_loc_losses", valid_tower_output.loc_loss)
            tf.summary.scalar("valid_cls_losses", valid_tower_output.cls_loss)

        # Merge results
        loc_losses = tf.reduce_mean(tower_loc_losses)
        cls_losses = tf.reduce_mean(tower_cls_losses)
        grads = allreduce_grads(tower_grads)
        train_ops = []

        tf.summary.scalar("train_loc_losses", loc_losses)
        tf.summary.scalar("train_cls_losses", cls_losses)
        tf.summary.image("train_img", input_features[0].image)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())
        train_ops.append(variables_averages_op)

        # Apply the gradients
        for idx, grad_and_vars in enumerate(grads):
            with tf.name_scope('apply_gradients'), tf.device(
                    tf.DeviceSpec(device_type="GPU", device_index=idx)):
                # apply_gradients may create variables. Make them LOCAL_VARIABLES
                from tensorpack.graph_builder.utils import override_to_local_variable
                with override_to_local_variable(enable=idx > 0):
                    train_ops.append(optimizers[idx].apply_gradients(
                        grad_and_vars,
                        name='apply_grad_{}'.format(idx),
                        global_step=(global_step if idx == 0 else None)))

        with tf.control_dependencies(tower_extra_update_ops[-1]):
            train_op = tf.group(*train_ops, name='train_op')

        # Summary
        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(
            [s for s in summaries if 'valid_' not in s.name])

        if FLAGS.use_validation:
            valid_summary_op = tf.summary.merge(
                [s for s in summaries if 'valid_' in s.name])
            valid_summary_writer = tf.summary.FileWriter(
                os.path.join(FLAGS.output, FLAGS.valid_dataset))
        '''
        # Print network structure
        if not os.path.exists(FLAGS.output):
            os.makedirs(os.path.join(FLAGS.output,'best_models'), exist_ok=True)
        param_stats = tf.profiler.profile(tf.get_default_graph())
        sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

        train_info = open(os.path.join(FLAGS.output, 'train_info.txt'),'w')
        train_info.write('total_params: %d\n' % param_stats.total_parameters)
        train_info.write(str(FLAGS.flag_values_dict()))
        train_info.close()
        '''
        # Print configuration
        pprint(FLAGS.flag_values_dict())

        # Define config, init_op, scaffold
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        pretrain_op = load_pytorch_weight(FLAGS.use_bn, net.use_se_block)
        sync_op = _get_post_init_ops()

        # only save global variables
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
        scaffold = tf.train.Scaffold(saver=saver,
                                     init_op=init_op,
                                     summary_op=summary_op,
                                     init_fn=_get_init_pretrained())
        valid_saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
        best_valid_loss = 1e9
        best_valid_acc = -1

        # Define several hooks
        hooks = []
        if FLAGS.use_profile:
            profiler_hook = tf.train.ProfilerHook(save_steps=FLAGS.valid_steps,
                                                  output_dir=FLAGS.output)
            hooks.append(profiler_hook)

        if FLAGS.use_debug:
            from tensorflow.python import debug as tf_debug
            # CLI Debugger
            #            cli_debug_hook = tf_debug.LocalCLIDebugHook()
            #            hooks.append(cli_debug_hook)

            # Tensorboard Debugger
            tfb_debug_hook = tf_debug.TensorBoardDebugHook("127.0.0.1:9900")
            #tfb_debug_hook = tf_debug.TensorBoardDebugHook("a476cc765f91:6007")
            hooks.append(tfb_debug_hook)
        hooks = None if len(hooks) == 0 else hooks

        reset_global_step = tf.assign(global_step, 0)

        pEval = None

        print("---------- session start")
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.output,
                scaffold=scaffold,
                hooks=hooks,
                config=session_config,
                save_checkpoint_steps=FLAGS.valid_steps,
                save_checkpoint_secs=None,
                save_summaries_steps=FLAGS.summary_steps,
                save_summaries_secs=None,
        ) as sess:
            print("---------- open MonitoredTrainingSession")

            if "ICDAR2015" in FLAGS.train_path:
                sess.run(reset_global_step)

            _step = sess.run(global_step)

            if "SynthText" in FLAGS.train_path:
                print("---------- run pretrain op")
                sess.run(pretrain_op)

            print("---------- run sync op")
            sess.run(sync_op)

            print("---------- start training, step=", _step)

            while _step < FLAGS.max_num_steps:
                if sess.should_stop():
                    print("Done! ", _step)
                    break

                # Training
                [step_loc_loss, step_cls_loss, _, _step
                 ] = sess.run([loc_losses, cls_losses, train_op, global_step])

                print(
                    'STEP : %d\tTRAIN_TOTAL_LOSS : %.8f\tTRAIN_LOC_LOSS : %.8f\tTRAIN_CLS_LOSS : %.5f'
                    % (_step, step_loc_loss + step_cls_loss, step_loc_loss,
                       step_cls_loss),
                    end='\r')

                if _step % 50 == 0:
                    print(
                        'STEP : %d\tTRAIN_TOTAL_LOSS : %.8f\tTRAIN_LOC_LOSS : %.8f\tTRAIN_CLS_LOSS : %.5f'
                        % (_step, step_loc_loss + step_cls_loss, step_loc_loss,
                           step_cls_loss))

                # Periodic synchronization
                if _step % 1000 == 0:
                    sess.run(sync_op)

                # Validation Err
                if FLAGS.use_validation:
                    [valid_step_loc_loss, valid_step_cls_loss,
                     valid_summary] = sess.run([
                         valid_tower_output.loc_loss,
                         valid_tower_output.cls_loss, valid_summary_op
                     ])
                    if valid_summary_writer is not None:
                        valid_summary_writer.add_summary(valid_summary, _step)

                    print(
                        'STEP : %d\tVALID_TOTAL_LOSS : %.8f\tVALID_LOC_LOSS : %.8f\tVALID_CLS_LOSS : %.5f'
                        % (_step, valid_step_loc_loss + valid_step_cls_loss,
                           valid_step_loc_loss, valid_step_cls_loss))
                    print('=' * 70)

                # Evaluation on ICDAR2015
                if FLAGS.use_evaluation and _step % FLAGS.valid_steps == 0:
                    if "ICDAR2015" in FLAGS.train_path:
                        # reset global step -> scaffold auto save is not working!
                        saver.save(_get_session(sess),
                                   os.path.join(FLAGS.output, 'model.ckpt'),
                                   global_step=_step)

                    try:
                        if pEval is None:
                            print(
                                "Evaluation started at iteration {} on IC15..."
                                .format(_step))
                            eval_cmd = "CUDA_VISIBLE_DEVICES=" + str(FLAGS.valid_device) + \
                                " python test.py" + \
                                " --tune_from=" + os.path.join(FLAGS.output, 'model.ckpt-') + str(_step) + \
                                " --input_size=1024" + \
                                " --output_zip=result_" + FLAGS.test + \
                                " --test=" + FLAGS.test + \
                                " --nms_thresh=0.25"

                            print(eval_cmd)
                            pEval = Popen(eval_cmd,
                                          shell=True,
                                          stdout=PIPE,
                                          stderr=PIPE)

                        elif pEval.poll() is not None:
                            (scorestring, stderrdata) = pEval.communicate()

                            hmean = float(
                                str(scorestring).strip().split(":")[3].split(
                                    ",")[0].split("}")[0].strip())

                            if hmean > best_valid_acc:
                                best_valid_acc = hmean
                                best_model_dir = os.path.join(
                                    FLAGS.output, 'best_models')
                                valid_saver.save(
                                    _get_session(sess),
                                    os.path.join(best_model_dir,
                                                 'model_%.2f' % (hmean * 100)),
                                    global_step=_step)

                            print("test_hmean for {}-th iter : {:.4f}".format(
                                _step, hmean))
                            sess.run(tf.assign(Hmean, hmean))

                            if pEval is not None:
                                pEval.kill()
                            pEval = None

                    except Exception as e:
                        print("exception happened in evaluation ", e)
                        if pEval is not None:
                            pEval.kill()
                        pEval = None
Exemple #27
0
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    args = parse_arguments()

    # graph_data = load_data_node2vec()
    graph_data = load_data_node2vec(args.data_dir)

    vertex_embedding_params = {
        'embedding_dim': args.embedding_dim,
        'embedding_trainable': False,
        'embedding_checkpoint': tf.train.latest_checkpoint(args.embedding_dir),
    }

    model = make_multilabel_logistic_regression(
        label_task_weight=1.0,
        regularization=args.global_regularization,
        global_optimizer=make_optimizer(args),
        polyak=False)
    hooks = [
        tf.train.LoggingTensorHook(
            {
                'kappa_insample': 'kappa_insample_batch/value',
                'kappa_outsample': 'kappa_outsample_batch/value'
            },
            every_n_secs=30)
    ]

    node_classifier = tf.estimator.Estimator(
        model_fn=model,
        params={
            **vertex_embedding_params, 'num_vertices': graph_data.num_vertices,
            'n_labels': graph_data.num_labels,
            'batch_size': args.batch_size
        },
        model_dir=args.train_dir)

    if args.profile:
        hooks.append(tf.train.ProfilerHook(save_secs=30))

    if args.debug:
        from tensorflow.python import debug as tfdbg
        hooks.append(tfdbg.TensorBoardDebugHook('localhost:6004'))

    # train model
    dataset_fn_train = get_dataset_fn(args.sampler, args)

    node_classifier.train(input_fn=make_input_fn(graph_data, args,
                                                 dataset_fn_train),
                          max_steps=args.max_steps_logistic,
                          hooks=hooks)

    pred_features = {
        'vertex_index':
        np.expand_dims(np.array(range(graph_data.num_vertices)), 1)
    }

    def make_pred_dataset():
        dataset = tf.data.Dataset.from_tensor_slices(pred_features)
        return dataset

    print('======= Computing Predictions for logistic regression ========')
    predictions = node_classifier.predict(input_fn=make_pred_dataset,
                                          yield_single_examples=False)

    # get test set
    rng = np.random.RandomState(args.seed)
    in_train = rng.binomial(1,
                            1 - args.proportion_censored,
                            size=graph_data.num_vertices).astype(np.int32)
    in_test = np.logical_not(in_train)

    pred_prob_list = []
    for prediction in predictions:
        pred_prob_list += [prediction['probabilities']]
    pred_probs = np.concatenate(pred_prob_list)

    num_labels = graph_data.labels.shape[1]
    classes = np.array(range(num_labels))

    top_k_list = list(np.sum(graph_data.labels[in_test], 1).astype(np.int))
    pred_labels = predict(pred_probs[in_test], classes, top_k_list)

    mlb = MultiLabelBinarizer(classes)
    pred_labels = mlb.fit_transform(pred_labels)

    print('======= Result for logistic regression ========')
    f1_macro = f1_score(graph_data.labels[in_test],
                        pred_labels,
                        average='macro')
    f1_micro = f1_score(graph_data.labels[in_test],
                        pred_labels,
                        average='micro')
    print("f1_macro: {}".format(f1_macro))
    print("f1_micro: {}".format(f1_micro))

    # test model
    dataset_fn_test = get_dataset_fn(
        args.sampler_test if args.sampler_test is not None else args.sampler,
        args)

    node_classifier.evaluate(input_fn=make_input_fn(graph_data, args,
                                                    dataset_fn_test, 1000),
                             hooks=hooks)
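# The predict() helper used above is not part of this listing. A minimal
# sketch, assuming it returns, for each node, the k most probable class labels
# (k taken from top_k_list, i.e. the node's true number of labels):

def predict(probs, classes, top_k_list):
    pred_labels = []
    for prob_row, k in zip(probs, top_k_list):
        top_k_idx = np.argsort(prob_row)[-k:]  # indices of the k largest probabilities
        pred_labels.append(classes[top_k_idx].tolist())
    return pred_labels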
Exemple #28
0
def train():
    """Trains the model."""

    if args.verbose:
        tf.logging.set_verbosity(tf.logging.INFO)

    p_min, p_max, dense_tensor_shape = pc_io.get_shape_data(args.resolution)
    files = pc_io.get_files(args.train_glob)
    points = pc_io.load_points(files, p_min, p_max)

    files_cat = np.array(
        [os.path.split(os.path.split(x)[0])[1] for x in files])
    for cat in files_cat:
        assert (cat == 'train') or (cat == 'test')
    points_train = points[files_cat == 'train']
    points_test = points[files_cat == 'test']

    assert (len(points_train) + len(points_test) == len(points))

    config = tf.estimator.RunConfig(
        keep_checkpoint_every_n_hours=1,
        save_checkpoints_secs=args.save_checkpoints_secs,
        keep_checkpoint_max=args.keep_checkpoint_max,
        log_step_count_steps=args.log_step_count_steps,
        save_summary_steps=args.save_summary_steps,
        tf_random_seed=42)
    estimator = tf.estimator.Estimator(model_fn=compression_model.model_fn,
                                       model_dir=args.checkpoint_dir,
                                       config=config,
                                       params={
                                           'num_filters': args.num_filters,
                                           'alpha': args.alpha,
                                           'gamma': args.gamma,
                                           'lmbda': args.lmbda,
                                           'additional_metrics':
                                           not args.no_additional_metrics,
                                           'checkpoint_dir':
                                           args.checkpoint_dir,
                                           'data_format': DATA_FORMAT
                                       })

    hooks = None
    if args.debug_address is not None:
        hooks = [tf_debug.TensorBoardDebugHook(args.debug_address)]

    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: compression_model.input_fn(points_train,
                                                    args.batch_size,
                                                    dense_tensor_shape,
                                                    args.preprocess_threads,
                                                    prefetch_size=args.
                                                    prefetch_size),
        max_steps=args.max_steps,
        hooks=hooks)
    val_spec = tf.estimator.EvalSpec(
        input_fn=lambda: compression_model.input_fn(points_test,
                                                    args.batch_size,
                                                    dense_tensor_shape,
                                                    args.preprocess_threads,
                                                    repeat=False,
                                                    prefetch_size=args.
                                                    prefetch_size),
        steps=None,
        hooks=hooks)

    tf.estimator.train_and_evaluate(estimator, train_spec, val_spec)
Exemple #29
0
def train(train_model, eval_model=None, debug_port=None, custom_hooks=None):
  if eval_model is not None and 'eval_steps' not in eval_model.params:
    raise ValueError("eval_steps parameter has to be specified "
                     "if eval_model is provided")
  hvd = train_model.hvd
  if hvd:
    master_worker = hvd.rank() == 0
  else:
    master_worker = True

  # initializing session parameters
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  # pylint: disable=no-member
  sess_config.gpu_options.allow_growth = True
  if hvd is not None:
    # pylint: disable=no-member
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

  if train_model.params.get('use_xla_jit', False):
    sess_config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)

  # defining necessary hooks
  hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
  if custom_hooks:
    for custom_hook in custom_hooks:
      hooks.append(custom_hook(train_model=train_model, eval_model=eval_model))

  if hvd is not None:
    hooks.append(BroadcastGlobalVariablesHook(0))

  if master_worker:
    checkpoint_dir = train_model.params['logdir']
    load_model_dir = train_model.params['load_model']
  else:
    checkpoint_dir = None
    load_model_dir = None

  if eval_model is not None:
    # noinspection PyTypeChecker
    hooks.append(
        RunEvaluationHook(
            every_steps=eval_model.params['eval_steps'],
            model=eval_model,
            last_step=train_model.last_step,
            print_ppl=isinstance(eval_model.get_data_layer(), WKTDataLayer),
        ),
    )

  if master_worker:
    if train_model.params['save_checkpoint_steps'] is not None:
      # noinspection PyTypeChecker
      saver = tf.train.Saver(
          save_relative_paths=True,
          max_to_keep=train_model.params['num_checkpoints']
      )
      hooks.append(tf.train.CheckpointSaverHook(
          checkpoint_dir,
          saver=saver,
          save_steps=train_model.params['save_checkpoint_steps'],
      ))
    if train_model.params['print_loss_steps'] is not None:
      # noinspection PyTypeChecker
      hooks.append(PrintLossAndTimeHook(
          every_steps=train_model.params['print_loss_steps'],
          model=train_model,
          print_ppl=isinstance(train_model.get_data_layer(), WKTDataLayer),
      ))
    if train_model.params['print_samples_steps'] is not None:
      # noinspection PyTypeChecker
      hooks.append(PrintSamplesHook(
          every_steps=train_model.params['print_samples_steps'],
          model=train_model,
      ))

  total_time = 0.0
  bench_start = train_model.params.get('bench_start', 10)

  if debug_port:
    hooks.append(
        tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port))
    )

  if train_model.on_horovod:
    init_data_layer = train_model.get_data_layer().iterator.initializer
  else:
    init_data_layer = tf.group(
        [train_model.get_data_layer(i).iterator.initializer
         for i in range(train_model.num_gpus)]
    )

  # We restore only if the user provides load_model_dir. load_model_dir is the
  # directory containing the checkpoint we want to load partial or all weights
  # from. Useful for transfer learning or if we do not want to overwrite our
  # checkpoint.
  restoring = load_model_dir and not tf.train.latest_checkpoint(checkpoint_dir)
  if restoring:
    vars_in_checkpoint = {}
    for var_name, var_shape in tf.train.list_variables(load_model_dir):
        vars_in_checkpoint[var_name] = var_shape

    print('VARS_IN_CHECKPOINT:')
    print(vars_in_checkpoint)

    vars_to_load = []
    for var in tf.global_variables():
      var_name = var.name.split(':')[0]
      if var_name in vars_in_checkpoint:
        if var.shape == vars_in_checkpoint[var_name] and 'global_step' not in var_name:
          vars_to_load.append(var)

    print('VARS_TO_LOAD:')
    for var in vars_to_load:
        print(var)

    load_model_fn = tf.contrib.framework.assign_from_checkpoint_fn(
        tf.train.latest_checkpoint(load_model_dir), vars_to_load
    )
    scaffold = tf.train.Scaffold(
        local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer),
        init_fn = lambda scaffold_self, sess: load_model_fn(sess)
    )

  else:
    scaffold = tf.train.Scaffold(
        local_init_op=tf.group(tf.local_variables_initializer(), init_data_layer)
    )
  fetches = [train_model.train_op]
  try:
    total_objects = 0.0
    # on horovod num_gpus is 1
    for worker_id in range(train_model.num_gpus):
      fetches.append(train_model.get_num_objects_per_step(worker_id))
  except NotImplementedError:
    deco_print("WARNING: Can't compute number of objects per step, since "
               "train model does not define get_num_objects_per_step method.")

  # starting training
  sess = tf.train.MonitoredTrainingSession(
      scaffold=scaffold,
      checkpoint_dir=checkpoint_dir,
      save_summaries_steps=train_model.params['save_summaries_steps'],
      config=sess_config,
      save_checkpoint_secs=None,
      log_step_count_steps=train_model.params['save_summaries_steps'],
      stop_grace_period_secs=300,
      hooks=hooks)
  step = 0
  num_bench_updates = 0
  while True:
    if sess.should_stop():
      break
    tm = time.time()
    try:
      feed_dict = {}
      iter_size = train_model.params.get('iter_size', 1)
      if iter_size > 1:
        feed_dict[train_model.skip_update_ph] = step % iter_size != 0
      if step % iter_size == 0:
        if step >= bench_start:
          num_bench_updates += 1
        fetches_vals = sess.run(fetches, feed_dict)
      else:
        # necessary to skip "no-update" steps when iter_size > 1
        def run_with_no_hooks(step_context):
          return step_context.session.run(fetches, feed_dict)
        fetches_vals = sess.run_step_fn(run_with_no_hooks)
    except tf.errors.OutOfRangeError:
      break
    if step >= bench_start:
      total_time += time.time() - tm
      if len(fetches) > 1:
        for i in range(train_model.num_gpus):
          total_objects += np.sum(fetches_vals[i + 1])
        if train_model.params['print_bench_info_steps'] is not None:
          if step % train_model.params['print_bench_info_steps'] == 0:
            total_objects_cur = collect_if_horovod(total_objects, hvd,
                                                   mode="sum")
            if master_worker:
              avg_objects = 1.0 * total_objects_cur / total_time
              deco_print("Avg objects per second: {:.3f}".format(avg_objects))

    step += 1
  sess.close()

  if len(fetches) > 1:
    total_objects = collect_if_horovod(total_objects, hvd, mode="sum")

  if master_worker:
    deco_print("Finished training")
    if step > bench_start:
      avg_time = 1.0 * total_time / num_bench_updates
      deco_print("Avg time per step: {:.3f}s".format(avg_time))
      if len(fetches) > 1:
        avg_objects = 1.0 * total_objects / total_time
        deco_print("Avg objects per second: {:.3f}".format(avg_objects))
    else:
      deco_print("Not enough steps for benchmarking")
Example #30
0
def train(train_model, eval_model=None, debug_port=None):
    if eval_model is not None and 'eval_steps' not in eval_model.params:
        raise ValueError("eval_steps parameter has to be specified "
                         "if eval_model is provided")
    hvd = train_model.hvd
    if hvd:
        master_worker = hvd.rank() == 0
    else:
        master_worker = True

    # initializing session parameters
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    if hvd is not None:
        sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    # defining necessary hooks
    hooks = [tf.train.StopAtStepHook(last_step=train_model.last_step)]
    if hvd is not None:
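        # Broadcast rank 0's initial variable values to all other workers so
        # that every Horovod process starts from identical weights.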
        hooks.append(BroadcastGlobalVariablesHook(0))

    if master_worker:
        checkpoint_dir = train_model.params['logdir']
    else:
        checkpoint_dir = None

    if eval_model is not None:
        # noinspection PyTypeChecker
        hooks.append(
            RunEvaluationHook(
                every_steps=eval_model.params['eval_steps'],
                model=eval_model,
                last_step=train_model.last_step,
            ), )

    if master_worker:
        if train_model.params['save_checkpoint_steps'] is not None:
            # noinspection PyTypeChecker
            saver = tf.train.Saver(save_relative_paths=True)
            hooks.append(
                tf.train.CheckpointSaverHook(
                    checkpoint_dir,
                    saver=saver,
                    save_steps=train_model.params['save_checkpoint_steps']), )
        if train_model.params['print_loss_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintLossAndTimeHook(
                    every_steps=train_model.params['print_loss_steps'],
                    model=train_model,
                ))
        if train_model.params['print_samples_steps'] is not None:
            # noinspection PyTypeChecker
            hooks.append(
                PrintSamplesHook(
                    every_steps=train_model.params['print_samples_steps'],
                    model=train_model,
                ))

    total_time = 0.0
    bench_start = train_model.params.get('bench_start', 10)
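    # The first bench_start steps are treated as warm-up and are excluded from
    # the timing and throughput statistics reported below.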

    if debug_port:
        hooks.append(
            tf_debug.TensorBoardDebugHook("localhost:{}".format(debug_port)))

    if train_model.on_horovod:
        init_data_layer = train_model.get_data_layer().iterator.initializer
    else:
        init_data_layer = tf.group([
            train_model.get_data_layer(i).iterator.initializer
            for i in range(train_model.num_gpus)
        ])

    scaffold = tf.train.Scaffold(local_init_op=tf.group(
        tf.local_variables_initializer(), init_data_layer))
    fetches = [train_model.train_op]
    try:
        total_objects = 0.0
        # on horovod num_gpus is 1
        for worker_id in range(train_model.num_gpus):
            fetches.append(train_model.get_num_objects_per_step(worker_id))
    except NotImplementedError:
        deco_print(
            "WARNING: Can't compute number of objects per step, since "
            "train model does not define get_num_objects_per_step method.")

    # starting training
    with tf.train.MonitoredTrainingSession(
            scaffold=scaffold,
            checkpoint_dir=checkpoint_dir,
            save_summaries_steps=train_model.params['save_summaries_steps'],
            config=sess_config,
            save_checkpoint_secs=None,
            log_step_count_steps=train_model.params['save_summaries_steps'],
            stop_grace_period_secs=300,
            hooks=hooks,
    ) as sess:
        step = 0
        while True:
            if sess.should_stop():
                break
            tm = time.time()
            try:
                fetches_vals = sess.run(fetches)
            except tf.errors.OutOfRangeError:
                break
            if step >= bench_start:
                total_time += time.time() - tm
                if len(fetches) > 1:
                    for i in range(train_model.num_gpus):
                        total_objects += np.sum(fetches_vals[i + 1])
            step += 1

    if hvd is not None:
        deco_print("Finished training on rank {}".format(hvd.rank()))
    else:
        deco_print("Finished training")

    if train_model.on_horovod:
        ending = " on worker {}".format(hvd.rank())
    else:
        ending = ""
    if step > bench_start:
        deco_print("Avg time per step{}: {:.3f}s".format(
            ending, 1.0 * total_time / (step - bench_start)))
        if len(fetches) > 1:
            deco_print("Avg objects per second{}: {:.3f}".format(
                ending, 1.0 * total_objects / total_time))
    else:
        deco_print("Not enough steps for benchmarking{}".format(ending))