Example #1
0
 def _experiment_fn(run_config, hparams):
     """Assemble a ``tf.contrib.learn.Experiment`` for one training run.

     Args:
       run_config: Run configuration forwarded to the Estimator as ``config``.
       hparams: Object exposing the hyperparameters read below as attributes.

     Returns:
       The Experiment wrapping the Estimator and both input functions. Extra
       keyword arguments are taken from ``experiment_args``, which is defined
       outside this function.
     """
     def train_input():
         # num_epochs can bound training duration when train_steps isn't
         # passed to Experiment.
         return model.input_fn(hparams.train_files,
                               num_epochs=hparams.num_epochs,
                               batch_size=hparams.train_batch_size)

     def eval_input():
         # Evaluation data is deliberately left unshuffled.
         return model.input_fn(hparams.eval_files,
                               batch_size=hparams.eval_batch_size,
                               shuffle=False)

     # Layer sizes decay exponentially with depth, never dropping below 2.
     layer_sizes = [
         max(2, int(hparams.first_layer_size * hparams.scale_factor**depth))
         for depth in range(hparams.num_layers)
     ]
     model_fn = model.generate_model_fn(
         embedding_size=hparams.embedding_size,
         hidden_units=layer_sizes,
         learning_rate=hparams.learning_rate)
     estimator = tf.estimator.Estimator(model_fn, config=run_config)

     return tf.contrib.learn.Experiment(estimator,
                                        train_input_fn=train_input,
                                        eval_input_fn=eval_input,
                                        **experiment_args)
Example #2
0
def run_experiment(hparams):
    """Train and evaluate the model via ``tf.estimator.train_and_evaluate``.

    Args:
      hparams: Object exposing the settings read below as attributes (input
        files, batch sizes, step counts, layer sizing, export format and
        ``job_dir``).
    """
    def train_input():
        return model.input_fn(hparams.train_files,
                              num_epochs=hparams.num_epochs,
                              batch_size=hparams.train_batch_size)

    def eval_input():
        # Evaluation data is read without shuffling.
        return model.input_fn(hparams.eval_files,
                              batch_size=hparams.eval_batch_size,
                              shuffle=False)

    train_spec = tf.estimator.TrainSpec(train_input,
                                        max_steps=hparams.train_steps)

    # The serving function is selected by the requested export format.
    serving_fn = model.SERVING_FUNCTIONS[hparams.export_format]
    final_exporter = tf.estimator.FinalExporter('census', serving_fn)
    eval_spec = tf.estimator.EvalSpec(eval_input,
                                      steps=hparams.eval_steps,
                                      exporters=[final_exporter],
                                      name='census-eval')

    # Layer sizes decay exponentially with depth, never dropping below 2.
    layer_sizes = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**depth))
        for depth in range(hparams.num_layers)
    ]
    model_fn = model.generate_model_fn(
        embedding_size=hparams.embedding_size,
        hidden_units=layer_sizes,
        learning_rate=hparams.learning_rate)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=hparams.job_dir)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #3
0
def train_and_maybe_evaluate(hparams):
  """Train the estimator and evaluate it through the high level API.

  Args:
    hparams: Holds hyperparameters used to train the model as name/value pairs.

  Returns:
    The estimator that was used for training (and maybe eval)
  """
  schema = bookings.read_schema(hparams.schema_file)
  tf_transform_output = tft.TFTransformOutput(hparams.tf_transform_dir)

  def train_input():
    return model.input_fn(hparams.train_files,
                          tf_transform_output,
                          batch_size=TRAIN_BATCH_SIZE)

  def eval_input():
    return model.input_fn(hparams.eval_files,
                          tf_transform_output,
                          batch_size=EVAL_BATCH_SIZE)

  def serving_receiver_fn():
    return model.example_serving_receiver_fn(tf_transform_output, schema)

  train_spec = tf.estimator.TrainSpec(
      train_input, max_steps=hparams.train_steps)

  final_exporter = tf.estimator.FinalExporter('bookings', serving_receiver_fn)
  eval_spec = tf.estimator.EvalSpec(
      eval_input,
      steps=hparams.eval_steps,
      exporters=[final_exporter],
      name='bookings-eval')

  # Checkpoints are written under the serving model directory.
  serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
  run_config = tf.estimator.RunConfig(
      save_checkpoints_steps=999,
      keep_checkpoint_max=1).replace(model_dir=serving_model_dir)

  # Layer sizes decay exponentially with depth, never dropping below 2.
  layer_sizes = [
      max(2, int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**depth))
      for depth in range(NUM_DNN_LAYERS)
  ]
  estimator = model.build_estimator(
      tf_transform_output, hidden_units=layer_sizes, config=run_config)

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  return estimator
Example #4
0
def train_and_maybe_evaluate(train_files, eval_files, hparams):
  """Train the estimator and evaluate it through the high level API.

  Args:
    train_files: Passed to `model.input_fn` as the training input.
    eval_files: Passed to `model.input_fn` as the evaluation input.
    hparams: Holds hyperparameters used to train the model as name/value pairs.

  Returns:
    The estimator that was used for training (and maybe eval)
  """
  schema = taxi.read_schema('schema.pbtxt')
  transform_dir = hparams.tf_transform_dir

  def train_input():
    return model.input_fn(train_files,
                          hparams.tf_transform_dir,
                          batch_size=TRAIN_BATCH_SIZE)

  def eval_input():
    return model.input_fn(eval_files,
                          hparams.tf_transform_dir,
                          batch_size=EVAL_BATCH_SIZE)

  def serving_receiver_fn():
    return model.example_serving_receiver_fn(hparams.tf_transform_dir, schema)

  train_spec = tf.estimator.TrainSpec(
      train_input, max_steps=hparams.train_steps)

  final_exporter = tf.estimator.FinalExporter(
      'chicago-taxi', serving_receiver_fn)
  eval_spec = tf.estimator.EvalSpec(
      eval_input,
      steps=hparams.eval_steps,
      exporters=[final_exporter],
      name='chicago-taxi-eval')

  # Checkpoints are written under the serving model directory.
  serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
  run_config = tf.estimator.RunConfig(
      save_checkpoints_steps=999,
      keep_checkpoint_max=1).replace(model_dir=serving_model_dir)

  # Layer sizes decay exponentially with depth, never dropping below 2.
  layer_sizes = [
      max(2, int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**depth))
      for depth in range(NUM_DNN_LAYERS)
  ]
  estimator = model.build_estimator(
      transform_dir, hidden_units=layer_sizes, config=run_config)

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  return estimator
def predict():
    """Feed input batches through the latest exported model until interrupted.

    Pulls data/label batches from the tensors returned by ``input_fn``, runs
    them through the SavedModel loaded from ``get_latest_export()`` and
    accumulates the results in a ``PredictionStats``. Ctrl+C ends the loop;
    the queue runners and session are then shut down and the statistics
    printed.
    """
    pipeline = input_fn()
    input_tensor = pipeline[0]['input']
    label_tensor = pipeline[1]['labels']

    print("Creating session")
    sess = tf.Session()
    for initializer in (tf.local_variables_initializer,
                        tf.global_variables_initializer):
        sess.run(initializer())

    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(sess, coord)
    stats = PredictionStats()
    try:
        # Loading imports the exported graph into the current session.
        tf.saved_model.loader.load(sess, ['serve'], get_latest_export())
        output_tensor = sess.graph.get_tensor_by_name(
            OUTPUT_TENSOR_NAME + ':0')
        print('Predicting images until Ctrl+C is pressed')
        while True:
            batch, expected = sess.run([input_tensor, label_tensor])
            feed = {INPUT_TENSOR_NAME + ':0': batch}
            predicted = sess.run(output_tensor, feed)
            stats.add_predictions(predicted, expected)
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to leave the loop.
        pass
    finally:
        coord.request_stop()
        coord.join()
        sess.close()
    print()
    stats.print_result()
Example #6
0
def run_experiment(hparams):
  """Train and evaluate the model via ``tf.estimator.train_and_evaluate``.

  Args:
    hparams: Object exposing the settings read below as attributes (input
      files, batch sizes, step counts, layer sizing, export format and
      ``job_dir``).
  """
  def train_input():
    return model.input_fn(hparams.train_files,
                          num_epochs=hparams.num_epochs,
                          batch_size=hparams.train_batch_size)

  def eval_input():
    # Evaluation data is read without shuffling.
    return model.input_fn(hparams.eval_files,
                          batch_size=hparams.eval_batch_size,
                          shuffle=False)

  train_spec = tf.estimator.TrainSpec(train_input,
                                      max_steps=hparams.train_steps)

  # The serving function is selected by the requested export format.
  serving_fn = model.SERVING_FUNCTIONS[hparams.export_format]
  final_exporter = tf.estimator.FinalExporter('census', serving_fn)
  eval_spec = tf.estimator.EvalSpec(eval_input,
                                    steps=hparams.eval_steps,
                                    exporters=[final_exporter],
                                    name='census-eval')

  run_config = tf.estimator.RunConfig().replace(model_dir=hparams.job_dir)
  print('model dir {}'.format(run_config.model_dir))

  # Layer sizes decay exponentially with depth, never dropping below 2.
  layer_sizes = [
      max(2, int(hparams.first_layer_size * hparams.scale_factor**depth))
      for depth in range(hparams.num_layers)
  ]
  estimator = model.build_estimator(embedding_size=hparams.embedding_size,
                                    hidden_units=layer_sizes,
                                    config=run_config)

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #7
0
def train_and_evaluate(args):
    """Train, evaluate and export the model built by ``model.keras_estimator``.

    Args:
      args: (Parsed arguments obj) An object containing all parsed arguments.
    """
    def train_input():
        return model.input_fn(args.train,
                              batch_size=args.batch_size,
                              mode=tf.estimator.ModeKeys.TRAIN)

    def eval_input():
        return model.input_fn(args.test,
                              batch_size=args.batch_size,
                              mode=tf.estimator.ModeKeys.EVAL)

    checkpoint_config = tf.estimator.RunConfig(save_checkpoints_steps=6000)
    estimator = model.keras_estimator(model_dir=args.model_dir,
                                      config=checkpoint_config,
                                      learning_rate=args.learning_rate)

    train_spec = tf.estimator.TrainSpec(input_fn=train_input,
                                        max_steps=args.steps)

    # This path exists when running in the AWS SM container; otherwise fall
    # back to a plain 'exporter' name.
    sagemaker_model_dir = '/opt/ml/model'
    if os.path.exists(sagemaker_model_dir):
        export_name = sagemaker_model_dir
    else:
        export_name = 'exporter'

    latest_exporter = tf.estimator.LatestExporter(export_name,
                                                  model.serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input,
                                      steps=600,
                                      exporters=latest_exporter,
                                      start_delay_secs=10,
                                      throttle_secs=60)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #8
0
def evaluate():
    """Run the model in PREDICT mode over the input pipeline until interrupted.

    Each iteration fetches predictions together with the matching labels,
    records them in a ``PredictionStats`` and prints the running count and
    accuracy. Ctrl+C ends the loop; the queue runners and session are then
    shut down and the final statistics printed.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    features, labels = input_fn()
    spec = model_fn(features, None, tf.estimator.ModeKeys.PREDICT)
    stats = PredictionStats()
    sess = get_session()
    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(sess, coord)
    try:
        fetches = [spec.predictions['predictions'], labels['labels']]
        while True:
            predicted, actual = sess.run(fetches)
            stats.add_predictions(predicted, actual)
            progress = (stats.get_amount(), stats.get_accuracy())
            print('Predictions: %00000d, Accuracy: %.4f' % progress)
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to leave the loop.
        pass
    finally:
        coord.request_stop()
        coord.join()
        sess.close()
    print()
    stats.print_result()
def main(unused_argv):
    """Build the datasets, load the STN module and run training with evaluation.

    Reads its configuration from ``FLAGS`` (``model_dir``, ``data_dir``,
    ``stn_dir``, ``loging_dir``, ``mode``, ``verbose``) and from the
    ``params.json`` file inside ``FLAGS.model_dir``.

    Args:
      unused_argv: Ignored.

    Raises:
      AssertionError: If TensorFlow is older than 2.0, the JSON configuration
        file is missing, or the data directory does not exist.
    """
    # Set the random seed for the whole graph for reproducible experiments
    tf.random.set_seed(230)
    print("TensorFlow version: ", tf.__version__)
    assert version.parse(tf.__version__).release[0] >= 2, \
    "This notebook requires TensorFlow 2.0 or above."
    tf.get_logger().setLevel(logging.ERROR)

    # Query the physical GPUs once and reuse the result below.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    print("Num GPUs Available: ", len(gpus))
    if gpus:
        # Restrict TensorFlow to only use the first GPU
        try:
            tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
            tf.config.experimental.set_memory_growth(gpus[0], True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus),
                  "Logical GPU")
        except RuntimeError as e:
            # Visible devices must be set before GPUs have been initialized
            print(e)

    # NOTE(review): these requirements are registered after main() has
    # started; confirm the flags are still validated as intended.
    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('data_dir')
    flags.mark_flag_as_required('stn_dir')

    # Load the parameters from json file
    json_path = os.path.join(FLAGS.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)

    # Check that the data is available
    assert os.path.exists(FLAGS.data_dir), "No data file found at {}".format(
        FLAGS.data_dir)

    # Create the logging directory (and any missing parents) if needed;
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(FLAGS.loging_dir, exist_ok=True)

    train_data_dir = os.path.join(FLAGS.data_dir, 'train')
    eval_data_dir = os.path.join(FLAGS.data_dir, 'eval')

    # Get the entries of the train and eval directories.
    # NOTE(review): os.listdir returns entries in arbitrary order, while the
    # code below presumes index 0 holds the images and index 1 the force
    # file -- confirm the directory layout guarantees this.
    train_filenames = [
        os.path.join(train_data_dir, f) for f in os.listdir(train_data_dir)
    ]
    eval_filenames = [
        os.path.join(eval_data_dir, f) for f in os.listdir(eval_data_dir)
    ]

    # Get the image lists
    images_list_train = glob.glob(train_filenames[0] + '/*.jpg')
    images_list_eval = glob.glob(eval_filenames[0] + '/*.jpg')

    # Get the label forces
    force_list_train = load_force_txt(train_filenames[1] + '/force.txt',
                                      len(images_list_train))
    force_list_eval = load_force_txt(eval_filenames[1] + '/force.txt',
                                     len(images_list_eval))

    # Specify the sizes of the dataset we train on and evaluate on
    params.train_size = len(images_list_train)
    params.eval_size = len(images_list_eval)

    # Create the two datasets
    print('=================================================')
    print(
        '[INFO] Dataset is built by {0} training images and {1} eval images '.
        format(len(images_list_train), len(images_list_eval)))

    tf.debugging.set_log_device_placement(False)
    train_dataset = input_fn(True,
                             images_list_train,
                             force_list_train,
                             params=params)
    eval_dataset = input_fn(False,
                            images_list_eval,
                            force_list_eval,
                            params=params)
    print('[INFO] Data pipeline is built')

    # Define the model
    print('=================================================')
    print('[INFO] Creating the model...')
    stn_module = tf.keras.models.load_model(FLAGS.stn_dir)
    model_spec = model_fn(FLAGS.mode, params, stn_module)
    if FLAGS.verbose:
        model_spec['model'].summary()

    # Train the model
    print('=================================================')
    train_model = Train_and_Evaluate(model_spec, train_dataset, eval_dataset,
                                     FLAGS.loging_dir)
    train_model.train_and_eval(params)
    print('=================================================')
Example #10
0
File: train.py Project: Irlyue/svhn
 def train_input_fn():
     """Return the shuffled training input built from ``data['train']``."""
     train_arrays = data['train']
     options = {
         'batch_size': config['batch_size'],
         'n_epochs': config['n_epochs'],
         'shuffle': True,
     }
     return input_fn(*train_arrays, **options)
Example #11
0
    raw_images_list_train = glob.glob(train_filenames[0] + '/*.jpg')
    raw_images_list_eval = glob.glob(eval_filenames[0] + '/*.jpg')

    # Specify the sizes of the dataset we train on and evaluate on
    params.train_size = len(aligned_images_list_train)
    params.eval_size = len(aligned_images_list_eval)

    # Create the two iterators over the two datasets
    print('=================================================')
    print(
        '[INFO] Dataset is built by {0} training images and {1} eval images '.
        format(len(aligned_images_list_train), len(aligned_images_list_eval)))

    tf.debugging.set_log_device_placement(args.v)
    train_dataset = input_fn(True,
                             raw_images_list_train,
                             aligned_images_list_train,
                             params=params)
    eval_dataset = input_fn(False,
                            raw_images_list_eval,
                            aligned_images_list_eval,
                            params=params)
    print('[INFO] Data pipeline is built')

    # Define the model
    print('=================================================')
    print('[INFO] Creating the model...')
    model_spec = model_fn(args.mode, params)
    if args.v:
        model_spec['model'].summary()

    # Train the model
Example #12
0
    # Load Vocabularies
    words = tf.contrib.lookup.index_table_from_file(path_words,
                                                    num_oov_buckets=1)

    # Create the input data pipeline
    logging.info("Creating the datasets...")
    train_sentences = load_dataset_from_text(path_train_sentences)

    eval_sentences = load_dataset_from_text(path_eval_sentences)

    # Specify other parameters for the dataset and the model
    params.eval_size = params.dev_size
    params.buffer_size = params.train_size  # buffer size for shuffling
    params.id_pad_word = words.lookup(tf.constant(params.pad_word))

    # Create the two iterators over the two datasets
    train_inputs = input_fn('train', train_sentences, words, params)
    eval_inputs = input_fn('eval', eval_sentences, words, params)
    logging.info("- done.")

    # Define the models (2 different set of nodes that share weights for train and eval)
    logging.info("Creating the model...")
    train_model_spec = model_fn('train', train_inputs, params)
    eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True)
    logging.info("- done.")

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    train_and_evaluate(train_model_spec, eval_model_spec, args.model_dir,
                       params, args.restore_dir)
Example #13
0
def run(cluster_spec, target, is_chief, train_steps, job_dir, train_files,
        eval_files, num_epochs, learning_rate):
    """Train the model in a monitored session, with evaluation on the chief.

    Args:
      cluster_spec: Cluster passed to ``tf.train.replica_device_setter``.
      target: Passed as ``master`` to the monitored training session.
      is_chief: Whether this task is the chief; only the chief gets the
        evaluation and checkpoint-export hooks.
      train_steps: Step count given to ``tf.train.StopAtStepHook``.
      job_dir: Directory used for checkpoints, summaries and exports.
      train_files: Input passed to ``model.input_fn`` for training.
      eval_files: Input passed to ``model.input_fn`` for evaluation.
      num_epochs: Second argument of ``model.input_fn`` for training.
      learning_rate: Forwarded to ``model.model_fn``.
    """
    num_channels = 6
    # does not work well in distributed mode cause it only counts local steps (I think...)
    hooks = [tf.train.StopAtStepHook(train_steps)]

    if is_chief:
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # Evaluation tensors live in their own graph
            eval_image, eval_truth, eval_name = model.input_fn(
                eval_files, 1, shuffle=False, shared_name=None)
            # Dictionary of tensors to be evaluated
            metric_dict = model.model_fn(model.EVAL, eval_name, eval_image,
                                         eval_truth, num_channels,
                                         learning_rate)
            # Hook that performs evaluation separate from training
            eval_hook = EvaluationRunHook(job_dir, metric_dict,
                                          evaluation_graph)
            hooks.append(eval_hook)
        hooks.append(CheckpointExporterHook(job_dir))

    # Training runs in a fresh default graph
    with tf.Graph().as_default():
        device_setter = tf.train.replica_device_setter(cluster=cluster_spec)
        with tf.device(device_setter):
            # Features and label tensors as read using filename queue
            train_image, train_truth, train_name = model.input_fn(
                train_files,
                num_epochs,
                shuffle=True,
                shared_name='train_queue')

            train_op, log_hook, train_summaries = model.model_fn(
                model.TRAIN, train_name, train_image, train_truth,
                num_channels, learning_rate)
            # Hook that logs training to the console
            hooks.append(log_hook)

            # Save the training summaries on every step
            hooks.append(
                tf.train.SummarySaverHook(
                    save_steps=1,
                    output_dir=get_summary_dir(job_dir),
                    summary_op=train_summaries))

        # MonitoredTrainingSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        session_options = dict(master=target,
                               is_chief=is_chief,
                               checkpoint_dir=job_dir,
                               hooks=hooks,
                               save_checkpoint_secs=60 * 3,
                               save_summaries_steps=1,
                               log_step_count_steps=5)
        with tf.train.MonitoredTrainingSession(**session_options) as session:
            # Keep stepping until the session reports it should stop.
            while not session.should_stop():
                session.run(train_op)
Example #14
0
    # Get the label forces 
    force_list_train = load_force_txt(train_filenames[1]+ '/force.txt',len(images_list_train))
    force_list_eval = load_force_txt(eval_filenames[1]+ '/force.txt',len(images_list_eval))

    # Specify the sizes of the dataset we train on and evaluate on
    params.train_size = len(images_list_train)
    params.eval_size = len(images_list_eval)

    # Create the two iterators over the two datasets
    print('=================================================')
    print('[INFO] Dataset is built by {0} training images and {1} eval images '
            .format(len(images_list_train), len(images_list_eval)))

    tf.debugging.set_log_device_placement(args.v)
    train_dataset = input_fn(True, images_list_train, force_list_train, params= params)
    eval_dataset  = input_fn(False, images_list_eval, force_list_eval, params= params)
    print('[INFO] Data pipeline is built')

    # Define the model
    print('=================================================')
    print('[INFO] Creating the model...')
    model_spec = model_fn(args.mode, params) 
    if args.v:
        model_spec['model'].summary()

    # Train the model
    print('=================================================')
    train_model = Train_and_Evaluate(model_spec, train_dataset, eval_dataset, args.log_dir)
    train_model.train_and_eval(params)
    print('=================================================')
Example #15
0
    params = Params(json_path)

    # check if the model directory is available
    assert os.path.exists(args.model_dir), "No model file found at {}".format(args.model_dir)
    model_path = os.path.join(args.model_dir, 'best_full_model_path')

    test_data_dir = os.path.join(args.data_dir, 'test')
    # Get the filenames from the train and dev sets
    test_filenames = [os.path.join(test_data_dir, f) for f in os.listdir(test_data_dir)]
    # Get the train images list
    images_list_test = glob.glob(test_filenames[0] + '/*.jpg')
    # Get the label forces 
    force_list_test = load_force_txt(test_filenames[1]+ '/force.txt',len(images_list_test))
    # Specify the sizes of the dataset we train on and evaluate on
    params.test_size = len(images_list_test)

    # Create the two iterators over the two datasets
    print('=================================================')
    print('[INFO] test data is built by {0} images'.format(len(images_list_test)))
    test_dataset = input_fn(False, images_list_test, force_list_test, params= params)

    # Open the saved  model from log file the model
    print('=================================================')
    loaded_model = tf.saved_model.load(model_path)
    print('[INFO] Model loaded...')

    # Test the model
    print('=================================================')
    test_model = Evaluate(loaded_model, test_dataset)
    test_model.test(params)
    print('=================================================')
Example #16
0
    if args.num_gpus > 0:
        strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=args.num_gpus)
        config = tf.estimator.RunConfig(train_distribute=strategy)
    else:
        config = tf.estimator.RunConfig()

    estimator = tf.estimator.Estimator(model_fn=model.model_fn,
            model_dir=args.model_dir,
            params={
                'learning_rate': args.learning_rate,
                'hidden_h1': args.hidden_h1,
                'label_size': 10
                },
            config=config
            )

    if args.debug == True:
        tf.logging.set_verbosity(tf.logging.INFO)

    # load data from keras
    train, test = tf.keras.datasets.mnist.load_data()

    train_x, train_y = train
    train_x = np.array(train_x, dtype=np.float32)
    train_y = tf.keras.utils.to_categorical(train_y, 10)

    estimator.train(input_fn=lambda:model.input_fn(train_x, train_y, args.epochs, args.batch_size))

    # exporting model
    estimator.export_savedmodel('saved_model', model.serving_input_receiver_fn)
Example #17
0
 def eval_input_fn():
     """Return the unshuffled input built from the split named by ``config['data']``."""
     eval_arrays = data[config['data']]
     options = {
         'batch_size': config['batch_size'],
         'shuffle': False,
     }
     return input_fn(*eval_arrays, **options)
Example #18
0
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
  """Train the DNN in a monitored session; the chief also evaluates and exports.

  Args:
    target (string): Tensorflow server target
    cluster_spec: Cluster passed to `tf.train.replica_device_setter`
    is_chief (bool): Boolean flag to specify a chief server
    train_steps (int): Maximum number of training steps; None means no
      step limit
    eval_steps (int): Number of steps to run evaluation for at each checkpoint.
      if eval_steps is None, evaluation will run for 1 epoch.
    job_dir (string): Output dir for checkpoint and summary
    train_files (string): List of CSV files to read train data
    eval_files (string): List of CSV files to read eval data
    train_batch_size (int): Batch size for training
    eval_batch_size (int): Batch size for evaluation
    learning_rate (float): Learning rate for Gradient Descent
    eval_frequency (int): Run evaluation frequency every n training steps.
      Evaluating too often costs performance; evaluating too rarely hides
      when training could stop. Use default values to start with
    first_layer_size (int): Size of the first DNN layer
    num_layers (int): Number of hidden layers in the DNN
    scale_factor (float): Decay rate for the size of hidden layers
    num_epochs (int): Maximum number of training data epochs on which to train
    export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input format
      for the outputed saved_model binary.
  """
  # Hidden layer sizes decay geometrically and never drop below 2 units
  hidden_units = [
      max(2, int(first_layer_size * scale_factor**layer_index))
      for layer_index in range(num_layers)
  ]

  # The chief is the one node in the cluster with extra responsibility
  # (by default worker task zero); here the master acts as the chief.
  #
  # See https://youtu.be/la_M6bCV91M?t=1203 for details on
  # distributed TensorFlow and motivation about chief.
  hooks = []
  if is_chief:
    tf.logging.info("Created DNN hidden units {}".format(hidden_units))
    evaluation_graph = tf.Graph()
    with evaluation_graph.as_default():
      # Read a single epoch only when no explicit eval step count is given
      eval_epochs = None if eval_steps else 1
      eval_features, eval_labels = model.input_fn(
          eval_files,
          num_epochs=eval_epochs,
          batch_size=eval_batch_size,
          shuffle=False)
      # In EVAL mode model.model_fn returns the metric dict
      metric_dict = model.model_fn(
          model.EVAL,
          eval_features.copy(),
          eval_labels,
          hidden_units=hidden_units,
          learning_rate=learning_rate)

    hooks.append(
        EvaluationRunHook(
            job_dir,
            metric_dict,
            evaluation_graph,
            eval_frequency,
            eval_steps=eval_steps,
        ))

  # Training runs in a fresh default graph
  with tf.Graph().as_default():
    # The replica device setter places parameters on the `ps` server
    # and the ops on the workers
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    device_setter = tf.train.replica_device_setter(cluster=cluster_spec)
    with tf.device(device_setter):
      # Features and label tensors as read using filename queue
      train_features, train_labels = model.input_fn(
          train_files,
          num_epochs=num_epochs,
          batch_size=train_batch_size)

      # Training op and global step tensor
      train_op, global_step_tensor = model.model_fn(
          model.TRAIN,
          train_features.copy(),
          train_labels,
          hidden_units=hidden_units,
          learning_rate=learning_rate)

    # MonitoredTrainingSession is a Session-like object that handles
    # initialization, recovery and hooks
    # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
    session_options = dict(master=target,
                           is_chief=is_chief,
                           checkpoint_dir=job_dir,
                           hooks=hooks,
                           save_checkpoint_secs=20,
                           save_summaries_steps=50)
    with tf.train.MonitoredTrainingSession(**session_options) as session:
      # The global step tracks progress across the distributed setting
      step = global_step_tensor.eval(session=session)

      # Step until the limit is reached or the session asks to stop; the
      # step limit is checked first, as a limit hit skips should_stop().
      while True:
        limit_reached = train_steps is not None and step >= train_steps
        if limit_reached or session.should_stop():
          break
        step, _ = session.run([global_step_tensor, train_op])

    # Find the filename of the latest saved checkpoint file
    latest_checkpoint = tf.train.latest_checkpoint(job_dir)

    # Only the chief exports the trained model
    if is_chief:
      serving_input_fn = model.SERVING_INPUT_FUNCTIONS[export_format]
      build_and_run_exports(latest_checkpoint,
                            job_dir,
                            serving_input_fn,
                            hidden_units)
Example #19
0
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.

    On the chief, a separate evaluation graph is built and attached to the
    training session through an `EvaluationRunHook`; after training the chief
    also exports the model from the latest checkpoint in `job_dir`.

    Args:
      target (string): Tensorflow server target
      cluster_spec: Cluster description passed as `cluster` to
        `tf.train.replica_device_setter` for op/variable placement
      is_chief (bool): Boolean flag to specify a chief server
      train_steps (int): Maximum number of training steps. If None, training
        runs until the session signals that it should stop.
      eval_steps (int): Number of steps to run evaluation for at each
        checkpoint. If eval_steps is None, evaluation will run for 1 epoch.
      job_dir (string): Output dir for checkpoint and summary
      train_files (string): List of CSV files to read train data
      eval_files (string): List of CSV files to read eval data
      train_batch_size (int): Batch size for training
      eval_batch_size (int): Batch size for evaluation
      learning_rate (float): Learning rate for Gradient Descent
      eval_frequency (int): Run evaluation frequency every n training steps.
        Do not evaluate too frequently otherwise you will
        pay for performance and do not evaluate too in-frequently
        otherwise you will not know how soon to stop training.
        Use default values to start with
      first_layer_size (int): Size of the first DNN layer
      num_layers (int): Number of hidden layers in the DNN
      scale_factor (float): Decay rate for the size of hidden layers
      num_epochs (int): Maximum number of training data epochs on which to
        train
      export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. Used as the key
        into `model.SERVING_INPUT_FUNCTIONS` to select the input format of
        the exported saved_model binary.
    """

    # Layer i has first_layer_size * scale_factor**i units, but never
    # fewer than 2.
    hidden_units = [
        max(2, int(first_layer_size * scale_factor**i))
        for i in range(num_layers)
    ]

    # If the server is chief which is `master`
    # In between graph replication Chief is one node in
    # the cluster with extra responsibility and by default
    # is worker task zero. We have assigned master as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and motivation about chief.
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        # Evaluation lives in its own graph so it does not share ops with
        # the training graph built below.
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():

            # Features and label tensors; a single unshuffled pass over the
            # eval data unless an explicit number of eval steps is given.
            features, labels = model.input_fn(
                eval_files,
                num_epochs=None if eval_steps else 1,
                batch_size=eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(model.EVAL,
                                         features.copy(),
                                         labels,
                                         hidden_units=hidden_units,
                                         learning_rate=learning_rate)

        hooks = [
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            )
        ]
    else:
        # Only the chief evaluates; other workers train without extra hooks.
        hooks = []

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):

            # Features and label tensors as read using filename queue
            features, labels = model.input_fn(train_files,
                                              num_epochs=num_epochs,
                                              batch_size=train_batch_size)

            # Returns the training graph and global step tensor
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        # Creates a MonitoredSession for training
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of global number of steps particularly in
            # distributed setting
            step = global_step_tensor.eval(session=session)

            # Run the training graph which returns the step number as tracked by
            # the global step tensor.
            # When train epochs is reached, session.should_stop() will be true.
            while (train_steps is None
                   or step < train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

        # Only the chief exports, so only the chief needs to look up the
        # latest saved checkpoint file; other workers skip the lookup.
        if is_chief:
            latest_checkpoint = tf.train.latest_checkpoint(job_dir)
            build_and_run_exports(latest_checkpoint, job_dir,
                                  model.SERVING_INPUT_FUNCTIONS[export_format],
                                  hidden_units)
Example #20
0
# df['signup_date'] = df['signup_date'].apply(lambda x: start + timedelta(days=x))
# The column holds day offsets; turn each one into a date relative to `start`
# (`start` and `predict_df` are defined earlier in the script).
predict_df['last_service_use_date'] = predict_df[
    'last_service_use_date'].apply(lambda x: start + timedelta(days=x))

# df.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)

# Get user's recency: days between each user's last use and the most recent
# last-use date in the data. The maximum is computed once up front instead of
# once per row inside the lambda.
latest_service_use_date = predict_df['last_service_use_date'].max()
predict_df['recency'] = predict_df['last_service_use_date'].apply(
    lambda x: (latest_service_use_date - x).days)

# Convert True False to 0 & 1 (as the strings '1' / '0')
predict_df.loc[predict_df['business_service'] == True,
               'business_service'] = '1'
predict_df.loc[predict_df['business_service'] == False,
               'business_service'] = '0'

# Placeholder label column; every row is set to 0 here.
predict_df['is_retained'] = 0
# df.loc[df['last_service_use_date'].dt.month.isin([6,7]), 'is_retained'] = 1

# NOTE(review): the cast to str runs before dropna, so a missing
# business_service value presumably becomes the string 'nan' and that row is
# not dropped on account of this column -- confirm this ordering is intended.
predict_df.business_service = predict_df.business_service.astype(str)
predict_df.dropna(inplace=True)

# Build the estimator from 'model_dir' and run both prediction passes over the
# same frame.
m = build_estimator('model_dir')
predicted_values = list(m.predict(input_fn=lambda: input_fn(predict_df)))
probs = list(m.predict_proba(input_fn=lambda: input_fn(predict_df)))

predict_df['predicted_values'] = predicted_values
predict_df['probs'] = probs

# NOTE(review): the output filename is misspelled ('predicttions'); left
# unchanged in case downstream consumers read this exact path.
predict_df.to_csv('predicttions.csv')