def _experiment_fn(run_config, hparams):
    """Build a `tf.contrib.learn.Experiment` around a custom Estimator.

    Args:
        run_config: Run configuration forwarded to the Estimator as `config`.
        hparams: Hyperparameter holder. The attributes read here are
            `train_files`, `eval_files`, `num_epochs`, `train_batch_size`,
            `eval_batch_size`, `embedding_size`, `first_layer_size`,
            `scale_factor`, `num_layers` and `learning_rate`.

    Returns:
        The constructed `tf.contrib.learn.Experiment`. Extra keyword
        arguments come from `experiment_args`, which is not defined in this
        function and is resolved from an enclosing scope.
    """

    def train_input():
        # num_epochs can control duration if train_steps isn't
        # passed to Experiment
        return model.input_fn(
            hparams.train_files,
            num_epochs=hparams.num_epochs,
            batch_size=hparams.train_batch_size)

    def eval_input():
        # Don't shuffle evaluation data
        return model.input_fn(
            hparams.eval_files,
            batch_size=hparams.eval_batch_size,
            shuffle=False)

    # Construct layer sizes with exponential decay, never below 2 units.
    layer_sizes = []
    for depth in range(hparams.num_layers):
        width = int(hparams.first_layer_size * hparams.scale_factor**depth)
        layer_sizes.append(max(2, width))

    model_fn = model.generate_model_fn(
        embedding_size=hparams.embedding_size,
        hidden_units=layer_sizes,
        learning_rate=hparams.learning_rate)
    estimator = tf.estimator.Estimator(model_fn, config=run_config)

    return tf.contrib.learn.Experiment(
        estimator,
        train_input_fn=train_input,
        eval_input_fn=eval_input,
        **experiment_args)
def run_experiment(hparams):
    """Train and evaluate a custom Estimator with `train_and_evaluate`.

    Args:
        hparams: Hyperparameter holder. The attributes read here are
            `train_files`, `eval_files`, `num_epochs`, `train_batch_size`,
            `eval_batch_size`, `train_steps`, `eval_steps`, `export_format`,
            `embedding_size`, `first_layer_size`, `scale_factor`,
            `num_layers`, `learning_rate` and `job_dir`.
    """

    def train_input():
        return model.input_fn(
            hparams.train_files,
            num_epochs=hparams.num_epochs,
            batch_size=hparams.train_batch_size)

    def eval_input():
        # Don't shuffle evaluation data
        return model.input_fn(
            hparams.eval_files,
            batch_size=hparams.eval_batch_size,
            shuffle=False)

    train_spec = tf.estimator.TrainSpec(
        train_input, max_steps=hparams.train_steps)

    # The serving function is selected by the requested export format.
    serving_fn = model.SERVING_FUNCTIONS[hparams.export_format]
    final_exporter = tf.estimator.FinalExporter('census', serving_fn)
    eval_spec = tf.estimator.EvalSpec(
        eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='census-eval')

    # Construct layer sizes with exponential decay, never below 2 units.
    layer_sizes = []
    for depth in range(hparams.num_layers):
        width = int(hparams.first_layer_size * hparams.scale_factor**depth)
        layer_sizes.append(max(2, width))

    estimator = tf.estimator.Estimator(
        model_fn=model.generate_model_fn(
            embedding_size=hparams.embedding_size,
            hidden_units=layer_sizes,
            learning_rate=hparams.learning_rate),
        model_dir=hparams.job_dir)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train_and_maybe_evaluate(hparams):
    """Train the bookings estimator and evaluate it with the high level API.

    Args:
        hparams: Holds hyperparameters used to train the model as name/value
            pairs. The attributes read here are `schema_file`,
            `tf_transform_dir`, `train_files`, `eval_files`, `train_steps`,
            `eval_steps` and `output_dir`.

    Returns:
        The estimator that was used for training (and maybe eval).
    """
    schema = bookings.read_schema(hparams.schema_file)
    tf_transform_output = tft.TFTransformOutput(hparams.tf_transform_dir)

    def train_input():
        return model.input_fn(
            hparams.train_files,
            tf_transform_output,
            batch_size=TRAIN_BATCH_SIZE)

    def eval_input():
        return model.input_fn(
            hparams.eval_files,
            tf_transform_output,
            batch_size=EVAL_BATCH_SIZE)

    def serving_receiver_fn():
        return model.example_serving_receiver_fn(tf_transform_output, schema)

    train_spec = tf.estimator.TrainSpec(
        train_input, max_steps=hparams.train_steps)

    eval_spec = tf.estimator.EvalSpec(
        eval_input,
        steps=hparams.eval_steps,
        exporters=[tf.estimator.FinalExporter('bookings', serving_receiver_fn)],
        name='bookings-eval')

    # Checkpoints are written under the serving model directory.
    base_config = tf.estimator.RunConfig(
        save_checkpoints_steps=999, keep_checkpoint_max=1)
    serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
    run_config = base_config.replace(model_dir=serving_model_dir)

    # Construct layer sizes with exponential decay, never below 2 units.
    layer_sizes = []
    for depth in range(NUM_DNN_LAYERS):
        width = int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**depth)
        layer_sizes.append(max(2, width))

    estimator = model.build_estimator(
        tf_transform_output,
        hidden_units=layer_sizes,
        config=run_config)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return estimator
def train_and_maybe_evaluate(train_files, eval_files, hparams):
    """Train the chicago-taxi estimator and evaluate it with the high level API.

    Args:
        train_files: Training input, forwarded to `model.input_fn`.
        eval_files: Evaluation input, forwarded to `model.input_fn`.
        hparams: Holds hyperparameters used to train the model as name/value
            pairs. The attributes read here are `tf_transform_dir`,
            `train_steps`, `eval_steps` and `output_dir`.

    Returns:
        The estimator that was used for training (and maybe eval).
    """
    schema = taxi.read_schema('schema.pbtxt')
    transform_dir = hparams.tf_transform_dir

    def train_input():
        return model.input_fn(
            train_files, transform_dir, batch_size=TRAIN_BATCH_SIZE)

    def eval_input():
        return model.input_fn(
            eval_files, transform_dir, batch_size=EVAL_BATCH_SIZE)

    def serving_receiver_fn():
        return model.example_serving_receiver_fn(transform_dir, schema)

    train_spec = tf.estimator.TrainSpec(
        train_input, max_steps=hparams.train_steps)

    eval_spec = tf.estimator.EvalSpec(
        eval_input,
        steps=hparams.eval_steps,
        exporters=[
            tf.estimator.FinalExporter('chicago-taxi', serving_receiver_fn)
        ],
        name='chicago-taxi-eval')

    # Checkpoints are written under the serving model directory.
    base_config = tf.estimator.RunConfig(
        save_checkpoints_steps=999, keep_checkpoint_max=1)
    serving_model_dir = os.path.join(hparams.output_dir, SERVING_MODEL_DIR)
    run_config = base_config.replace(model_dir=serving_model_dir)

    # Construct layer sizes with exponential decay, never below 2 units.
    layer_sizes = []
    for depth in range(NUM_DNN_LAYERS):
        width = int(FIRST_DNN_LAYER_SIZE * DNN_DECAY_FACTOR**depth)
        layer_sizes.append(max(2, width))

    estimator = model.build_estimator(
        transform_dir,
        hidden_units=layer_sizes,
        config=run_config)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return estimator
def predict():
    """Feed input batches through an exported model until interrupted.

    Batches are pulled from the tensors returned by `input_fn()`, fed to the
    SavedModel found at the path returned by `get_latest_export()`, and the
    predictions are accumulated in a `PredictionStats` object whose result is
    printed after the loop is interrupted with Ctrl+C.
    """
    batch = input_fn()
    input_tensor = batch[0]['input']
    label_tensor = batch[1]['labels']

    print("Creating session")
    sess = tf.Session()
    for init_op in (tf.local_variables_initializer(),
                    tf.global_variables_initializer()):
        sess.run(init_op)

    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(sess, coord)

    stats = PredictionStats()
    try:
        # Load the exported graph into the running session.
        tf.saved_model.loader.load(sess, ['serve'], get_latest_export())
        output_tensor = sess.graph.get_tensor_by_name(
            OUTPUT_TENSOR_NAME + ':0')
        print('Predicting images until Ctrl+C is pressed')
        while True:
            data, label = sess.run([input_tensor, label_tensor])
            feed = {INPUT_TENSOR_NAME + ':0': data}
            stats.add_predictions(sess.run(output_tensor, feed), label)
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to end the prediction loop.
        pass
    finally:
        coord.request_stop()
        coord.join()
        sess.close()

    print()
    stats.print_result()
def run_experiment(hparams):
    """Train and evaluate the estimator from `model.build_estimator`.

    Args:
        hparams: Hyperparameter holder. The attributes read here are
            `train_files`, `eval_files`, `num_epochs`, `train_batch_size`,
            `eval_batch_size`, `train_steps`, `eval_steps`, `export_format`,
            `job_dir`, `embedding_size`, `first_layer_size`, `scale_factor`
            and `num_layers`.
    """

    def train_input():
        return model.input_fn(
            hparams.train_files,
            num_epochs=hparams.num_epochs,
            batch_size=hparams.train_batch_size)

    def eval_input():
        # Don't shuffle evaluation data
        return model.input_fn(
            hparams.eval_files,
            batch_size=hparams.eval_batch_size,
            shuffle=False)

    train_spec = tf.estimator.TrainSpec(
        train_input, max_steps=hparams.train_steps)

    # The serving function is selected by the requested export format.
    serving_fn = model.SERVING_FUNCTIONS[hparams.export_format]
    final_exporter = tf.estimator.FinalExporter('census', serving_fn)
    eval_spec = tf.estimator.EvalSpec(
        eval_input,
        steps=hparams.eval_steps,
        exporters=[final_exporter],
        name='census-eval')

    run_config = tf.estimator.RunConfig().replace(model_dir=hparams.job_dir)
    print('model dir {}'.format(run_config.model_dir))

    # Construct layer sizes with exponential decay, never below 2 units.
    layer_sizes = []
    for depth in range(hparams.num_layers):
        width = int(hparams.first_layer_size * hparams.scale_factor**depth)
        layer_sizes.append(max(2, width))

    estimator = model.build_estimator(
        embedding_size=hparams.embedding_size,
        hidden_units=layer_sizes,
        config=run_config)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train_and_evaluate(args):
    """Train, evaluate and export the model defined in model.py.

    Args:
        args: (Parsed arguments obj) An object containing all parsed
            arguments. The attributes read here are `model_dir`,
            `learning_rate`, `train`, `test`, `batch_size` and `steps`.
    """

    def train_input():
        return model.input_fn(
            args.train,
            batch_size=args.batch_size,
            mode=tf.estimator.ModeKeys.TRAIN)

    def eval_input():
        return model.input_fn(
            args.test,
            batch_size=args.batch_size,
            mode=tf.estimator.ModeKeys.EVAL)

    # Define running config and create the estimator.
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=6000)
    estimator = model.keras_estimator(
        model_dir=args.model_dir,
        config=run_config,
        learning_rate=args.learning_rate)

    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input, max_steps=args.steps)

    # '/opt/ml/model' exists if running in AWS SM Container; otherwise the
    # exporter uses the name 'exporter'.
    sagemaker_model_dir = '/opt/ml/model'
    container_model_output_dir = (
        sagemaker_model_dir if os.path.exists(sagemaker_model_dir)
        else 'exporter')
    exporter = tf.estimator.LatestExporter(
        container_model_output_dir, model.serving_input_fn)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input,
        steps=600,
        exporters=exporter,
        start_delay_secs=10,
        throttle_secs=60)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def evaluate():
    """Run the model in PREDICT mode over input batches until interrupted.

    Predictions and labels are evaluated together in the session from
    `get_session()` and accumulated in a `PredictionStats` object. A progress
    line is printed after every batch and the final result is printed once
    the loop is interrupted with Ctrl+C.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    input_dict, label_dict = input_fn()
    nn = model_fn(input_dict, None, tf.estimator.ModeKeys.PREDICT)

    stats = PredictionStats()
    sess = get_session()
    coord = tf.train.Coordinator()
    tf.train.start_queue_runners(sess, coord)

    try:
        # Fetch predictions and the matching labels in a single run call.
        fetches = [nn.predictions['predictions'], label_dict['labels']]
        while True:
            pred, act = sess.run(fetches)
            stats.add_predictions(pred, act)
            progress = (stats.get_amount(), stats.get_accuracy())
            print('Predictions: %00000d, Accuracy: %.4f' % progress)
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to end the evaluation loop.
        pass
    finally:
        coord.request_stop()
        coord.join()
        sess.close()

    print()
    stats.print_result()
def main(unused_argv):
    """Build the datasets and the model, then run training with evaluation.

    Reads its configuration from `FLAGS` (`model_dir`, `data_dir`, `stn_dir`,
    `loging_dir`, `mode`, `verbose`) and from `params.json` inside
    `FLAGS.model_dir`.

    Args:
        unused_argv: Ignored.

    Raises:
        AssertionError: If TensorFlow is older than 2.0, if `params.json` is
            missing, or if `FLAGS.data_dir` does not exist.
    """
    # Set the random seed for the whole graph for reproducible experiments
    tf.random.set_seed(230)
    print("TensorFlow version: ", tf.__version__)
    assert version.parse(tf.__version__).release[0] >= 2, \
        "This notebook requires TensorFlow 2.0 or above."
    tf.get_logger().setLevel(logging.ERROR)

    # Query the physical GPUs once and reuse the result below.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    print("Num GPUs Available: ", len(gpus))
    if gpus:
        # Restrict TensorFlow to only use the first GPU
        try:
            tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
            tf.config.experimental.set_memory_growth(gpus[0], True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus),
                  "Logical GPU")
        except RuntimeError as e:
            # Visible devices must be set before GPUs have been initialized
            print(e)

    # NOTE(review): if main is invoked through absl's app.run, the flags are
    # already parsed at this point, so these registrations may not be
    # enforced for this run -- confirm.
    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('data_dir')
    flags.mark_flag_as_required('stn_dir')

    # Load the parameters from json file
    json_path = os.path.join(FLAGS.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)

    # Check that the data directory is available
    assert os.path.exists(FLAGS.data_dir), "No data file found at {}".format(
        FLAGS.data_dir)

    # Make sure the logging directory exists. makedirs with exist_ok avoids
    # the check-then-create race and also creates missing parent directories.
    os.makedirs(FLAGS.loging_dir, exist_ok=True)

    train_data_dir = os.path.join(FLAGS.data_dir, 'train')
    eval_data_dir = os.path.join(FLAGS.data_dir, 'eval')

    # Get the entries of the train and eval directories.
    # NOTE(review): os.listdir gives no ordering guarantee, yet index 0 is
    # used below as the image folder and index 1 as the force folder --
    # confirm the directory layout makes this reliable.
    train_filenames = [
        os.path.join(train_data_dir, f) for f in os.listdir(train_data_dir)
    ]
    eval_filenames = [
        os.path.join(eval_data_dir, f) for f in os.listdir(eval_data_dir)
    ]

    # Get the train and eval image lists
    images_list_train = glob.glob(train_filenames[0] + '/*.jpg')
    images_list_eval = glob.glob(eval_filenames[0] + '/*.jpg')

    # Get the label forces
    force_list_train = load_force_txt(train_filenames[1] + '/force.txt',
                                      len(images_list_train))
    force_list_eval = load_force_txt(eval_filenames[1] + '/force.txt',
                                     len(images_list_eval))

    # Specify the sizes of the dataset we train on and evaluate on
    params.train_size = len(images_list_train)
    params.eval_size = len(images_list_eval)

    # Create the two iterators over the two datasets
    print('=================================================')
    print(
        '[INFO] Dataset is built by {0} training images and {1} eval images '.
        format(len(images_list_train), len(images_list_eval)))
    tf.debugging.set_log_device_placement(False)
    train_dataset = input_fn(True,
                             images_list_train,
                             force_list_train,
                             params=params)
    eval_dataset = input_fn(False,
                            images_list_eval,
                            force_list_eval,
                            params=params)
    print('[INFO] Data pipeline is built')

    # Define the model
    print('=================================================')
    print('[INFO] Creating the model...')
    stn_module = tf.keras.models.load_model(FLAGS.stn_dir)
    model_spec = model_fn(FLAGS.mode, params, stn_module)
    if FLAGS.verbose:
        model_spec['model'].summary()

    # Train the model
    print('=================================================')
    train_model = Train_and_Evaluate(model_spec, train_dataset, eval_dataset,
                                     FLAGS.loging_dir)
    train_model.train_and_eval(params)
    print('=================================================')
def train_input_fn():
    """Return the shuffled training input built from `data['train']`.

    Batch size and number of epochs are read from `config`.
    """
    train_arrays = data['train']
    batch_size = config['batch_size']
    n_epochs = config['n_epochs']
    return input_fn(*train_arrays,
                    batch_size=batch_size,
                    n_epochs=n_epochs,
                    shuffle=True)
# Collect the raw jpg images from the first train / eval entry.
raw_images_list_train = glob.glob(train_filenames[0] + '/*.jpg')
raw_images_list_eval = glob.glob(eval_filenames[0] + '/*.jpg')

# Specify the sizes of the dataset we train on and evaluate on.
# The sizes are taken from the aligned image lists, which are built outside
# this excerpt.
params.train_size = len(aligned_images_list_train)
params.eval_size = len(aligned_images_list_eval)

# Create the two iterators over the two datasets
print('=================================================')
print(
    '[INFO] Dataset is built by {0} training images and {1} eval images '.
    format(len(aligned_images_list_train), len(aligned_images_list_eval)))
# Device placement logging follows the `v` argument.
tf.debugging.set_log_device_placement(args.v)
# Each dataset pairs the raw images with the aligned images.
train_dataset = input_fn(True,
                         raw_images_list_train,
                         aligned_images_list_train,
                         params=params)
eval_dataset = input_fn(False,
                        raw_images_list_eval,
                        aligned_images_list_eval,
                        params=params)
print('[INFO] Data pipeline is built')

# Define the model
print('=================================================')
print('[INFO] Creating the model...')
model_spec = model_fn(args.mode, params)
if args.v:
    model_spec['model'].summary()

# Train the model
# Load Vocabularies words = tf.contrib.lookup.index_table_from_file(path_words, num_oov_buckets=1) # Create the input data pipeline logging.info("Creating the datasets...") train_sentences = load_dataset_from_text(path_train_sentences) eval_sentences = load_dataset_from_text(path_eval_sentences) # Specify other parameters for the dataset and the model params.eval_size = params.dev_size params.buffer_size = params.train_size # buffer size for shuffling params.id_pad_word = words.lookup(tf.constant(params.pad_word)) # Create the two iterators over the two datasets train_inputs = input_fn('train', train_sentences, words, params) eval_inputs = input_fn('eval', eval_sentences, words, params) logging.info("- done.") # Define the models (2 different set of nodes that share weights for train and eval) logging.info("Creating the model...") train_model_spec = model_fn('train', train_inputs, params) eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True) logging.info("- done.") # Train the model logging.info("Starting training for {} epoch(s)".format(params.num_epochs)) train_and_evaluate(train_model_spec, eval_model_spec, args.model_dir, params, args.restore_dir)
def run(cluster_spec, target, is_chief, train_steps, job_dir, train_files,
        eval_files, num_epochs, learning_rate):
    """Build the evaluation and training graphs and run the training loop.

    NOTE(review): the block nesting below was reconstructed from a flattened
    source; confirm it against the original layout, in particular which hooks
    are chief-only.

    Args:
        cluster_spec: Cluster description forwarded to
            `tf.train.replica_device_setter`.
        target: Passed as `master` to `tf.train.MonitoredTrainingSession`.
        is_chief: When true, an evaluation graph is built and the evaluation
            hook is registered.
        train_steps: Step argument of the `tf.train.StopAtStepHook`.
        job_dir: Checkpoint directory of the session; also passed to the
            hooks and to `get_summary_dir`.
        train_files: Training input forwarded to `model.input_fn`.
        eval_files: Evaluation input forwarded to `model.input_fn`.
        num_epochs: Second positional argument of the training
            `model.input_fn` call.
        learning_rate: Forwarded to `model.model_fn`.
    """
    num_channels = 6
    hooks = list()
    # does not work well in distributed mode cause it only counts local steps (I think...)
    hooks.append(tf.train.StopAtStepHook(train_steps))

    if is_chief:
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # Features and label tensors
            image, ground_truth, name = model.input_fn(eval_files, 1,
                                                       shuffle=False,
                                                       shared_name=None)
            # Returns dictionary of tensors to be evaluated
            metric_dict = model.model_fn(model.EVAL, name, image,
                                         ground_truth, num_channels,
                                         learning_rate)
        # hook that performs evaluation separate from training
        hooks.append(
            EvaluationRunHook(job_dir, metric_dict, evaluation_graph))
        hooks.append(CheckpointExporterHook(job_dir))

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            # Features and label tensors as read using filename queue
            image, ground_truth, name = model.input_fn(
                train_files, num_epochs, shuffle=True,
                shared_name='train_queue')
            # Returns the training op, a logging hook and the summaries
            train_op, log_hook, train_summaries = model.model_fn(
                model.TRAIN, name, image, ground_truth, num_channels,
                learning_rate)
            # Hook that logs training to the console
            hooks.append(log_hook)
            # Save the training summaries on every step.
            train_summary_hook = tf.train.SummarySaverHook(
                save_steps=1,
                output_dir=get_summary_dir(job_dir),
                summary_op=train_summaries)
            hooks.append(train_summary_hook)

        # Creates a MonitoredSession for training
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=job_dir,
                hooks=hooks,
                save_checkpoint_secs=60 * 3,
                save_summaries_steps=1,
                log_step_count_steps=5) as session:
            # Run the training op until the session asks to stop (for
            # example once a hook requests it).
            while not session.should_stop():
                session.run(train_op)
# Get the label forces force_list_train = load_force_txt(train_filenames[1]+ '/force.txt',len(images_list_train)) force_list_eval = load_force_txt(eval_filenames[1]+ '/force.txt',len(images_list_eval)) # Specify the sizes of the dataset we train on and evaluate on params.train_size = len(images_list_train) params.eval_size = len(images_list_eval) # Create the two iterators over the two datasets print('=================================================') print('[INFO] Dataset is built by {0} training images and {1} eval images ' .format(len(images_list_train), len(images_list_eval))) tf.debugging.set_log_device_placement(args.v) train_dataset = input_fn(True, images_list_train, force_list_train, params= params) eval_dataset = input_fn(False, images_list_eval, force_list_eval, params= params) print('[INFO] Data pipeline is built') # Define the model print('=================================================') print('[INFO] Creating the model...') model_spec = model_fn(args.mode, params) if args.v: model_spec['model'].summary() # Train the model print('=================================================') train_model = Train_and_Evaluate(model_spec, train_dataset, eval_dataset, args.log_dir) train_model.train_and_eval(params) print('=================================================')
params = Params(json_path) # check if the model directory is available assert os.path.exists(args.model_dir), "No model file found at {}".format(args.model_dir) model_path = os.path.join(args.model_dir, 'best_full_model_path') test_data_dir = os.path.join(args.data_dir, 'test') # Get the filenames from the train and dev sets test_filenames = [os.path.join(test_data_dir, f) for f in os.listdir(test_data_dir)] # Get the train images list images_list_test = glob.glob(test_filenames[0] + '/*.jpg') # Get the label forces force_list_test = load_force_txt(test_filenames[1]+ '/force.txt',len(images_list_test)) # Specify the sizes of the dataset we train on and evaluate on params.test_size = len(images_list_test) # Create the two iterators over the two datasets print('=================================================') print('[INFO] test data is built by {0} images'.format(len(images_list_test))) test_dataset = input_fn(False, images_list_test, force_list_test, params= params) # Open the saved model from log file the model print('=================================================') loaded_model = tf.saved_model.load(model_path) print('[INFO] Model loaded...') # Test the model print('=================================================') test_model = Evaluate(loaded_model, test_dataset) test_model.test(params) print('=================================================')
# Use a mirrored distribution strategy when GPUs are requested; otherwise
# fall back to the default run configuration.
if args.num_gpus > 0:
    strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=args.num_gpus)
    config = tf.estimator.RunConfig(train_distribute=strategy)
else:
    config = tf.estimator.RunConfig()

# Build the estimator; the model hyperparameters travel in `params`.
estimator = tf.estimator.Estimator(model_fn=model.model_fn,
                                   model_dir=args.model_dir,
                                   params={
                                       'learning_rate': args.learning_rate,
                                       'hidden_h1': args.hidden_h1,
                                       'label_size': 10
                                   },
                                   config=config
                                   )

# Raise the log verbosity to INFO only when debugging is requested.
if args.debug == True:
    tf.logging.set_verbosity(tf.logging.INFO)

# load data from keras
train, test = tf.keras.datasets.mnist.load_data()
train_x, train_y = train
train_x = np.array(train_x, dtype=np.float32)
# One-hot encode the labels over 10 classes.
train_y = tf.keras.utils.to_categorical(train_y, 10)

estimator.train(input_fn=lambda:model.input_fn(train_x, train_y, args.epochs, args.batch_size))

# exporting model
estimator.export_savedmodel('saved_model', model.serving_input_receiver_fn)
def eval_input_fn():
    """Return the unshuffled evaluation input.

    The split is chosen by `config['data']` as a key into `data`; the batch
    size is read from `config`.
    """
    split_name = config['data']
    eval_arrays = data[split_name]
    batch_size = config['batch_size']
    return input_fn(*eval_arrays, batch_size=batch_size, shuffle=False)
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.

    Args:
      target (string): Tensorflow server target
      cluster_spec: Cluster description forwarded to
        `tf.train.replica_device_setter`
      is_chief (bool): Boolean flag to specify a chief server
      train_steps (int): Maximum number of training steps; None means no
        step limit is applied by the loop
      eval_steps (int): Number of steps to run evaluation for at each
        checkpoint. If eval_steps is None, evaluation will run for 1 epoch.
      job_dir (string): Output dir for checkpoint and summary
      train_files (string): List of CSV files to read train data
      eval_files (string): List of CSV files to read eval data
      train_batch_size (int): Batch size for training
      eval_batch_size (int): Batch size for evaluation
      learning_rate (float): Learning rate for Gradient Descent
      eval_frequency (int): Run evaluation frequency every n training steps.
        Do not evaluate too frequently otherwise you will pay for
        performance, and do not evaluate too infrequently otherwise you
        will not know how soon to stop training. Use default values to
        start with
      first_layer_size (int): Size of the first DNN layer
      num_layers (int): Number of hidden layers in the DNN
      scale_factor (float): Decay rate for the size of hidden layers
      num_epochs (int): Maximum number of training data epochs on which to
        train
      export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input
        format for the outputted saved_model binary.
    """
    # Calculate the number of hidden units: each layer is the first layer
    # size scaled by scale_factor**i, never below 2 units.
    hidden_units = []
    for layer_index in range(num_layers):
        width = int(first_layer_size * scale_factor**layer_index)
        hidden_units.append(max(2, width))

    # If the server is chief which is `master`
    # In between graph replication Chief is one node in
    # the cluster with extra responsibility and by default
    # is worker task zero. We have assigned master as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and motivation about chief.
    hooks = []
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # Features and label tensors; read endlessly when a fixed number
            # of eval steps is requested, otherwise exactly one epoch.
            eval_epochs = None if eval_steps else 1
            features, labels = model.input_fn(
                eval_files,
                num_epochs=eval_epochs,
                batch_size=eval_batch_size,
                shuffle=False)

            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(
                model.EVAL,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        hooks.append(
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            ))

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        device_setter = tf.train.replica_device_setter(cluster=cluster_spec)
        with tf.device(device_setter):
            # Features and label tensors as read using filename queue
            features, labels = model.input_fn(
                train_files,
                num_epochs=num_epochs,
                batch_size=train_batch_size)

            # Returns the training graph and global step tensor
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        # Creates a MonitoredSession for training
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of global number of steps
            # particularly in distributed setting
            step = global_step_tensor.eval(session=session)

            # Run the training graph which returns the step number as
            # tracked by the global step tensor.
            # When train epochs is reached, session.should_stop() will be
            # true.
            while True:
                step_limit_hit = (train_steps is not None
                                  and step >= train_steps)
                if step_limit_hit or session.should_stop():
                    break
                step, _ = session.run([global_step_tensor, train_op])

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(job_dir)

        # Only perform this if chief
        if is_chief:
            serving_input_fn = model.SERVING_INPUT_FUNCTIONS[export_format]
            build_and_run_exports(latest_checkpoint, job_dir,
                                  serving_input_fn, hidden_units)
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.

    Args:
      target (string): Tensorflow server target
      cluster_spec: Cluster description forwarded to
        `tf.train.replica_device_setter`
      is_chief (bool): Boolean flag to specify a chief server
      train_steps (int): Maximum number of training steps; None means no
        step limit is applied by the loop
      eval_steps (int): Number of steps to run evaluation for at each
        checkpoint. If eval_steps is None, evaluation will run for 1 epoch.
      job_dir (string): Output dir for checkpoint and summary
      train_files (string): List of CSV files to read train data
      eval_files (string): List of CSV files to read eval data
      train_batch_size (int): Batch size for training
      eval_batch_size (int): Batch size for evaluation
      learning_rate (float): Learning rate for Gradient Descent
      eval_frequency (int): Run evaluation frequency every n training steps.
        Do not evaluate too frequently otherwise you will pay for
        performance, and do not evaluate too infrequently otherwise you
        will not know how soon to stop training. Use default values to
        start with
      first_layer_size (int): Size of the first DNN layer
      num_layers (int): Number of hidden layers in the DNN
      scale_factor (float): Decay rate for the size of hidden layers
      num_epochs (int): Maximum number of training data epochs on which to
        train
      export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input
        format for the outputted saved_model binary.
    """

    def layer_width(layer_index):
        # Exponentially decayed layer size, never below 2 units.
        return max(2, int(first_layer_size * scale_factor**layer_index))

    # Calculate the number of hidden units
    hidden_units = list(map(layer_width, range(num_layers)))

    # If the server is chief which is `master`
    # In between graph replication Chief is one node in
    # the cluster with extra responsibility and by default
    # is worker task zero. We have assigned master as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and motivation about chief.
    hooks = []
    if is_chief:
        tf.logging.info("Created DNN hidden units {}".format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # Features and label tensors; read endlessly when a fixed number
            # of eval steps is requested, otherwise exactly one epoch.
            if eval_steps:
                eval_epochs = None
            else:
                eval_epochs = 1
            features, labels = model.input_fn(
                eval_files,
                num_epochs=eval_epochs,
                batch_size=eval_batch_size,
                shuffle=False)

            # Accuracy and AUROC metrics
            # model.model_fn returns the dict when EVAL mode
            metric_dict = model.model_fn(
                model.EVAL,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        evaluation_hook = EvaluationRunHook(
            job_dir,
            metric_dict,
            evaluation_graph,
            eval_frequency,
            eval_steps=eval_steps,
        )
        hooks.append(evaluation_hook)

    # Create a new graph and specify that as default
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            # Features and label tensors as read using filename queue
            features, labels = model.input_fn(
                train_files,
                num_epochs=num_epochs,
                batch_size=train_batch_size)

            # Returns the training graph and global step tensor
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        # Creates a MonitoredSession for training
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        session_kwargs = dict(
            master=target,
            is_chief=is_chief,
            checkpoint_dir=job_dir,
            hooks=hooks,
            save_checkpoint_secs=20,
            save_summaries_steps=50)
        with tf.train.MonitoredTrainingSession(**session_kwargs) as session:
            # Global step to keep track of global number of steps
            # particularly in distributed setting
            step = global_step_tensor.eval(session=session)

            # Run the training graph which returns the step number as
            # tracked by the global step tensor.
            # When train epochs is reached, session.should_stop() will be
            # true.
            fetches = [global_step_tensor, train_op]
            while ((train_steps is None or step < train_steps)
                   and not session.should_stop()):
                step, _ = session.run(fetches)

        # Find the filename of the latest saved checkpoint file
        latest_checkpoint = tf.train.latest_checkpoint(job_dir)

        # Only perform this if chief
        if is_chief:
            build_and_run_exports(
                latest_checkpoint,
                job_dir,
                model.SERVING_INPUT_FUNCTIONS[export_format],
                hidden_units)
# df['signup_date'] = df['signup_date'].apply(lambda x: start + timedelta(days=x))
# Turn the day offsets into dates relative to `start`.
predict_df['last_service_use_date'] = predict_df[
    'last_service_use_date'].apply(lambda x: start + timedelta(days=x))
# df.rename(columns={'Unnamed: 0': 'user_id'}, inplace=True)

# Get user's recency: days between the most recent use date in the frame and
# each user's own last use date. The maximum is computed once up front
# instead of once per row inside the lambda.
latest_use_date = predict_df['last_service_use_date'].max()
predict_df['recency'] = predict_df['last_service_use_date'].apply(
    lambda x: (latest_use_date - x).days)

# Convert True False to 0 & 1
predict_df.loc[predict_df['business_service'] == True,
               'business_service'] = '1'
predict_df.loc[predict_df['business_service'] == False,
               'business_service'] = '0'
predict_df['is_retained'] = 0
# df.loc[df['last_service_use_date'].dt.month.isin([6,7]), 'is_retained'] = 1
predict_df.business_service = predict_df.business_service.astype(str)
predict_df.dropna(inplace=True)

# Predict classes and probabilities with the estimator built for
# 'model_dir', then store both next to the input rows.
m = build_estimator('model_dir')
predicted_values = list(m.predict(input_fn=lambda: input_fn(predict_df)))
probs = list(m.predict_proba(input_fn=lambda: input_fn(predict_df)))
predict_df['predicted_values'] = predicted_values
predict_df['probs'] = probs
predict_df.to_csv('predicttions.csv')