def train_and_evaluate(args):
    iris_model = model.model_fn()

    # Load the iris dataset.
    from sklearn.datasets import load_iris
    iris_data = load_iris()
    _X = iris_data.data
    _y = iris_data.target
    X = _X

    # One-hot encode the labels.
    from sklearn.preprocessing import OneHotEncoder
    ohe = OneHotEncoder()
    y = ohe.fit_transform(np.reshape(_y, (-1, 1))).toarray()

    # Feature scaling is important for convergence of the neural network.
    # scaler = StandardScaler()
    # X = scaler.fit_transform(_X)

    # Split the dataset into training and testing sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=2)

    iris_model.fit(X_train, y_train, verbose=1, batch_size=5, epochs=50)

    # Save the model to the job directory (e.g. Google Cloud Storage).
    # iris_model.save('model.pb')
    # with file_io.FileIO('model.pb', mode='rb') as input_f:
    #     with file_io.FileIO(args.job_dir + '/model.pb', mode='w+') as output_f:
    #         output_f.write(input_f.read())
    iris_model.save(os.path.join(args.job_dir, 'export'))
def train_model(args):
    mnist_model = model.model_fn(args.learning_rate)
    train_dataset, eval_dataset = model.get_dataset(args.train_batch_size)
    history = mnist_model.fit(train_dataset, epochs=args.num_epochs)
    eval_loss, eval_acc = mnist_model.evaluate(eval_dataset)
    print('Eval loss: {}, Eval accuracy: {}'.format(eval_loss, eval_acc))
def train_and_evaluate(args):
    INPUT_DIM = args.training_history * ONE_HOUR
    CLASS_SIZE = len(bins) + 1
    hidden_units = args.hidden_units
    # hidden_units = [int(units) for units in args.hidden_units.split(',')]
    learning_rate = args.learning_rate
    disk_model = model.model_fn(INPUT_DIM, CLASS_SIZE, hidden_units,
                                learning_rate)

    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not args.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=args.checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    # evaluation = ContinuousEval(args.eval_frequency, args.eval_files,
    #                             args.learning_rate, args.job_dir)

    # Tensorboard logs callback.
    tb_log = TensorBoard(
        log_dir=os.path.join(args.job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, tb_log]

    history = disk_model.fit_generator(
        model.generator_input(args.train_files, args.training_history,
                              args.train_batch_size),
        validation_data=model.generator_input(args.eval_files,
                                              args.training_history,
                                              args.eval_batch_size),
        steps_per_epoch=args.train_steps,
        validation_steps=10,
        epochs=args.num_epochs,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if args.job_dir.startswith('gs://'):
        disk_model.save(DISK_MODEL)
        copy_file_to_gcs(args.job_dir, DISK_MODEL)
    else:
        disk_model.save(os.path.join(args.job_dir, DISK_MODEL))

    with file_io.FileIO(
            os.path.join(args.job_dir, 'history'), mode='w+') as output_f:
        pickle.dump(history.history, output_f)
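# Most trainers in this collection call a copy_file_to_gcs helper as part of
# the h5py/GCS workaround. A minimal sketch of that helper, assuming
# TensorFlow's file_io module; it mirrors the commented-out pattern in the
# iris trainer above and is not necessarily the exact implementation used in
# each project:
from tensorflow.python.lib.io import file_io


def copy_file_to_gcs(job_dir, file_path):
    # Stream the locally written file into the GCS job directory.
    with file_io.FileIO(file_path, mode='rb') as input_f:
        with file_io.FileIO(
                os.path.join(job_dir, file_path), mode='w+') as output_f:
            output_f.write(input_f.read())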
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             train_batch_size, eval_batch_size, learning_rate, eval_frequency,
             first_layer_size, num_layers, scale_factor, eval_num_epochs,
             num_epochs, checkpoint_epochs):
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    try:
        os.makedirs(job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback. Note: when monitoring a loss, mode must be
    # 'min' (the original 'max' would keep the worst checkpoint).
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(eval_frequency, eval_files, learning_rate,
                                job_dir)

    # Tensorboard logs callback.
    tblog = TensorBoard(
        log_dir=os.path.join(job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True)

    callbacks = [checkpoint, evaluation, tblog]

    census_model.fit_generator(
        model.generator_input(train_files, chunk_size=CHUNK_SIZE),
        steps_per_epoch=train_steps,
        epochs=num_epochs,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if job_dir.startswith('gs://'):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(job_dir, CENSUS_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    if os.path.exists(os.path.join(job_dir, 'export')):
        shutil.rmtree(os.path.join(job_dir, 'export'))
    model.to_savedmodel(census_model, os.path.join(job_dir, 'export'))
def build_and_run_exports(latest, job_dir, serving_input_fn, hidden_units):
    """Given the latest checkpoint file, export the saved model.

    Args:
        latest (string): Latest checkpoint file.
        job_dir (string): Location of checkpoints and model files.
        serving_input_fn (function): Serving input function used to build
            the export signature.
        hidden_units (list): Number of hidden units.
    """
    prediction_graph = tf.Graph()
    # Create the exporter.
    exporter = tf.saved_model.builder.SavedModelBuilder(
        os.path.join(job_dir, 'export'))
    with prediction_graph.as_default():
        features, inputs_dict = serving_input_fn()
        prediction_dict = model.model_fn(
            model.PREDICT,
            features.copy(),
            None,  # labels
            hidden_units=hidden_units,
            learning_rate=None  # learning_rate unused in prediction mode
        )
        saver = tf.train.Saver()

        inputs_info = {
            name: tf.saved_model.utils.build_tensor_info(tensor)
            for name, tensor in six.iteritems(inputs_dict)
        }
        output_info = {
            name: tf.saved_model.utils.build_tensor_info(tensor)
            for name, tensor in six.iteritems(prediction_dict)
        }
        signature_def = tf.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs_info,
            outputs=output_info,
            method_name=sig_constants.PREDICT_METHOD_NAME)

    with tf.Session(graph=prediction_graph) as session:
        session.run(
            [tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(session, latest)
        exporter.add_meta_graph_and_variables(
            session,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                sig_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def
            },
            legacy_init_op=main_op())

    exporter.save()
def train_and_evaluate(args):
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not args.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=args.checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(args.eval_frequency, args.eval_files,
                                args.learning_rate, args.job_dir)

    # Tensorboard logs callback.
    tb_log = TensorBoard(
        log_dir=os.path.join(args.job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tb_log]

    census_model.fit_generator(
        model.generator_input(args.train_files, chunk_size=CHUNK_SIZE),
        steps_per_epoch=args.train_steps,
        epochs=args.num_epochs,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if args.job_dir.startswith('gs://'):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(args.job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(args.job_dir, CENSUS_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(census_model, os.path.join(args.job_dir, 'export'))
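# The ContinuousEval callback used above is defined elsewhere in the trainer
# module. A hedged sketch of how these samples typically implement it: every
# `eval_frequency` epochs, reload the newest local checkpoint, recompile it,
# and evaluate it against the eval files. The checkpoint glob pattern and the
# use of model.compile_model / model.generator_input are assumptions based on
# the surrounding code, not a verbatim copy:
import glob

from keras.callbacks import Callback
from keras.models import load_model


class ContinuousEval(Callback):

    def __init__(self, eval_frequency, eval_files, learning_rate, job_dir,
                 steps=1000):
        self.eval_frequency = eval_frequency
        self.eval_files = eval_files
        self.learning_rate = learning_rate
        self.job_dir = job_dir
        self.steps = steps

    def on_epoch_begin(self, epoch, logs=None):
        if epoch > 0 and epoch % self.eval_frequency == 0:
            # h5py cannot read from GCS either, so checkpoints are local.
            model_path_glob = 'checkpoint.*'
            if not self.job_dir.startswith('gs://'):
                model_path_glob = os.path.join(self.job_dir, model_path_glob)
            checkpoints = sorted(glob.glob(model_path_glob))
            if checkpoints:
                census_model = load_model(checkpoints[-1])
                census_model = model.compile_model(census_model,
                                                   self.learning_rate)
                metrics = census_model.evaluate_generator(
                    model.generator_input(self.eval_files,
                                          chunk_size=CHUNK_SIZE),
                    steps=self.steps)
                print('Evaluation epoch [{}] metrics {}'.format(
                    epoch, metrics))
                if self.job_dir.startswith('gs://'):
                    copy_file_to_gcs(self.job_dir, checkpoints[-1])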
def dispatch(train_files, eval_files, job_dir, learning_rate, eval_frequency,
             num_epochs, checkpoint_epochs):
    # Set the seed for reproducibility.
    np.random.seed(13)

    forecast_model = model.model_fn()
    scaler = model.build_scaler(train_files + eval_files)

    try:
        os.makedirs(job_dir)
    except Exception as e:
        print(e)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_PATH
    if not job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path, verbose=1, period=checkpoint_epochs)

    # Continuous eval callback.
    with ContinuousEval(eval_frequency, eval_files, learning_rate, job_dir,
                        scaler) as evaluation:
        # Tensorboard logs callback.
        tblog = keras.callbacks.TensorBoard(
            log_dir=os.path.join(job_dir, 'logs'),
            histogram_freq=0,
            write_graph=True,
            embeddings_freq=0)

        callbacks = [checkpoint, evaluation, tblog]

        x, y = model.load_features(train_files, scaler)
        forecast_model.fit(x, y, epochs=num_epochs, callbacks=callbacks)

        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them over to GCS.
        if job_dir.startswith('gs://'):
            forecast_model.save(MODEL_FILENAME)
            copy_file_to_gcs(job_dir, MODEL_FILENAME)
        else:
            forecast_model.save(os.path.join(job_dir, MODEL_FILENAME))
def train_evaluate(model_name, hidden_units, train_file, valid_file,
                   ckpt_folder, optimizer, batch_size, max_steps, lr,
                   eval_steps):
    estimator = model_fn(model_name, hidden_units, ckpt_folder, optimizer, lr)

    train_input_fn = lambda: input_fn(
        file=train_file, batch_size=batch_size, train=True)
    valid_input_fn = lambda: input_fn(
        file=valid_file, batch_size=batch_size, train=False)

    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn, max_steps=max_steps)

    # Export the final model for serving.
    export_latest = tf.estimator.FinalExporter('bclassifier',
                                               serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=valid_input_fn, steps=eval_steps, exporters=export_latest)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
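# The serving_input_fn passed to FinalExporter above is not shown. A minimal
# sketch of one, assuming a single dense float feature named 'x' with 10
# columns; the feature name and shape are placeholders, not this project's
# actual schema:
import tensorflow as tf


def serving_input_fn():
    # Raw placeholder input; the same dict is used as both features and
    # receiver tensors since no preprocessing is applied at serving time.
    features = {'x': tf.placeholder(tf.float32, shape=[None, 10], name='x')}
    return tf.estimator.export.ServingInputReceiver(features, features)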
def train_and_evaluate(args):
    logic_nn_model = model.model_fn(**vars(args))

    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not args.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=args.checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(args.eval_frequency, args.learning_rate,
                                args.job_dir, args.eval_steps)

    # Tensorboard logs callback.
    tb_log = TensorBoard(
        log_dir=os.path.join(args.job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tb_log]

    logic_nn_model.fit_generator(
        model.generator_input(),
        steps_per_epoch=args.train_steps,
        epochs=args.num_epochs,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if args.job_dir.startswith('gs://'):
        logic_nn_model.save(LOGICAL_NN_MODEL)
        copy_file_to_gcs(args.job_dir, LOGICAL_NN_MODEL)
    else:
        logic_nn_model.save(os.path.join(args.job_dir, LOGICAL_NN_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_saved_model(logic_nn_model, os.path.join(args.job_dir, 'export'))
    print('...Finished actions for build and train with model...')
def train_and_evaluate(hparams):
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    try:
        os.makedirs(hparams.job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not hparams.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(hparams.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=hparams.checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(hparams.eval_frequency, hparams.eval_files,
                                hparams.learning_rate, hparams.job_dir)

    # Tensorboard logs callback.
    tb_log = TensorBoard(
        log_dir=os.path.join(hparams.job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tb_log]

    census_model.fit_generator(
        model.generator_input(hparams.train_files, chunk_size=CHUNK_SIZE),
        steps_per_epoch=hparams.train_steps,
        epochs=hparams.num_epochs,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if hparams.job_dir.startswith('gs://'):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(hparams.job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(hparams.job_dir, CENSUS_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(census_model, os.path.join(hparams.job_dir, 'export'))
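# model.to_savedmodel converts the trained Keras model into a TensorFlow
# SavedModel for serving. A sketch of how these samples typically implement
# it; the 'input'/'income' signature names are assumptions, not necessarily
# this project's tensor names:
import tensorflow as tf
from keras import backend as K


def to_savedmodel(model, export_path):
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)
    # Build a predict signature from the Keras model's input/output tensors.
    signature = tf.saved_model.signature_def_utils.predict_signature_def(
        inputs={'input': model.inputs[0]},
        outputs={'income': model.outputs[0]})
    with K.get_session() as sess:
        builder.add_meta_graph_and_variables(
            sess=sess,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                tf.saved_model.signature_constants
                .DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature
            })
        builder.save()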
def dispatch(data_file, job_dir, num_epochs):
    job_dir = create_job_dir(job_dir)

    nb_chars, embedding_matrix, x_train, y_train, x_val, y_val = \
        model.get_training_data(data_file, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH,
                                VALIDATION_SPLIT, EMBEDDING_FILE_GCS)
    my_model = model.model_fn(nb_chars, embedding_matrix)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        mode='min')

    timestamp = str(time.time())  # currently unused

    # Tensorboard logs callback.
    tblog = keras.callbacks.TensorBoard(
        log_dir=os.path.join(job_dir, 'logs'),
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, tblog]

    my_model = model.compile_model(my_model)
    my_model.fit(x_train, y_train,
                 validation_data=(x_val, y_val),
                 epochs=num_epochs,
                 batch_size=128,
                 callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if job_dir.startswith('gs://'):
        my_model.save(MY_MODEL_NAME)
        copy_file_to_gcs(job_dir, MY_MODEL_NAME)
    else:
        my_model.save(os.path.join(job_dir, MY_MODEL_NAME))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(my_model, os.path.join(job_dir, 'export'))
def train_and_evaluate(args):
    # Showcasing hyperparameter tuning here:
    # the first layer size is the parameter being tuned in this example.
    hidden_units = [args.first_layer_size, 70, 50, 20]
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE, hidden_units)

    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not args.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        save_best_only=False,
        period=args.checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(args.eval_frequency, args.eval_files,
                                args.learning_rate, args.job_dir)

    # Tensorboard logs callback.
    tb_log = TensorBoard(
        log_dir=os.path.join(args.job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tb_log]

    census_model.fit_generator(
        model.generator_input(args.train_files, chunk_size=CHUNK_SIZE),
        steps_per_epoch=args.train_steps,
        epochs=args.num_epochs,
        use_multiprocessing=args.distributed,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if args.job_dir.startswith('gs://'):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(args.job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(args.job_dir, CENSUS_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(census_model, os.path.join(args.job_dir, 'export'))

    # The following is for hyperparameter tuning and is adapted from:
    # https://cloud.google.com/ml-engine/docs/tensorflow/using-hyperparameter-tuning
    # Note: last_loss_val is updated after each checkpoint, but the summary
    # is only written once.
    summary = Summary(value=[
        Summary.Value(tag='val_loss', simple_value=evaluation.last_loss_val)
    ])

    # More hypertune info:
    # https://cloud.google.com/solutions/machine-learning/recommendation-system-tensorflow-train-cloud-ml-engine
    job_dir = args.job_dir
    if args.hypertune:
        # If tuning, join the trial number to the output path.
        trial = json.loads(os.environ.get('TF_CONFIG', '{}')).get(
            'task', {}).get('trial', '')
        output_dir = os.path.join(job_dir, trial)
    else:
        output_dir = job_dir

    eval_path = os.path.join(output_dir, 'val_loss')
    summary_writer = tf.summary.FileWriter(eval_path)

    # Note: adding the summary to the writer is enough for hyperparameter
    # tuning. ML Engine looks for any summary added with the hyperparameter
    # metric tag.
    summary_writer.add_summary(summary)
    summary_writer.flush()
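# model.generator_input streams training rows so that fit_generator never
# holds the full CSV in memory. A hedged sketch, assuming pandas-chunked CSV
# reading against module-level CSV_COLUMNS / LABEL_COLUMN constants; the
# preprocessing step is simplified compared to the real module:
import pandas as pd
import tensorflow as tf


def generator_input(input_files, chunk_size):
    while True:  # fit_generator expects an endless generator
        for input_file in input_files:
            reader = pd.read_csv(tf.gfile.Open(input_file),
                                 names=CSV_COLUMNS,
                                 chunksize=chunk_size)
            for chunk in reader:
                chunk = chunk.dropna()
                # One-hot encode the label column, keep the rest as features.
                label = pd.get_dummies(chunk.pop(LABEL_COLUMN))
                yield chunk.values, label.values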
def run(target, cluster_spec, is_chief, hparams):
    """Runs the training and evaluation graph.

    Args:
        target (str): Tensorflow server target.
        cluster_spec: (cluster spec) Cluster specification.
        is_chief (bool): Boolean flag to specify a chief server.
        hparams (tf.hparams): Input arguments.
    """
    # Calculate the number of hidden units.
    hidden_units = [
        max(2, int(hparams.first_layer_size * hparams.scale_factor**i))
        for i in range(hparams.num_layers)
    ]

    # If the server is chief, which is `master`:
    # in between-graph replication, the chief is the one node in the cluster
    # with extra responsibility, and by default it is worker task zero. We
    # have assigned master as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and the motivation for a chief.
    if is_chief:
        tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # Features and label tensors.
            features, labels = model.input_fn(
                hparams.eval_files,
                num_epochs=None if hparams.eval_steps else 1,
                batch_size=hparams.eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics:
            # model.model_fn returns the metric dict in EVAL mode.
            metric_dict = model.model_fn(
                model.EVAL,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=hparams.learning_rate)

        hooks = [
            EvaluationRunHook(
                hparams.job_dir,
                metric_dict,
                evaluation_graph,
                hparams.eval_frequency,
                eval_steps=hparams.eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default.
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter,
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            # Features and label tensors as read using filename queue.
            features, labels = model.input_fn(
                hparams.train_files,
                num_epochs=hparams.num_epochs,
                batch_size=hparams.train_batch_size)

            # Returns the training graph and global step tensor.
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=hparams.learning_rate)

        # Creates a MonitoredSession for training.
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks.
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=hparams.job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of the global number of steps,
            # particularly in a distributed setting.
            step = global_step_tensor.eval(session=session)

            # Run the training graph, which returns the step number as
            # tracked by the global step tensor.
            # When train epochs is reached, session.should_stop() will be
            # true.
            while (hparams.train_steps is None or
                   step < hparams.train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

    # Find the filename of the latest saved checkpoint file.
    latest_checkpoint = tf.train.latest_checkpoint(hparams.job_dir)

    # Only perform this if chief.
    if is_chief:
        build_and_run_exports(
            latest_checkpoint, hparams.job_dir,
            model.SERVING_INPUT_FUNCTIONS[hparams.export_format],
            hidden_units)
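# Worked example of the geometric layer-size decay above: with illustrative
# values first_layer_size=100, scale_factor=0.7, num_layers=4,
#   [max(2, int(100 * 0.7**i)) for i in range(4)]  ->  [100, 70, 48, 34]
# (0.7**2 is 0.48999... in floating point, hence 48 rather than 49), and
# max(2, ...) keeps deep, strongly decayed networks from shrinking below
# two units per layer.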
def train_and_evaluate(args):
    CLASS_SIZE = len(bins) + 1
    # hidden_units = [int(units) for units in args.hidden_units.split(',')]

    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_FILE_PATH
    if not args.job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(args.job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=args.checkpoint_epochs,
        mode='min')

    tb_log = TensorBoard(
        log_dir=os.path.join(args.job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, tb_log]

    sequential_train = [
        int(float(hour)) for hour in args.sequential_train.split(',')
    ]
    seq_id = 0
    sequential_models = []
    weights_all = []

    # The per-class metrics are identical for every compile call below.
    metrics = [
        initial_model.first_class_accuracy,
        initial_model.other_class_accuracy,
        initial_model.single_class_accuracy(1),
        initial_model.single_class_accuracy(2),
        initial_model.single_class_accuracy(3),
        initial_model.single_class_accuracy(4),
        initial_model.single_class_accuracy(5),
        initial_model.single_class_accuracy(6),
        initial_model.single_class_accuracy(7),
        'accuracy'
    ]

    # Collect training histories across all stages. (Initialising this inside
    # the loop, as the original did, would keep only the last stage.)
    history_all = []

    for hours in sequential_train:
        if seq_id == 0:
            hidden_units = args.hidden_units
            learning_rate = args.learning_rate
            INPUT_DIM = hours * ONE_HOUR

            # ---------- fully connected model ----------
            first_model = initial_model.model_fn(INPUT_DIM, CLASS_SIZE,
                                                 hidden_units, 0.0001)
            assign_w = 0.03
            first_model.compile(
                loss=initial_model.weighted_loss(assign_w),
                optimizer=keras.optimizers.Adam(lr=0.0001),
                metrics=metrics)

            train_file_names = os.path.join(args.train_files,
                                            str(hours) + 'hrs', 'train',
                                            '*npz')
            eval_file_names = os.path.join(args.eval_files,
                                           str(hours) + 'hrs', 'eval', '*npz')
            print('\n\ntraining ' + str(hours) + 'hrs!\n\n')
            history_all.append(
                first_model.fit_generator(
                    initial_model.generator_input(train_file_names,
                                                  args.train_batch_size),
                    validation_data=initial_model.generator_input(
                        eval_file_names, args.eval_batch_size),
                    steps_per_epoch=args.train_steps,
                    validation_steps=args.eval_steps,
                    epochs=args.num_epochs,
                    callbacks=callbacks))

            weights = first_model.get_weights()
            weights_all.append(weights)
            with open(
                    os.path.join(args.job_dir, 'weights',
                                 str(hours) + 'hrs_weights'), 'wb') as fp:
                pickle.dump(weights, fp)

            DISK_MODEL = 'disk_model.hdf5'
            if args.job_dir.startswith('gs://'):
                first_model.save(DISK_MODEL)
                copy_file_to_gcs(args.job_dir, DISK_MODEL)
            else:
                first_model.save(os.path.join(args.job_dir, DISK_MODEL))

            data, label = initial_model.generator_input_once(
                str(args.train_files) + str(hours) + 'hrs/train/input_' +
                str(hours) + 'hrs_8.npz', 3)

            # Recompile with a much smaller class weight and fine-tune.
            first_model.compile(
                loss=initial_model.weighted_loss(0.00081),
                optimizer=keras.optimizers.Adam(lr=0.0001),
                metrics=metrics)
            first_model.fit_generator(
                initial_model.generator_input(train_file_names,
                                              args.train_batch_size),
                validation_data=initial_model.generator_input(
                    eval_file_names, args.eval_batch_size),
                steps_per_epoch=args.train_steps,
                validation_steps=args.eval_steps,
                epochs=50,
                callbacks=callbacks)

            scores = first_model.evaluate(
                x=data,
                y=label,
                batch_size=None,
                verbose=1,
                sample_weight=None,
                steps=1)
            print('\ntest ' + str(hours) + 'hrs after train\n')
            print(scores)
            seq_id = seq_id + 1
        else:
            with open(
                    os.path.join(
                        args.job_dir, 'weights',
                        str(sequential_train[seq_id - 1]) + 'hrs_weights'),
                    'rb') as fp:
                weights_0 = pickle.load(fp)

            # sequential(weights, CONCAT_UNIT_SIZE, INPUT_SHAPE, learning_rate)
            hours = sequential_train[seq_id]
            seq = Sequential(weights_0, args.CONCAT_UNIT_SIZE,
                             hours * ONE_HOUR, 'zeros')
            model = seq.build_sequential_model()
            # assign_w = 0.016 + 0.005 * seq_id
            assign_w = 0.03
            model.compile(
                loss=initial_model.weighted_loss(assign_w),
                optimizer=keras.optimizers.Adam(lr=0.0001),
                metrics=metrics)

            data, label = initial_model.generator_input_once(
                str(args.train_files) + str(hours) + 'hrs/train/input_' +
                str(hours) + 'hrs_8.npz', 6)
            scores = model.evaluate(
                x=data,
                y=label,
                batch_size=None,
                verbose=1,
                sample_weight=None,
                steps=1)
            print('\ntest ' + str(hours) + 'hrs before train\n')
            print(scores)

            # data, label = initial_model.generator_input_once(
            #     '/Volumes/TOSHIBA EXT/train_input/24hrs/train/input_24hrs_8.npz', 24)
            # scores = model.evaluate(x=data, y=label, batch_size=None,
            #                         verbose=1, sample_weight=None, steps=1)
            # print(scores)

            # ---------- sequential model ----------
            train_file_names = os.path.join(str(args.train_files),
                                            str(hours) + 'hrs', 'train',
                                            '*npz')
            eval_file_names = os.path.join(args.eval_files,
                                           str(hours) + 'hrs', 'eval', '*npz')
            print('\n\ntraining ' + str(hours) + 'hrs!\n\n')
            history_all.append(
                model.fit_generator(
                    initial_model.generator_input(train_file_names,
                                                  args.train_batch_size),
                    validation_data=initial_model.generator_input(
                        eval_file_names, args.eval_batch_size),
                    steps_per_epoch=args.train_steps,
                    validation_steps=args.eval_steps,
                    epochs=args.num_epochs,
                    callbacks=callbacks))

            weights = model.get_weights()
            weights_all.append(weights)

            # Merge this stage's weight arrays (four per layer: two kernels,
            # two biases) pairwise so the next stage can be seeded from them.
            weights_0 = []
            for i in range(int(len(weights) / 4)):
                if i == int(len(weights) / 4) - 1:
                    weights_0.extend([
                        np.concatenate((weights[i * 4 + 2], weights[i * 4]),
                                       axis=0),
                        (weights[i * 4 + 1] + weights[i * 4 + 3])
                    ])
                elif i == int(len(weights) / 4) - 2:
                    weights_0.extend([
                        np.concatenate((weights[i * 4 + 2], weights[i * 4]),
                                       axis=1),
                        np.concatenate(
                            (weights[i * 4 + 3], weights[i * 4 + 1]))
                    ])
                else:
                    weights_0.extend([
                        np.concatenate((weights[i * 4], weights[i * 4 + 2]),
                                       axis=1),
                        np.concatenate(
                            (weights[i * 4 + 1], weights[i * 4 + 3]))
                    ])
            # weights_0 = [np.concatenate((weights[0], weights[2]), axis=1),
            #              np.concatenate((weights[1], weights[3])),
            #              np.concatenate((weights[4], weights[6]), axis=1),
            #              np.concatenate((weights[5], weights[7])),
            #              np.concatenate((weights[8], weights[10]), axis=0),
            #              weights[9] + weights[11]]

            data, label = initial_model.generator_input_once(
                str(args.train_files) + str(hours) + 'hrs/train/input_' +
                str(hours) + 'hrs_8.npz', 6)
            scores = model.evaluate(
                x=data,
                y=label,
                batch_size=None,
                verbose=1,
                sample_weight=None,
                steps=1)
            print('\ntest ' + str(hours) + 'hrs after train\n')
            print(scores)

            with open(
                    os.path.join(str(args.job_dir), 'weights',
                                 str(hours) + 'hrs_weights'), 'wb') as fp:
                pickle.dump(weights_0, fp)

            DISK_MODEL = 'disk_model' + str(hours) + '.hdf5'
            if args.job_dir.startswith('gs://'):
                model.save(DISK_MODEL)
                copy_file_to_gcs(args.job_dir, DISK_MODEL)
            else:
                model.save(os.path.join(args.job_dir, DISK_MODEL))
            seq_id = seq_id + 1

    with open(os.path.join(str(args.job_dir), 'history_all'), 'wb') as fp:
        pickle.dump(history_all, fp)
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             train_batch_size, eval_batch_size, learning_rate, eval_frequency,
             eval_num_epochs, num_epochs, checkpoint_epochs,
             image_input_prefix, debug_mode):
    census_model = model.model_fn()

    try:
        os.makedirs(job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Shuffle the metadata rows.
    meta_data = get_meta(train_files)
    indexes = [i for i in range(len(meta_data))]
    random.shuffle(indexes)
    meta_data = meta_data.loc[indexes].reset_index(drop=True)

    # Model checkpoint callback. Note: when monitoring a loss, mode must be
    # 'min' (the original 'max' would keep the worst checkpoint).
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(eval_frequency, meta_data, image_input_prefix,
                                eval_files, learning_rate, job_dir,
                                debug_mode)

    # Tensorboard logs callback.
    tblog = keras.callbacks.TensorBoard(
        log_dir=os.path.join(job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tblog]

    train_data_sequence = DataSequence(image_input_prefix,
                                       train_files,
                                       debug_mode,
                                       meta_data,
                                       batch_size=train_batch_size,
                                       data_type='train')

    census_model.fit_generator(
        # model.generator_input(train_files, chunk_size=CHUNK_SIZE),
        train_data_sequence,
        steps_per_epoch=train_data_sequence.length,
        epochs=num_epochs,
        callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if job_dir.startswith('gs://'):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(job_dir, CENSUS_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(census_model, os.path.join(job_dir, 'export'))
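# DataSequence above is a keras.utils.Sequence subclass defined elsewhere in
# the package. A minimal sketch of the Sequence contract it has to satisfy
# (__len__ gives batches per epoch, __getitem__ returns one batch); the
# in-memory arrays here are a stand-in for the sample's image-loading logic:
import numpy as np
from keras.utils import Sequence


class ArraySequence(Sequence):

    def __init__(self, x, y, batch_size):
        self.x, self.y, self.batch_size = x, y, batch_size

    def __len__(self):
        # Number of batches per epoch.
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        batch = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[batch], self.y[batch]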
def dispatch(multi_gpu, train_files, eval_files, job_dir, train_steps,
             train_batch_size, num_epochs, learning_rate, stddev, eval_steps,
             eval_batch_size, eval_num_epochs, eval_frequency,
             checkpoint_epochs, gpus, workers, verbose):
    """Main training method."""
    # Random seed.
    random.seed(42)

    # Load encoder.
    encoder = Encoder(alphabet=ALPHABET,
                      maxlen=MAXLEN,
                      num_classes=NUM_CLASSES,
                      clear_accents=CLEAR_ACCENTS)

    # Prepare data generator sequences.
    train_sequence = DataSequence(
        input_file=train_files,
        label_column=LABEL_COLUMN,
        data_columns=DATA_COLUMNS,
        encoder=encoder,
        backwards=REVERSE_ENCODING,
        batch_size=train_batch_size,
        # workaround because sequence.__len__ overrides the fit_generator arg
        steps_per_epoch=train_steps,
        shuffle=SHUFFLE)
    eval_sequence = DataSequence(
        input_file=eval_files,
        label_column=LABEL_COLUMN,
        data_columns=DATA_COLUMNS,
        encoder=encoder,
        backwards=REVERSE_ENCODING,
        batch_size=eval_batch_size,
        # workaround because sequence.__len__ overrides the fit_generator arg
        steps_per_epoch=eval_steps,
        shuffle=SHUFFLE)

    # Prepare log directories.
    job_dir += '/' + time.strftime('%Y%m%d-%H%M%S')
    try:
        os.makedirs(job_dir)
    except OSError:
        print("ERROR: Directory 'job-dir' could not be created.")

    # Workaround because h5py cannot write to GCS:
    # save to the local filesystem, then copy over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Learning rate scheduler callback -- unused for the moment.
    cb_learning_rate_scheduler = LearningRateScheduler(learning_rate_scheduler)

    # Detached model checkpoint callback to snapshot multi-GPU models.
    detached_checkpoint = ModelCheckpointDetached(checkpoint_path,
                                                  monitor='acc',
                                                  verbose=1,
                                                  period=checkpoint_epochs,
                                                  mode='max')

    # Continuous eval callback: evaluate and copy checkpoints to GCS.
    evaluation = ContinuousEval(
        eval_frequency=eval_frequency,
        eval_sequence=eval_sequence,
        # eval_generator=eval_generator,
        learning_rate=learning_rate,
        momentum=MOMENTUM,
        job_dir=job_dir,
        steps=eval_steps)

    # Tensorboard logs callback.
    tblog = TensorBoard(log_dir=os.path.join(job_dir, 'tb-logs'),
                        histogram_freq=0,
                        write_graph=True,
                        embeddings_freq=0)

    callbacks = [
        # cb_learning_rate_scheduler,
        detached_checkpoint,
        evaluation,
        tblog,
    ]

    # Load model.
    with tf.device('/cpu:0'):
        conv_model = model.model_fn(
            maxlen=MAXLEN,
            vocab_size=encoder.vocab_size,
            conv_filters=CONV_FILTERS_SMALL,
            conv_kernels=CONV_KERNELS,
            # conv_padding=conv_padding,
            # conv_activation=conv_activation,
            max_pooling=MAX_POOLING,
            dense_output_units=DENSE_OUTPUT_UNITS_SMALL,
            # dense_activation=dense_activation,
            dropout_probs=DROPOUT_PROBS,
            output_cats=NUM_CLASSES,
            # output_activation=output_activation,
            # optimizer=optimizer,
            learning_rate=learning_rate,
            momentum=MOMENTUM,
            stddev=stddev,
            # loss=loss,
            # metrics=metrics
        )

    if multi_gpu:
        # Replicate the model on multiple GPUs.
        parallel_model = multi_gpu_model(conv_model, gpus=gpus)
        parallel_model.compile(loss='categorical_crossentropy',
                               optimizer=SGD(lr=learning_rate,
                                             momentum=MOMENTUM),
                               metrics=['categorical_accuracy'])

    with tf.device('/cpu:0'):
        # Compile the local (single-device) model.
        conv_model.compile(loss='categorical_crossentropy',
                           optimizer=SGD(lr=learning_rate, momentum=MOMENTUM),
                           metrics=['categorical_accuracy'])
    conv_model.summary()

    if multi_gpu:
        parallel_model.fit_generator(
            callbacks=callbacks,
            # generator=train_generator,
            generator=train_sequence,
            steps_per_epoch=train_steps,
            epochs=num_epochs,
            workers=workers,
            # verbose: 0 = silent, 1 = progress bar, 2 = one line per epoch
            verbose=verbose)
        # Copy the trained weights back to the single-device model.
        conv_model.set_weights(parallel_model.get_weights())
    else:
        conv_model.fit_generator(callbacks=callbacks,
                                 generator=train_sequence,
                                 steps_per_epoch=train_steps,
                                 epochs=num_epochs,
                                 workers=workers,
                                 verbose=verbose)

    # Workaround because h5py cannot write to GCS:
    # save to the local filesystem, then copy over to GCS.
    if job_dir.startswith('gs://'):
        conv_model.save(CONV_MODEL)
        copy_file_to_gcs(job_dir, CONV_MODEL)
    else:
        conv_model.save(os.path.join(job_dir, CONV_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(conv_model, os.path.join(job_dir, 'export'))
def train_and_evaluate(args):
    # Create the training datasets if requested.
    if args.create_data:
        import trainer.create_data_func as create_data_func
        logging.info('Begin creating datasets')
        for data_part in ['train', 'val', 'test']:
            create_data_func.create_data_func(data_part, args.project_id,
                                              args.bucket_name,
                                              args.dataset_id)
        logging.info('End creating datasets')

    # Create a config file and store the project id there so that model.py
    # can read it.
    with open('config.py', 'w') as f:
        f.write('PROJECT_ID="{}"\n'.format(args.project_id))
        f.write('BUCKET_NAME="{}"\n'.format(args.bucket_name))

    # Import after the datasets are created, as they are referenced
    # immediately when this module is initiated.
    import trainer.model as model

    # If new datasets are created, the scaler also needs to be recreated.
    if args.create_data:
        import trainer.create_scaler_func as create_scaler_func
        logging.info('Begin fitting scaler')
        create_scaler_func.create_scaler_func(args.train_files,
                                              model.CSV_COLUMNS,
                                              model.LABEL_COLUMN,
                                              args.bucket_name,
                                              args.project_id)
        logging.info('End fitting scalers')

    # Download the scaler.
    if not path.exists('x_scaler'):
        logging.info('Downloading scaler')
        storage_client = storage.Client(project=args.project_id)
        bucket = storage_client.get_bucket(args.bucket_name)
        blob = bucket.blob('scalers/x_scaler')
        blob.download_to_filename('x_scaler')
        logging.info('Downloaded scaler')
    x_scaler = joblib.load('x_scaler')

    # Build the model.
    census_model = model.model_fn(
        learning_rate=args.learning_rate,
        num_deep_layers=args.num_deep_layers,
        first_deep_layer_size=args.first_deep_layer_size,
        first_wide_layer_size=args.first_wide_layer_size,
        wide_scale_factor=args.wide_scale_factor,
        dropout_rate=args.dropout_rate)
    logging.info(census_model.summary())

    try:
        os.makedirs(args.job_dir)
    except OSError:
        pass  # The directory already exists.

    checkpoint_path = os.path.join(args.job_dir, CHECKPOINT_FILE_PATH)

    # Model checkpoint callback.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_mse',  # 'mean_squared_error'
        verbose=1,
        period=args.checkpoint_epochs,
        save_best_only=True,
        mode='min')

    # Early stopping callback.
    early_stop = EarlyStopping(monitor='val_mse',  # 'mean_squared_error'
                               patience=10)

    # Tensorboard logs callback.
    tb_log = TensorBoard(
        log_dir=os.path.join(args.job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, early_stop, tb_log]

    # Fit the model on the training set.
    census_model.fit_generator(
        generator=model.generator_input(args.train_files,
                                        chunk_size=CHUNK_SIZE,
                                        project_id=args.project_id,
                                        bucket_name=args.bucket_name,
                                        x_scaler=x_scaler),
        steps_per_epoch=args.train_steps,
        epochs=args.num_epochs,
        callbacks=callbacks,
        validation_data=model.generator_input(args.eval_files,
                                              chunk_size=CHUNK_SIZE,
                                              project_id=args.project_id,
                                              bucket_name=args.bucket_name,
                                              x_scaler=x_scaler),
        validation_steps=args.eval_steps)

    # Evaluate the model on the test set.
    loss, mae, mse = census_model.evaluate_generator(
        model.generator_input(args.test_files,
                              chunk_size=CHUNK_SIZE,
                              project_id=args.project_id,
                              bucket_name=args.bucket_name,
                              x_scaler=x_scaler),
        steps=args.test_steps)
    logging.info('\nTest evaluation metrics[{:.2f}, {:.2f}, {:.2f}] {}'.format(
        loss, mae, mse, census_model.metrics_names))

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if args.job_dir.startswith('gs://'):
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(args.job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(args.job_dir, CENSUS_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(census_model, os.path.join(args.job_dir, 'export'))
def run(target, cluster_spec, is_chief, args):
    """Runs the training and evaluation graph.

    Args:
        target (str): Tensorflow server target.
        cluster_spec: (cluster spec) Cluster specification.
        is_chief (bool): Boolean flag to specify a chief server.
        args (args): Input arguments.
    """
    # Calculate the number of hidden units.
    hidden_units = [
        max(2, int(args.first_layer_size * args.scale_factor**i))
        for i in range(args.num_layers)
    ]

    # If the server is chief, which is `master`:
    # in between-graph replication, the chief is the one node in the cluster
    # with extra responsibility, and by default it is worker task zero. We
    # have assigned master as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and the motivation for a chief.
    if is_chief:
        tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # Features and label tensors.
            features, labels = model.input_fn(
                args.eval_files,
                num_epochs=None if args.eval_steps else 1,
                batch_size=args.eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics:
            # model.model_fn returns the metric dict in EVAL mode.
            metric_dict = model.model_fn(
                model.EVAL,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=args.learning_rate)

        hooks = [
            EvaluationRunHook(
                args.job_dir,
                metric_dict,
                evaluation_graph,
                args.eval_frequency,
                eval_steps=args.eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default.
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter,
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            # Features and label tensors as read using filename queue.
            features, labels = model.input_fn(
                args.train_files,
                num_epochs=args.num_epochs,
                batch_size=args.train_batch_size)

            # Returns the training graph and global step tensor.
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=args.learning_rate)

        # Creates a MonitoredSession for training.
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks.
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=args.job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of the global number of steps,
            # particularly in a distributed setting.
            step = global_step_tensor.eval(session=session)

            # Run the training graph, which returns the step number as
            # tracked by the global step tensor.
            # When train epochs is reached, session.should_stop() will be
            # true.
            while (args.train_steps is None or
                   step < args.train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

    # Find the filename of the latest saved checkpoint file.
    latest_checkpoint = tf.train.latest_checkpoint(args.job_dir)

    # Only perform this if chief.
    if is_chief:
        build_and_run_exports(
            latest_checkpoint, args.job_dir,
            model.SERVING_INPUT_FUNCTIONS[args.export_format],
            hidden_units)
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             learning_rate, eval_frequency, num_epochs, checkpoint_epochs,
             gpus):
    # With several GPUs, two models are used: one for training and one for
    # saving. The save model is assigned to the CPU; the training model runs
    # on the GPUs and is generated with multi_gpu_model.
    if gpus <= 1:
        model_train = model.model_fn(NUM_CHARS, window_size=WINDOWS_SIZE)
        model_save = model_train
    else:
        with tf.device('/cpu:0'):
            model_save = model.model_fn(NUM_CHARS, window_size=WINDOWS_SIZE)
        model_train = multi_gpu_model(model_save, gpus=gpus)

    model.compile_model(model_save, learning_rate)
    print(model_save.summary())
    model.compile_model(model_train, learning_rate)
    print(model_train.summary())

    try:
        os.makedirs(job_dir)
    except OSError:
        pass  # The directory already exists.

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem,
    # then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback. Note: when monitoring a loss, mode must be
    # 'min' (the original 'max' would keep the worst checkpoint).
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=0,
        period=checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    evaluation = ContinuousEval(eval_frequency, eval_files, learning_rate,
                                job_dir, steps=eval_steps)

    # Tensorboard logs callback.
    tblog = keras.callbacks.TensorBoard(
        log_dir=os.path.join(job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tblog]

    x, y = model.get_array_x_y(train_files, train_steps, WINDOWS_SIZE,
                               NUM_CHARS)
    model_train.fit(x, y, epochs=num_epochs, callbacks=callbacks,
                    batch_size=500)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem,
    # then copy them over to GCS.
    if job_dir.startswith('gs://'):
        model_save.save(BEIRAS_MODEL)
        copy_file_to_gcs(job_dir, BEIRAS_MODEL)
    else:
        model_save.save(os.path.join(job_dir, BEIRAS_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(model_save, os.path.join(job_dir, 'export'))
def dispatch(train_files, eval_files, job_dir, learning_rate, eval_frequency,
             num_epochs, checkpoint_epochs):
    # Set the seed for reproducibility.
    np.random.seed(13)

    # Get all data and build the label encoders and one-hot encoder.
    full_dataset = model.get_all_data(train_files + eval_files)

    # Convert values in categorical columns to numerical 0..n.
    labelencoder_DayOfWeek = model.build_labelencoder('DayOfWeek',
                                                      full_dataset)
    labelencoder_StoreType = model.build_labelencoder('StoreType',
                                                      full_dataset)
    labelencoder_Assortment = model.build_labelencoder('Assortment',
                                                       full_dataset)

    # NOTE: apply the label encoders before building the one-hot encoder.
    model.apply_labelencoder('DayOfWeek', labelencoder_DayOfWeek, full_dataset)
    model.apply_labelencoder('StoreType', labelencoder_StoreType, full_dataset)
    model.apply_labelencoder('Assortment', labelencoder_Assortment,
                             full_dataset)

    # DayOfWeek should be treated as categorical data, not as numerical.
    onehotencoder = model.build_onehotencoder(
        ['DayOfWeek', 'StoreType', 'Assortment'], full_dataset)
    # onehotencoder_DayOfWeek = model.build_onehotencoder_DayOfWeek(full_dataset)
    full_dataset = model.getOneHotEncodedData(onehotencoder, full_dataset)

    # NOTE: must be called after applying the Label- and OneHot- encoders.
    scaler = model.build_scaler(full_dataset)

    # Finally we can create our model.
    input_data_shape = model.get_input_shape(full_dataset)
    forecast_model = model.model_fn(input_data_shape)

    try:
        os.makedirs(job_dir)
    except Exception as e:
        print(e)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = CHECKPOINT_PATH
    if not job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path, verbose=1, period=checkpoint_epochs)

    # Continuous eval callback.
    with ContinuousEval(eval_frequency, eval_files, learning_rate, job_dir,
                        scaler, labelencoder_DayOfWeek, labelencoder_StoreType,
                        labelencoder_Assortment, onehotencoder) as evaluation:
        # Tensorboard logs callback.
        tblog = keras.callbacks.TensorBoard(
            log_dir=os.path.join(job_dir, 'logs'),
            histogram_freq=0,
            write_graph=True,
            embeddings_freq=0)

        callbacks = [checkpoint, evaluation, tblog]

        x, y = model.load_features(train_files, scaler,
                                   labelencoder_DayOfWeek,
                                   labelencoder_StoreType,
                                   labelencoder_Assortment, onehotencoder)
        forecast_model.fit(x, y, epochs=num_epochs, callbacks=callbacks)

        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them over to GCS.
        if job_dir.startswith('gs://'):
            forecast_model.save(MODEL_FILENAME)
            copy_file_to_gcs(job_dir, MODEL_FILENAME)
        else:
            forecast_model.save(os.path.join(job_dir, MODEL_FILENAME))
def run(target, cluster_spec, is_chief, train_steps, eval_steps, job_dir,
        train_files, eval_files, train_batch_size, eval_batch_size,
        learning_rate, eval_frequency, first_layer_size, num_layers,
        scale_factor, num_epochs, export_format):
    """Run the training and evaluation graph.

    Args:
        target (string): Tensorflow server target.
        cluster_spec: (cluster spec) Cluster specification.
        is_chief (bool): Boolean flag to specify a chief server.
        train_steps (int): Maximum number of training steps.
        eval_steps (int): Number of steps to run evaluation for at each
            checkpoint. If eval_steps is None, evaluation will run for 1
            epoch.
        job_dir (string): Output dir for checkpoint and summary.
        train_files (string): List of CSV files to read train data.
        eval_files (string): List of CSV files to read eval data.
        train_batch_size (int): Batch size for training.
        eval_batch_size (int): Batch size for evaluation.
        learning_rate (float): Learning rate for Gradient Descent.
        eval_frequency (int): Run evaluation every n training steps. Do not
            evaluate too frequently, or you will pay a performance cost; do
            not evaluate too infrequently, or you will not know how soon to
            stop training. Use the default values to start with.
        first_layer_size (int): Size of the first DNN layer.
        num_layers (int): Number of hidden layers in the DNN.
        scale_factor (float): Decay rate for the size of hidden layers.
        num_epochs (int): Maximum number of training data epochs on which
            to train.
        export_format (str): One of 'JSON', 'CSV' or 'EXAMPLE'. The input
            format for the exported saved_model binary.
    """
    # Calculate the number of hidden units.
    hidden_units = [
        max(2, int(first_layer_size * scale_factor**i))
        for i in range(num_layers)
    ]

    # If the server is chief, which is `master`:
    # in between-graph replication, the chief is the one node in the cluster
    # with extra responsibility, and by default it is worker task zero. We
    # have assigned master as the chief.
    #
    # See https://youtu.be/la_M6bCV91M?t=1203 for details on
    # distributed TensorFlow and the motivation for a chief.
    if is_chief:
        tf.logging.info('Created DNN hidden units {}'.format(hidden_units))
        evaluation_graph = tf.Graph()
        with evaluation_graph.as_default():
            # Features and label tensors.
            features, labels = model.input_fn(
                eval_files,
                num_epochs=None if eval_steps else 1,
                batch_size=eval_batch_size,
                shuffle=False)
            # Accuracy and AUROC metrics:
            # model.model_fn returns the metric dict in EVAL mode.
            metric_dict = model.model_fn(
                model.EVAL,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        hooks = [
            EvaluationRunHook(
                job_dir,
                metric_dict,
                evaluation_graph,
                eval_frequency,
                eval_steps=eval_steps,
            )
        ]
    else:
        hooks = []

    # Create a new graph and specify that as default.
    with tf.Graph().as_default():
        # Placement of ops on devices using replica device setter,
        # which automatically places the parameters on the `ps` server
        # and the `ops` on the workers.
        #
        # See:
        # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
        with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
            # Features and label tensors as read using filename queue.
            features, labels = model.input_fn(
                train_files,
                num_epochs=num_epochs,
                batch_size=train_batch_size)

            # Returns the training graph and global step tensor.
            train_op, global_step_tensor = model.model_fn(
                model.TRAIN,
                features.copy(),
                labels,
                hidden_units=hidden_units,
                learning_rate=learning_rate)

        # Creates a MonitoredSession for training.
        # MonitoredSession is a Session-like object that handles
        # initialization, recovery and hooks.
        # https://www.tensorflow.org/api_docs/python/tf/train/MonitoredTrainingSession
        with tf.train.MonitoredTrainingSession(
                master=target,
                is_chief=is_chief,
                checkpoint_dir=job_dir,
                hooks=hooks,
                save_checkpoint_secs=20,
                save_summaries_steps=50) as session:
            # Global step to keep track of the global number of steps,
            # particularly in a distributed setting.
            step = global_step_tensor.eval(session=session)

            # Run the training graph, which returns the step number as
            # tracked by the global step tensor.
            # When train epochs is reached, session.should_stop() will be
            # true.
            while (train_steps is None or
                   step < train_steps) and not session.should_stop():
                step, _ = session.run([global_step_tensor, train_op])

    # Find the filename of the latest saved checkpoint file.
    latest_checkpoint = tf.train.latest_checkpoint(job_dir)

    # Only perform this if chief.
    if is_chief:
        build_and_run_exports(latest_checkpoint, job_dir,
                              model.SERVING_INPUT_FUNCTIONS[export_format],
                              hidden_units)
def dispatch(train_prefix, validation_prefix, job_dir, learning_rate,
             num_epochs, checkpoint_epochs, lam, dropout, model_file):
    # Download train data.
    # train_tmp_prefix, val_tmp_prefix = download_mats(train_prefix, validation_prefix)
    train_tmp_prefix = train_prefix
    val_tmp_prefix = validation_prefix
    print(train_tmp_prefix, val_tmp_prefix)
    # validation_tmp_prefix = download_mats(validation_prefix)
    # train_x, train_y, cv_x, cv_y, input_shape = create_data(train_tmp_prefix)

    logger = logging.getLogger()
    sh = StreamHandler(stdout)
    logger.addHandler(sh)
    logger.setLevel(logging.INFO)
    logger.info('learning_rate=%s' % learning_rate)

    # Either resume from an existing model file or build a fresh model.
    if model_file is not None:
        if model_file.startswith('gs://'):
            cmd = 'gsutil cp %s /tmp' % model_file
            subprocess.check_call(cmd.split())
            real_model_file = '/tmp/%s' % model_file.split('/')[-1]
        else:
            real_model_file = model_file
        face_age_model = load_model(real_model_file, compile=False)
        face_age_model = model.compile_model(face_age_model, learning_rate)
    else:
        face_age_model = model.model_fn(learning_rate, lam, dropout)

    try:
        os.makedirs(job_dir)
    except Exception:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to
    # GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith('gs://'):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)
        verbose = 1
        multi = False
        num_worker = 1
    else:
        verbose = 2
        multi = False
        num_worker = 1  # multiprocessing.cpu_count()

    # meta_data = get_meta(train_files)
    # indexes = [i for i in range(len(meta_data))]
    # random.shuffle(indexes)
    # meta_data = meta_data.loc[indexes].reset_index(drop=True)

    # Model checkpoint callback. Note: when monitoring a loss, mode must be
    # 'min' (the original 'max' would keep the worst checkpoint).
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        verbose=1,
        period=checkpoint_epochs,
        mode='min')

    # Continuous eval callback.
    val_datasequence = FileDataSequence(val_tmp_prefix)
    # evaluation = ContinuousEval(eval_frequency,
    #                             val_datasequence,
    #                             learning_rate,
    #                             job_dir)

    # Tensorboard logs callback.
    tblog = keras.callbacks.TensorBoard(
        log_dir=os.path.join(job_dir, 'logs'),
        histogram_freq=0,
        write_graph=True,
        embeddings_freq=0)

    callbacks = [checkpoint, tblog]

    train_data_sequence = FileDataSequence(train_tmp_prefix)
    # x_train, y_train = train_data_sequence.__getitem__(0)

    face_age_model.fit_generator(
        train_data_sequence,
        validation_data=val_datasequence,
        validation_steps=val_datasequence.length,
        steps_per_epoch=train_data_sequence.length,
        verbose=verbose,
        epochs=num_epochs,
        callbacks=callbacks)
    # plot_history(history)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to
    # GCS.
    if job_dir.startswith('gs://'):
        face_age_model.save(FACE_AGE_MODEL)
        copy_file_to_gcs(job_dir, FACE_AGE_MODEL)
    else:
        face_age_model.save(os.path.join(job_dir, FACE_AGE_MODEL))

    # Convert the Keras model to a TensorFlow SavedModel.
    model.to_savedmodel(face_age_model, os.path.join(job_dir, 'export'))