def __init__(self, data_path, model_file, s3_bucket, epochs=50, max_sample_records=500,
             start_epoch=0, restored_model=False, restored_model_dir=None, tf_timeline=False):
    """Prepare a training run: resolve S3/local paths, sync data, and set up checkpoint dirs.

    Args:
        data_path: Local directory holding training data and TensorBoard runs.
        model_file: Path to the model definition file.
        s3_bucket: Bucket name; normalized via format_s3_bucket().
        epochs: Number of training epochs (coerced to int).
        max_sample_records: Cap on records sampled per batch/epoch.
        start_epoch: Epoch to resume from (nonzero when restoring).
        restored_model: True when continuing a previous run.
        restored_model_dir: Existing run directory to reuse when restoring.
        tf_timeline: Enable TensorFlow timeline profiling output.
    """
    self.data_path = data_path
    self.model_file = model_file
    self.n_epochs = int(epochs)
    self.max_sample_records = max_sample_records
    self.tf_timeline = tf_timeline
    self.start_epoch = start_epoch
    self.restored_model = restored_model

    self.s3_bucket = format_s3_bucket(s3_bucket)
    self.s3_data_dir = format_s3_data_dir(self.s3_bucket)
    # Always sync before training in case I ever train multiple models in parallel
    sync_from_aws(s3_path=self.s3_data_dir, local_path=self.data_path)

    # Reuse the prior run dir when restoring; otherwise create a fresh TensorBoard run dir.
    if restored_model:
        self.model_dir = restored_model_dir
    else:
        self.tfboard_basedir = os.path.join(self.data_path, 'tf_visual_data', 'runs')
        self.model_dir = mkdir_tfboard_run_dir(self.tfboard_basedir)

    self.results_file = os.path.join(self.model_dir, 'results.txt')
    self.model_checkpoint_dir = os.path.join(self.model_dir, 'checkpoints')
    self.saver = tf.train.Saver()
    mkdir(self.model_checkpoint_dir)
def __init__(self, data_path, model_file, s3_bucket, epochs=50, max_sample_records=500,
             start_epoch=0, restored_model=False, restored_model_dir=None, tf_timeline=False,
             show_speed=False, s3_sync=True):
    """Prepare a training run: resolve S3/local paths, optionally sync data, set up checkpoints.

    Args:
        data_path: Local directory holding training data and TensorBoard runs.
        model_file: Path to the model definition file.
        s3_bucket: Bucket name; normalized via format_s3_bucket().
        epochs: Number of training epochs (coerced to int).
        max_sample_records: Cap on records sampled per batch/epoch.
        start_epoch: Epoch to resume from (nonzero when restoring).
        restored_model: True when continuing a previous run.
        restored_model_dir: Existing run directory to reuse when restoring.
        tf_timeline: Enable TensorFlow timeline profiling output.
        show_speed: Print batch processing speed, among other things.
        s3_sync: When False, skip the pre-training S3 sync (saves disk during development).
    """
    self.data_path = data_path
    self.s3_bucket = format_s3_bucket(s3_bucket)
    self.s3_data_dir = format_s3_data_dir(self.s3_bucket)
    self.model_file = model_file
    self.n_epochs = int(epochs)
    self.max_sample_records = max_sample_records
    self.tf_timeline = tf_timeline
    self.s3_sync = s3_sync

    # Always sync before training in case I ever train multiple models in parallel.
    # You have the option to turn off the sync during development to save disk space.
    if self.s3_sync:  # idiomatic truthiness test instead of `is True`
        sync_from_aws(s3_path=self.s3_data_dir, local_path=self.data_path)

    # Reuse the prior run dir when restoring; otherwise create a fresh TensorBoard run dir.
    if restored_model:
        self.model_dir = restored_model_dir
    else:
        self.tfboard_basedir = os.path.join(self.data_path, 'tf_visual_data', 'runs')
        self.model_dir = mkdir_tfboard_run_dir(self.tfboard_basedir)

    self.results_file = os.path.join(self.model_dir, 'results.txt')
    self.speed_file = os.path.join(self.model_dir, 'speed.txt')
    self.model_checkpoint_dir = os.path.join(self.model_dir, 'checkpoints')
    self.saver = tf.train.Saver()
    self.start_epoch = start_epoch
    self.restored_model = restored_model
    mkdir(self.model_checkpoint_dir)

    # Prints batch processing speed, among other things
    self.show_speed = show_speed
import os

# Parse CLI arguments that control where data/model live and how training resumes.
args = parse_args()
data_path = args["datapath"]
epochs = args["epochs"]
model_dir = args["model_dir"]
show_speed = args['show_speed']
s3_bucket = format_s3_bucket(args['s3_bucket'])
s3_sync = args['s3_sync']
s3_data_dir = format_s3_data_dir(s3_bucket)
checkpoint_dir_path = os.path.join(model_dir, 'checkpoints')

# Sync with S3 if model or data (or both) are not available locally
if not file_is_stored_locally(checkpoint_dir_path):
    sync_from_aws(s3_path=s3_data_dir, local_path=data_path)

# Resume from the last saved epoch; checkpoint files are named 'model-<epoch>'.
start_epoch = get_prev_epoch(checkpoint_dir_path)
graph_name = 'model-' + str(start_epoch)
checkpoint_file_path = os.path.join(checkpoint_dir_path, graph_name)
# Reuse the joined checkpoint path rather than hand-concatenating with "/"
# (portable and consistent with the os.path.join calls above).
saver = tf.train.import_meta_graph(checkpoint_file_path + ".meta")
sess = tf.Session()

# Read the model into memory
saver.restore(sess, checkpoint_file_path)
graph = tf.get_default_graph()

# Restore values from previous run. These values should be same for all models
accuracy = graph.get_tensor_by_name("accuracy:0")
x = graph.get_tensor_by_name("x:0")
# Parse CLI arguments that control where data/model live and how training resumes.
args = parse_args()
data_path = args["datapath"]
epochs = args["epochs"]
model_dir = args["model_dir"]
show_speed = args['show_speed']
s3_bucket = format_s3_bucket(args['s3_bucket'])
s3_sync = args['s3_sync']
s3_data_dir = format_s3_data_dir(s3_bucket)
checkpoint_dir_path = os.path.join(model_dir, 'checkpoints')

# Sync with S3 if model or data (or both) are not available locally
if not file_is_stored_locally(checkpoint_dir_path):
    sync_from_aws(s3_path=s3_data_dir, local_path=data_path)

# Resume from the last saved epoch; checkpoint files are named 'model-<epoch>'.
start_epoch = get_prev_epoch(checkpoint_dir_path)
graph_name = 'model-' + str(start_epoch)
checkpoint_file_path = os.path.join(checkpoint_dir_path, graph_name)
# Reuse the joined checkpoint path rather than hand-concatenating with "/"
# (portable and consistent with the os.path.join calls above).
saver = tf.train.import_meta_graph(checkpoint_file_path + ".meta")
sess = tf.Session()

# Read the model into memory
saver.restore(sess, checkpoint_file_path)
graph = tf.get_default_graph()

# Restore values from previous run. These values should be same for all models
accuracy = graph.get_tensor_by_name("accuracy:0")
x = graph.get_tensor_by_name("x:0")
y_ = graph.get_tensor_by_name("y_:0")