    def __init__(self,
                 data_path,
                 model_file,
                 s3_bucket,
                 epochs=50,
                 max_sample_records=500,
                 start_epoch=0,
                 restored_model=False,
                 restored_model_dir=None,
                 tf_timeline=False):
        self.data_path = data_path
        self.s3_bucket = format_s3_bucket(s3_bucket)
        self.s3_data_dir = format_s3_data_dir(self.s3_bucket)
        self.model_file = model_file
        self.n_epochs = int(epochs)
        self.max_sample_records = max_sample_records
        self.tf_timeline = tf_timeline

        # Always sync before training in case I ever train multiple models in parallel
        sync_from_aws(s3_path=self.s3_data_dir, local_path=self.data_path)

        if restored_model:
            self.model_dir = restored_model_dir
        else:
            self.tfboard_basedir = os.path.join(self.data_path,
                                                'tf_visual_data', 'runs')
            self.model_dir = mkdir_tfboard_run_dir(self.tfboard_basedir)

        self.results_file = os.path.join(self.model_dir, 'results.txt')
        self.model_checkpoint_dir = os.path.join(self.model_dir, 'checkpoints')
        self.saver = tf.train.Saver()
        self.start_epoch = start_epoch
        self.restored_model = restored_model
        mkdir(self.model_checkpoint_dir)
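
The helper functions used above (format_s3_bucket, format_s3_data_dir, mkdir, mkdir_tfboard_run_dir, sync_from_aws) are the project's own utilities and are not shown in the example. A minimal sketch of what they might look like, assuming the S3 sync shells out to the AWS CLI and TensorBoard run directories are numbered sequentially:

import os
import subprocess


def format_s3_bucket(s3_bucket):
    # Assumption: normalize the bucket name into an s3:// URI
    return s3_bucket if s3_bucket.startswith('s3://') else 's3://' + s3_bucket


def format_s3_data_dir(s3_bucket):
    # Assumption: the training data lives under a fixed prefix in the bucket
    return s3_bucket + '/data'


def mkdir(path):
    # Create the directory (and parents) if it does not already exist
    os.makedirs(path, exist_ok=True)
    return path


def mkdir_tfboard_run_dir(tfboard_basedir):
    # Assumption: each run gets the next sequential integer directory,
    # e.g. runs/1, runs/2, ...
    mkdir(tfboard_basedir)
    existing = [int(d) for d in os.listdir(tfboard_basedir) if d.isdigit()]
    next_run = max(existing) + 1 if existing else 1
    return mkdir(os.path.join(tfboard_basedir, str(next_run)))


def sync_from_aws(s3_path, local_path):
    # Assumption: syncing shells out to the AWS CLI, which must be
    # installed and configured with credentials
    subprocess.check_call(['aws', 's3', 'sync', s3_path, local_path])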
Example #2
    def __init__(self,
                 data_path,
                 model_file,
                 s3_bucket,
                 epochs=50,
                 max_sample_records=500,
                 start_epoch=0,
                 restored_model=False,
                 restored_model_dir=None,
                 tf_timeline=False,
                 show_speed=False,
                 s3_sync=True):

        self.data_path = data_path
        self.s3_bucket = format_s3_bucket(s3_bucket)
        self.s3_data_dir = format_s3_data_dir(self.s3_bucket)
        self.model_file = model_file
        self.n_epochs = int(epochs)
        self.max_sample_records = max_sample_records
        self.tf_timeline = tf_timeline
        self.s3_sync = s3_sync

        # Always sync before training in case I ever train multiple models
        # in parallel. The sync can be turned off during development to
        # save disk space.
        if self.s3_sync:
            sync_from_aws(s3_path=self.s3_data_dir, local_path=self.data_path)

        if restored_model:
            self.model_dir = restored_model_dir
        else:
            self.tfboard_basedir = os.path.join(self.data_path,
                                                'tf_visual_data', 'runs')
            self.model_dir = mkdir_tfboard_run_dir(self.tfboard_basedir)

        self.results_file = os.path.join(self.model_dir, 'results.txt')
        self.speed_file = os.path.join(self.model_dir, 'speed.txt')
        self.model_checkpoint_dir = os.path.join(self.model_dir, 'checkpoints')
        self.saver = tf.train.Saver()
        self.start_epoch = start_epoch
        self.restored_model = restored_model
        mkdir(self.model_checkpoint_dir)

        # Prints batch processing speed, among other things
        self.show_speed = show_speed
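
For reference, a hypothetical instantiation of the class above. The snippet does not show the class name, so Trainer is a stand-in, and the paths and bucket name are placeholders:

# Hypothetical usage; 'Trainer' stands in for the class that owns the
# __init__ above, since the class name is not shown in the snippet.
trainer = Trainer(
    data_path='/tmp/dataset',              # placeholder local path
    model_file='/tmp/dataset/model.ckpt',  # placeholder model file
    s3_bucket='my-training-bucket',        # placeholder bucket name
    epochs=50,
    s3_sync=False,     # skip the S3 sync during development
    show_speed=True)   # print batch-processing speed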
Example #3
import os

import tensorflow as tf

# parse_args, format_s3_bucket, format_s3_data_dir, file_is_stored_locally,
# sync_from_aws, and get_prev_epoch are the project's own helpers and are
# assumed to be importable here.

args = parse_args()
data_path = args["datapath"]
epochs = args["epochs"]
model_dir = args["model_dir"]
show_speed = args['show_speed']
s3_bucket = format_s3_bucket(args['s3_bucket'])
s3_sync = args['s3_sync']

s3_data_dir = format_s3_data_dir(s3_bucket)
checkpoint_dir_path = os.path.join(model_dir, 'checkpoints')

# Sync with S3 if model or data (or both) are not available locally
if not file_is_stored_locally(checkpoint_dir_path):
    sync_from_aws(s3_path=s3_data_dir, local_path=data_path)

start_epoch = get_prev_epoch(checkpoint_dir_path)
graph_name = 'model-' + str(start_epoch)
checkpoint_file_path = os.path.join(checkpoint_dir_path, graph_name)
saver = tf.train.import_meta_graph(checkpoint_file_path + ".meta")
sess = tf.Session()

# Read the model into memory
saver.restore(sess, checkpoint_file_path)
graph = tf.get_default_graph()

# Restore values from the previous run. These values should be the same
# for all models
accuracy = graph.get_tensor_by_name("accuracy:0")
x = graph.get_tensor_by_name("x:0")
y_ = graph.get_tensor_by_name("y_:0")
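
Once the graph is restored, training can resume by feeding the recovered tensors exactly as the original run did. A minimal continuation sketch, assuming the graph defines a training op named train_step and that a get_batch helper yields (images, labels) pairs; neither appears in the snippet above:

# Continue training from the restored graph. 'train_step' and get_batch()
# are assumptions: the snippet does not show the training op's name or
# how batches are produced.
train_step = graph.get_operation_by_name("train_step")

for epoch in range(start_epoch + 1, start_epoch + 1 + int(epochs)):
    for images, labels in get_batch(data_path):  # hypothetical iterator
        sess.run(train_step, feed_dict={x: images, y_: labels})
    # Report accuracy on the last batch of the epoch
    acc = sess.run(accuracy, feed_dict={x: images, y_: labels})
    print("epoch {} accuracy: {}".format(epoch, acc))
    # Checkpoint as 'model-<epoch>' so get_prev_epoch() can find it later
    saver.save(sess, os.path.join(checkpoint_dir_path, 'model'),
               global_step=epoch)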