Example #1
0
    def on_epoch_begin(self, epoch, logs=None):
        """Evaluate the newest local checkpoint every ``eval_frequency`` epochs.

        Nothing happens on epoch 0 or on epochs that are not a multiple of
        ``self.eval_frequency``. Otherwise the checkpoint whose file name
        sorts last is loaded, re-compiled with ``self.learning_rate`` and
        evaluated on ``self.eval_files``; the metrics are printed. When
        ``self.job_dir`` is a ``gs://`` URL the evaluated checkpoint file is
        then copied to that location.

        Args:
            epoch: Index of the epoch that is about to start.
            logs: Unused; accepted for compatibility with the callback
                interface.
        """
        if epoch <= 0 or epoch % self.eval_frequency != 0:
            return

        on_gcs = self.job_dir.startswith("gs://")

        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them over
        # to GCS. For a gs:// job_dir the checkpoints are therefore looked up
        # relative to the current working directory.
        model_path_glob = 'checkpoint.*'
        if not on_gcs:
            model_path_glob = os.path.join(self.job_dir, model_path_glob)

        checkpoints = glob.glob(model_path_glob)
        if not checkpoints:
            print('\nEvaluation epoch[{}] (no checkpoints found)'.format(
                epoch))
            return

        # Same pick as sorting and taking the last element.
        # NOTE(review): "latest" relies on file names sorting in epoch order
        # (e.g. zero-padded epoch numbers) -- confirm against FILE_PATH.
        latest = max(checkpoints)

        census_model = load_model(latest)
        census_model = model.compile_model(census_model, self.learning_rate)
        # Unpacking into two values assumes the compiled model reports the
        # loss plus exactly one metric.
        loss, acc = census_model.evaluate_generator(
            model.generator_input(self.eval_files, chunk_size=CHUNK_SIZE),
            steps=self.steps)
        print('\nEvaluation epoch[{}] metrics[{:.2f}, {:.2f}] {}'.format(
            epoch, loss, acc, census_model.metrics_names))

        if on_gcs:
            copy_file_to_gcs(self.job_dir, latest)
Example #2
0
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             train_batch_size, eval_batch_size, learning_rate, eval_frequency,
             first_layer_size, num_layers, scale_factor, eval_num_epochs,
             num_epochs, checkpoint_epochs):
    """Train the census model, then save it as HDF5 and as a SavedModel.

    Builds the model, trains it with checkpoint, continuous-evaluation and
    TensorBoard callbacks, saves the final Keras model (locally first, then
    copied to GCS when ``job_dir`` is a ``gs://`` URL) and finally exports a
    TensorFlow SavedModel under ``<job_dir>/export``.

    Args:
        train_files: Training input passed to ``model.generator_input``.
        eval_files: Evaluation input handed to the ``ContinuousEval``
            callback.
        job_dir: Output location; either a local directory or a ``gs://`` URL.
        train_steps: Steps per training epoch; also passed to the evaluation
            callback as its ``steps``.
        learning_rate: Learning rate handed to the evaluation callback.
        eval_frequency: Evaluate every this many epochs.
        num_epochs: Number of training epochs.
        checkpoint_epochs: Interval, in epochs, between checkpoints.
        eval_steps, train_batch_size, eval_batch_size, first_layer_size,
        num_layers, scale_factor, eval_num_epochs: Accepted but not used by
            this function.
    """
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    # Best-effort creation of the output directory. os.makedirs raises
    # OSError when the directory already exists (or cannot be created); that
    # is deliberately ignored, but unrelated exceptions are no longer
    # swallowed.
    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    on_gcs = job_dir.startswith("gs://")

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not on_gcs:
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback.
    # val_loss is a quantity to minimise, so the comparison mode is 'min'.
    # With save_best_only left at its default the mode does not affect which
    # checkpoints get written.
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=1,
                                                 period=checkpoint_epochs,
                                                 mode='min')

    # Continuous eval callback.
    # NOTE(review): steps is set from train_steps while eval_steps is unused;
    # confirm whether eval_steps was intended here.
    evaluation = ContinuousEval(eval_frequency,
                                eval_files,
                                learning_rate,
                                job_dir,
                                steps=train_steps)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)

    callbacks = [checkpoint, evaluation, tblog]

    census_model.fit_generator(model.generator_input(train_files,
                                                     chunk_size=CHUNK_SIZE),
                               steps_per_epoch=train_steps,
                               epochs=num_epochs,
                               callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to GCS.
    if on_gcs:
        census_model.save(CENSUS_MODEL)
        copy_file_to_gcs(job_dir, CENSUS_MODEL)
    else:
        census_model.save(os.path.join(job_dir, CENSUS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel
    model.to_savedmodel(census_model, os.path.join(job_dir, 'export'))
Example #3
0
 def on_epoch_begin(self, epoch, logs=None):
   """Evaluate the newest checkpoint every ``eval_frequency`` epochs.

   Nothing happens on epoch 0 or on epochs that are not a multiple of
   ``self.eval_frequency``. Otherwise the checkpoint under ``self.job_dir``
   whose file name sorts last is loaded, re-compiled with
   ``self.learning_rate`` and evaluated on ``self.eval_files``; the metrics
   are printed.

   Args:
     epoch: Index of the epoch that is about to start.
     logs: Unused; accepted for compatibility with the callback interface.
   """
   if epoch <= 0 or epoch % self.eval_frequency != 0:
     return

   checkpoints = glob.glob(os.path.join(self.job_dir, 'checkpoint.*'))
   if not checkpoints:
     # Nothing has been written yet (or checkpointing is disabled): report
     # and skip instead of failing with IndexError on an empty list.
     print('\nEvaluation epoch[{}] (no checkpoints found)'.format(epoch))
     return

   # Same pick as sorting and taking the last element.
   # NOTE(review): "latest" relies on file names sorting in epoch order --
   # confirm against the checkpoint file-name pattern.
   latest = max(checkpoints)

   census_model = load_model(latest)
   census_model = model.compile_model(census_model, self.learning_rate)
   # Unpacking into two values assumes the compiled model reports the loss
   # plus exactly one metric.
   loss, acc = census_model.evaluate_generator(
       model.generator_input(self.eval_files, chunk_size=CHUNK_SIZE),
       steps=self.steps)
   print('\nEvaluation epoch[{}] metrics[{:.2f}, {:.2f}] {}'.format(
       epoch, loss, acc, census_model.metrics_names))
Example #4
0
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             train_batch_size, eval_batch_size, learning_rate, eval_frequency,
             first_layer_size, num_layers, scale_factor, eval_num_epochs,
             num_epochs, checkpoint_epochs):
    """Train the census model, report the elapsed time and save the model.

    Builds the model, trains it with continuous-evaluation and TensorBoard
    callbacks (plus a checkpoint callback when ``job_dir`` is not a ``gs://``
    URL), prints the wall-clock training time and saves the final Keras model
    under ``job_dir``.

    Args:
        train_files: Training input passed to ``model.generator_input``.
        eval_files: Evaluation input handed to the ``ContinuousEval``
            callback.
        job_dir: Output location; either a local directory or a ``gs://`` URL.
        train_steps: Steps per training epoch.
        learning_rate: Learning rate handed to the evaluation callback.
        eval_frequency: Evaluate every this many epochs.
        num_epochs: Number of training epochs.
        checkpoint_epochs: Interval, in epochs, between checkpoints.
        eval_steps, train_batch_size, eval_batch_size, first_layer_size,
        num_layers, scale_factor, eval_num_epochs: Accepted but not used by
            this function.
    """
    census_model = model.model_fn(INPUT_SIZE, CLASS_SIZE)

    # Best-effort creation of the output directory. os.makedirs raises
    # OSError when the directory already exists (or cannot be created); that
    # is deliberately ignored, but unrelated exceptions are no longer
    # swallowed.
    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Model checkpoint callback.
    # val_loss is a quantity to minimise, so the comparison mode is 'min'.
    # With save_best_only left at its default the mode does not affect which
    # checkpoints get written.
    checkpoint = keras.callbacks.ModelCheckpoint(
        os.path.join(job_dir, FILE_PATH),
        monitor='val_loss',
        verbose=1,
        period=checkpoint_epochs,
        mode='min')

    # Continuous eval callback
    evaluation = ContinuousEval(eval_frequency, eval_files, learning_rate,
                                job_dir)

    # Tensorboard logs callback
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)

    # TODO: This needs to be fixed in h5py so that writes to GCS are possible
    # Don't attempt to create checkpoints on Cloud ML Engine for now because
    # h5py doesn't come with native GCS write capability
    callbacks = [evaluation, tblog]
    if not job_dir.startswith('gs://'):
        callbacks.insert(0, checkpoint)

    start_time = time.time()

    census_model.fit_generator(model.generator_input(train_files,
                                                     chunk_size=CHUNK_SIZE),
                               steps_per_epoch=train_steps,
                               epochs=num_epochs,
                               callbacks=callbacks)

    # Single-argument print call: emits the same text under both the Python 2
    # print statement and the Python 3 print function.
    print("\nTime used. {}".format(time.time() - start_time))

    # NOTE(review): for a gs:// job_dir this save presumably hits the same
    # h5py limitation described in the TODO above -- confirm.
    census_model.save(os.path.join(job_dir, CENSUS_MODEL))