def on_epoch_begin(self, epoch, logs={}):
    if epoch > 0 and epoch % self.eval_frequency == 0:
        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them
        # over to GCS.
        model_path_glob = 'checkpoint.*'
        if not self.job_dir.startswith("gs://"):
            model_path_glob = os.path.join(self.job_dir, model_path_glob)
        checkpoints = glob.glob(model_path_glob)
        if len(checkpoints) > 0:
            checkpoints.sort()
            face_age_model = load_model(checkpoints[-1], compile=False)
            face_age_model = model.compile_model(face_age_model,
                                                 self.learning_rate)
            loss, acc, mae = face_age_model.evaluate_generator(
                self.data_sequence, steps=self.data_sequence.length)
            print(
                '\nEvaluation epoch[{}] metrics[{:.2f}, {:.2f}, {:.2f}] {}'
                .format(epoch, loss, acc, mae, face_age_model.metrics_names))
            if self.job_dir.startswith("gs://"):
                copy_file_to_gcs(self.job_dir, checkpoints[-1])
        else:
            print('\nEvaluation epoch[{}] (no checkpoints found)'.format(
                epoch))

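# Every snippet here calls copy_file_to_gcs() but none defines it. A minimal
# sketch of what such a helper could look like, assuming TensorFlow's
# file_io module is available (as in the Cloud ML Engine Keras samples);
# treat the exact signature as an assumption, not the original:
import os

from tensorflow.python.lib.io import file_io


def copy_file_to_gcs(job_dir, file_path):
    """Copy a local file into job_dir on GCS, keeping its relative path."""
    with file_io.FileIO(file_path, mode='rb') as input_f:
        with file_io.FileIO(os.path.join(job_dir, file_path),
                            mode='w+') as output_f:
            output_f.write(input_f.read())
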
def on_epoch_begin(self, epoch, logs={}):
    if epoch > 0 and epoch % self.eval_frequency == 0:
        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them
        # over to GCS.
        model_path_glob = 'checkpoint.*'
        if not self.job_dir.startswith("gs://"):
            model_path_glob = os.path.join(self.job_dir, model_path_glob)
        checkpoints = glob.glob(model_path_glob)
        if len(checkpoints) > 0:
            checkpoints.sort()
            churn_model = load_model(checkpoints[-1])
            churn_model = model.compile_model(churn_model, self.learning_rate)
            loss, acc = churn_model.evaluate_generator(
                model.generator_input(self.eval_files, chunk_size=CHUNK_SIZE),
                steps=self.steps)
            print('\nEvaluation epoch[{}] metrics[{:.2f}, {:.2f}] {}'.format(
                epoch, loss, acc, churn_model.metrics_names))
            if self.job_dir.startswith("gs://"):
                copy_file_to_gcs(self.job_dir, checkpoints[-1])
        else:
            print('\nEvaluation epoch[{}] (no checkpoints found)'.format(
                epoch))

def on_epoch_begin(self, epoch, logs={}):
    if epoch > 0 and epoch % self.eval_frequency == 0:
        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem,
        # then copy them over to GCS.
        model_path_glob = 'checkpoint.*'
        if not self.job_dir.startswith("gs://"):
            model_path_glob = os.path.join(self.job_dir, model_path_glob)
        checkpoints = glob.glob(model_path_glob)
        if len(checkpoints) > 0:
            checkpoints.sort()
            beiras_model = load_model(checkpoints[-1])
            beiras_model = model.compile_model(beiras_model,
                                               self.learning_rate)
            x_eval, y_eval = model.get_array_x_y(self.eval_files, self.steps,
                                                 WINDOWS_SIZE, NUM_CHARS)
            loss, acc = beiras_model.evaluate(x_eval, y_eval)
            print('\nEvaluation epoch[{}] metrics[{:.2f}, {:.2f}] {}'.format(
                epoch, loss, acc, beiras_model.metrics_names))
            if self.job_dir.startswith("gs://"):
                copy_file_to_gcs(self.job_dir, checkpoints[-1])
        else:
            print('\nEvaluation epoch[{}] (no checkpoints found)'.format(
                epoch))

def on_epoch_begin(self, epoch, logs={}):
    if epoch > 0 and epoch % int(self.eval_frequency) == 0:
        # Workaround because h5py cannot write to GCS:
        # save to the local filesystem, then copy over to GCS.
        model_path_glob = 'checkpoint.*'
        if not self.job_dir.startswith("gs://"):
            model_path_glob = os.path.join(self.job_dir, model_path_glob)
        checkpoints = glob.glob(model_path_glob)
        if len(checkpoints) > 0:
            checkpoints.sort()
            # Select the latest model checkpoint.
            conv_model = load_model(checkpoints[-1])
            conv_model = model.compile_model(conv_model, self.learning_rate,
                                             self.momentum)
            loss, acc = conv_model.evaluate_generator(
                generator=self.eval_sequence, steps=self.steps)
            print('Evaluation epoch[{}] metrics[{:.2f}, {:.2f}] {}'.format(
                epoch, loss, acc, conv_model.metrics_names))
            if self.job_dir.startswith("gs://"):
                copy_file_to_gcs(self.job_dir, checkpoints[-1])
        else:
            print('Evaluation epoch[{}] (no checkpoints found)'.format(
                epoch))

def dispatch(data_file, job_dir, num_epochs):
    job_dir = create_job_dir(job_dir)
    nb_chars, embedding_matrix, x_train, y_train, x_val, y_val = \
        model.get_training_data(data_file, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH,
                                VALIDATION_SPLIT, EMBEDDING_FILE_GCS)
    my_model = model.model_fn(nb_chars, embedding_matrix)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over
    # to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=1,
                                                 save_best_only=True,
                                                 mode='min')

    # TensorBoard logs callback.
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        write_graph=True,
                                        embeddings_freq=0)
    callbacks = [checkpoint, tblog]

    my_model = model.compile_model(my_model)
    my_model.fit(x_train, y_train,
                 validation_data=(x_val, y_val),
                 epochs=num_epochs,
                 batch_size=128,
                 callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over
    # to GCS.
    if job_dir.startswith("gs://"):
        my_model.save(MY_MODEL_NAME)
        copy_file_to_gcs(job_dir, MY_MODEL_NAME)
    else:
        my_model.save(os.path.join(job_dir, MY_MODEL_NAME))

    # Convert the Keras model to TensorFlow SavedModel.
    model.to_savedmodel(my_model, os.path.join(job_dir, 'export'))

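# model.to_savedmodel() is called throughout but never shown. A minimal
# sketch under the assumption that it wraps the TF 1.x SavedModelBuilder
# API, as the Cloud ML Engine Keras samples did; the 'input'/'output'
# signature names here are illustrative, not the originals:
import tensorflow as tf
from keras import backend as K


def to_savedmodel(model, export_path):
    """Export a compiled Keras model as a TensorFlow SavedModel."""
    builder = tf.saved_model.builder.SavedModelBuilder(export_path)
    signature = tf.saved_model.signature_def_utils.predict_signature_def(
        inputs={'input': model.inputs[0]},
        outputs={'output': model.outputs[0]})
    with K.get_session() as sess:
        builder.add_meta_graph_and_variables(
            sess=sess,
            tags=[tf.saved_model.tag_constants.SERVING],
            signature_def_map={'serving_default': signature})
        builder.save()
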
def on_epoch_end(self, epoch, logs={}):
    self.epochs_since_last_save += 1
    if self.epochs_since_last_save >= self.eval_frequency:
        self.epochs_since_last_save = 0
        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them
        # over to GCS.
        model_path_glob = 'checkpoint.*'
        if not self.job_dir.startswith("gs://"):
            model_path_glob = os.path.join(self.job_dir, model_path_glob)
        checkpoints = glob.glob(model_path_glob)
        if len(checkpoints) > 0:
            checkpoints.sort()
            forecast_model = load_model(checkpoints[-1])
            forecast_model = model.compile_model(forecast_model)
            x, y = model.load_features(self.eval_files, self.scaler,
                                       self.labelencoder_DayOfWeek,
                                       self.labelencoder_StoreType,
                                       self.labelencoder_Assortment,
                                       self.onehotencoder)
            metrics = forecast_model.evaluate(x, y)
            print('\n*** Evaluation epoch[{}] metrics {} {}'.format(
                epoch, metrics, forecast_model.metrics_names))
            y_hat = forecast_model.predict(x)
            y_hat = model.invert_scale_sales(y_hat, self.scaler)
            np.savetxt(
                os.path.join(self.job_dir,
                             'preds/yhat_{:06d}.txt'.format(epoch)),
                y_hat)
            self.tf_logger.append(
                metrics_dict={
                    name: value
                    for (name, value) in zip(forecast_model.metrics_names,
                                             metrics)
                },
                epoch=epoch)
            if self.job_dir.startswith("gs://"):
                copy_file_to_gcs(self.job_dir, checkpoints[-1])
        else:
            print('\n*** Evaluation epoch[{}] (no checkpoints found)'.format(
                epoch))

def on_epoch_begin(self, epoch, logs={}):
    """Compile and save model."""
    if epoch > 0 and epoch % self.eval_frequency == 0:
        # Unhappy hack to work around h5py not being able to write to GCS.
        # Force snapshots and saves to local filesystem, then copy them
        # over to GCS.
        model_path_glob = 'checkpoint.*'
        if not self.job_dir.startswith('gs://'):
            model_path_glob = os.path.join(self.job_dir, model_path_glob)
        checkpoints = glob.glob(model_path_glob)
        if len(checkpoints) > 0:
            checkpoints.sort()
            census_model = load_model(checkpoints[-1])
            census_model = model.compile_model(census_model,
                                               self.learning_rate)
            loss, acc = census_model.evaluate_generator(
                model.generator_input(self.eval_files, chunk_size=CHUNK_SIZE),
                steps=self.steps)
            print('\nEvaluation epoch[{}] metrics[{:.2f}, {:.2f}] {}'.format(
                epoch, loss, acc, census_model.metrics_names))
            if self.job_dir.startswith('gs://'):
                copy_file_to_gcs(self.job_dir, checkpoints[-1])
        else:
            print('\nEvaluation epoch[{}] (no checkpoints found)'.format(
                epoch))

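# model.generator_input() is assumed to yield (features, labels) batches
# from CSV files in chunks. A rough sketch with pandas; CSV_COLUMNS and
# LABEL_COLUMN are illustrative placeholders, and any per-column
# preprocessing the real module does is omitted:
import pandas as pd


def generator_input(input_files, chunk_size):
    """Yield (features, labels) arrays from CSV files, chunk by chunk."""
    while True:  # Keras generators are expected to loop indefinitely.
        for input_file in input_files:
            for chunk in pd.read_csv(input_file,
                                     names=CSV_COLUMNS,
                                     chunksize=chunk_size):
                label = chunk.pop(LABEL_COLUMN)
                yield chunk.values, label.values
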
def dispatch(train_prefix, validation_prefix, job_dir, learning_rate,
             num_epochs, checkpoint_epochs, lam, dropout, model_file):
    train_tmp_prefix = train_prefix
    val_tmp_prefix = validation_prefix
    print(train_tmp_prefix, val_tmp_prefix)

    logger = logging.getLogger()
    sh = StreamHandler(stdout)
    logger.addHandler(sh)
    logger.setLevel(logging.INFO)
    logger.info('learning_rate=%s' % learning_rate)

    # Resume from an existing model file if one is given; otherwise build
    # a fresh model.
    if model_file is not None:
        if model_file.startswith('gs://'):
            cmd = 'gsutil cp %s /tmp' % model_file
            subprocess.check_call(cmd.split())
            real_model_file = '/tmp/%s' % model_file.split('/')[-1]
        else:
            real_model_file = model_file
        face_age_model = load_model(real_model_file, compile=False)
        face_age_model = model.compile_model(face_age_model, learning_rate)
    else:
        face_age_model = model.model_fn(learning_rate, lam, dropout)

    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to
    # GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)
        verbose = 1
    else:
        verbose = 2

    # Model checkpoint callback.
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=1,
                                                 period=checkpoint_epochs,
                                                 mode='max')

    # Validation data sequence.
    val_datasequence = FileDataSequence(val_tmp_prefix)

    # TensorBoard logs callback.
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)
    callbacks = [checkpoint, tblog]

    train_data_sequence = FileDataSequence(train_tmp_prefix)
    face_age_model.fit_generator(train_data_sequence,
                                 validation_data=val_datasequence,
                                 validation_steps=val_datasequence.length,
                                 steps_per_epoch=train_data_sequence.length,
                                 verbose=verbose,
                                 epochs=num_epochs,
                                 callbacks=callbacks)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem, then copy them over to
    # GCS.
    if job_dir.startswith("gs://"):
        face_age_model.save(FACE_AGE_MODEL)
        copy_file_to_gcs(job_dir, FACE_AGE_MODEL)
    else:
        face_age_model.save(os.path.join(job_dir, FACE_AGE_MODEL))

    # Convert the Keras model to TensorFlow SavedModel.
    model.to_savedmodel(face_age_model, os.path.join(job_dir, 'export'))

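# FileDataSequence is used above but never defined. A minimal sketch of a
# keras.utils.Sequence over pre-batched .npy files matching a prefix; the
# file layout (paired *_x.npy / *_y.npy arrays per batch) is an assumption:
import glob

import numpy as np
from keras.utils import Sequence


class FileDataSequence(Sequence):
    """Serve (x, y) batches from pre-batched .npy files under a prefix."""

    def __init__(self, prefix):
        self.x_files = sorted(glob.glob(prefix + '*_x.npy'))
        self.y_files = sorted(glob.glob(prefix + '*_y.npy'))
        self.length = len(self.x_files)  # .length is read by dispatch()

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        return np.load(self.x_files[idx]), np.load(self.y_files[idx])
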
def dispatch(train_files, eval_files, job_dir, train_steps, eval_steps,
             learning_rate, eval_frequency, num_epochs, checkpoint_epochs,
             gpus):
    # With several GPUs we use two models: one for training and one for
    # saving. The save model is assigned to the CPU; the training model
    # runs on the GPUs and is generated with multi_gpu_model.
    if gpus <= 1:
        model_train = model.model_fn(NUM_CHARS, window_size=WINDOWS_SIZE)
        model_save = model_train
    else:
        with tf.device("/cpu:0"):
            model_save = model.model_fn(NUM_CHARS, window_size=WINDOWS_SIZE)
        model_train = multi_gpu_model(model_save, gpus=gpus)

    model.compile_model(model_save, learning_rate)
    print(model_save.summary())
    model.compile_model(model_train, learning_rate)
    print(model_train.summary())

    try:
        os.makedirs(job_dir)
    except OSError:
        pass

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem,
    # then copy them over to GCS.
    checkpoint_path = FILE_PATH
    if not job_dir.startswith("gs://"):
        checkpoint_path = os.path.join(job_dir, checkpoint_path)

    # Model checkpoint callback.
    checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 monitor='val_loss',
                                                 verbose=0,
                                                 period=checkpoint_epochs,
                                                 mode='max')

    # Continuous eval callback.
    evaluation = ContinuousEval(eval_frequency, eval_files, learning_rate,
                                job_dir, steps=eval_steps)

    # TensorBoard logs callback.
    tblog = keras.callbacks.TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                        histogram_freq=0,
                                        write_graph=True,
                                        embeddings_freq=0)
    callbacks = [checkpoint, evaluation, tblog]

    x, y = model.get_array_x_y(train_files, train_steps, WINDOWS_SIZE,
                               NUM_CHARS)
    model_train.fit(x, y, epochs=num_epochs, callbacks=callbacks,
                    batch_size=500)

    # Unhappy hack to work around h5py not being able to write to GCS.
    # Force snapshots and saves to local filesystem,
    # then copy them over to GCS.
    if job_dir.startswith("gs://"):
        model_save.save(BEIRAS_MODEL)
        copy_file_to_gcs(job_dir, BEIRAS_MODEL)
    else:
        model_save.save(os.path.join(job_dir, BEIRAS_MODEL))

    # Convert the Keras model to TensorFlow SavedModel.
    model.to_savedmodel(model_save, os.path.join(job_dir, 'export'))