def _get_temp_gcs_path():
    path = f"gs://{GCS_TEST_BUCKET}/" + "".join(
        random.choice(string.ascii_lowercase) for _ in range(16)
    )
    gfile.mkdir(path)
    yield path + "/file.name"
    gfile.rmtree(path)
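# A minimal usage sketch, assuming the generator above is meant to be consumed
# as a context manager (the contextlib wrapper and the test body below are
# assumptions, not from the source).
import contextlib

@contextlib.contextmanager
def _temp_gcs_path():
    yield from _get_temp_gcs_path()

def test_write_read_roundtrip():
    with _temp_gcs_path() as path:
        with gfile.GFile(path, "w") as f:
            f.write("hello")
        with gfile.GFile(path, "r") as f:
            assert f.read() == "hello"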
def test_get_policy_model_files(self):
  output_dir = self.get_temp_dir()

  def write_policy_model_file(epoch):
    with gfile.GFile(
        ppo.get_policy_model_file_from_epoch(output_dir, epoch), 'w') as f:
      f.write('some data')

  epochs = [200, 100, 300]

  # 300, 200, 100
  expected_policy_model_files = [
      output_dir + '/model-000300.pkl',
      output_dir + '/model-000200.pkl',
      output_dir + '/model-000100.pkl',
  ]

  for epoch in epochs:
    write_policy_model_file(epoch)

  policy_model_files = ppo.get_policy_model_files(output_dir)

  self.assertEqual(expected_policy_model_files, policy_model_files)

  gfile.rmtree(output_dir)
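# A sketch of the filename helper consistent with the expected paths in the
# test above (assumption: epoch numbers are zero-padded to six digits; the
# real ppo module presumably defines this elsewhere).
import os

def get_policy_model_file_from_epoch(output_dir, epoch):
  # e.g. epoch=300 -> '<output_dir>/model-000300.pkl'
  return os.path.join(output_dir, 'model-%06d.pkl' % epoch)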
def maybe_load_checkpoint(logdir, optimizer, clobber_checkpoint=False):
  if not clobber_checkpoint:
    if has_checkpoint(logdir):
      print("Loading checkpoint from %s" % logdir)
      optimizer = checkpoints.restore_checkpoint(logdir, optimizer)
      print("Checkpoint loaded from step %d" % optimizer.state.step)
  else:
    if gfile.isdir(logdir):
      gfile.rmtree(logdir)
  return optimizer
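# A minimal sketch of the has_checkpoint helper referenced above (assumption:
# checkpoints are written by flax's checkpoints.save_checkpoint, whose default
# filename prefix is "checkpoint_").
def has_checkpoint(logdir, prefix="checkpoint_"):
  # True if the directory exists and holds at least one checkpoint file.
  return (gfile.isdir(logdir) and
          any(f.startswith(prefix) for f in gfile.listdir(logdir)))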
def remove(self, path: str) -> bool:
  try:
    if not gfile.isdir(path):
      os.remove(path)
    else:
      gfile.rmtree(path)
    return True
  except Exception as e:  # pylint: disable=broad-except
    logging.error('Error during remove %s', str(e))
    return False
def prepare_dirs(recreate=False):
  """Prepare the config directories.

  When recreate is True, remove any dirs left over from a previous execution
  and recreate them. When recreate is False, keep the previous execution's
  dirs and only create the ones that are missing.
  """
  experiment_dir = environment.EXPERIMENT_DIR
  tensorboard_dir = environment.TENSORBOARD_DIR
  checkpoints_dir = environment.CHECKPOINTS_DIR

  if recreate:
    message = """
Delete and recreate these dirs:
experiment_dir: {experiment_dir}
tensorboard_dir: {tensorboard_dir}
checkpoints_dir: {checkpoints_dir}
""".format(experiment_dir=experiment_dir,
           tensorboard_dir=tensorboard_dir,
           checkpoints_dir=checkpoints_dir)
  else:
    message = """
Create these dirs if they don't exist:
experiment_dir: {experiment_dir}
tensorboard_dir: {tensorboard_dir}
checkpoints_dir: {checkpoints_dir}
""".format(experiment_dir=experiment_dir,
           tensorboard_dir=tensorboard_dir,
           checkpoints_dir=checkpoints_dir)

  print(message)

  if recreate:
    if gfile.exists(experiment_dir):
      gfile.rmtree(experiment_dir)
    if gfile.exists(tensorboard_dir):
      gfile.rmtree(tensorboard_dir)
    if gfile.exists(checkpoints_dir):
      gfile.rmtree(checkpoints_dir)

  if not gfile.exists(experiment_dir):
    gfile.makedirs(experiment_dir)
  if not gfile.exists(tensorboard_dir):
    gfile.makedirs(tensorboard_dir)
  if not gfile.exists(checkpoints_dir):
    gfile.makedirs(checkpoints_dir)
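# A minimal stub of the environment module referenced above, just to make the
# function runnable in isolation (these paths are made-up assumptions; the
# real module presumably derives them from the run configuration).
class environment:  # pylint: disable=invalid-name
  EXPERIMENT_DIR = '/tmp/experiments/my_run'
  TENSORBOARD_DIR = '/tmp/experiments/my_run/tensorboard'
  CHECKPOINTS_DIR = '/tmp/experiments/my_run/checkpoints'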
def tmp_dir(self):
  tmp = tempfile.mkdtemp(dir=self.get_temp_dir())
  yield tmp
  gfile.rmtree(tmp)
def tearDown(self):
  super(TestSklearnModel, self).tearDown()
  self.model.clean_up()
  if gfile.exists(self.gcs_path):
    gfile.rmtree(self.gcs_path)
def rmtree(path: str):
  """Recursively deletes the directory at path via gfile."""
  gfile.rmtree(path)
def remove(self, path: str) -> bool:
  # os.remove and gfile.rmtree both return None, so return True explicitly
  # to honor the declared bool return type.
  if not gfile.isdir(path):
    os.remove(path)
  else:
    gfile.rmtree(path)
  return True
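# Behavior sketch for the remove() above (the FileUtil owner class and paths
# are hypothetical; unlike the try/except variant earlier, errors propagate):
#   fu = FileUtil()
#   fu.remove('/tmp/scratch/data.txt')  # regular file -> os.remove, True
#   fu.remove('/tmp/scratch')           # directory    -> gfile.rmtree, True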
def test_load_from_directory(self):
  output_dir = self.get_temp_dir()

  epochs = [0, 1, 2]
  env_ids = [0, 1, 2]
  temperatures = [0.5, 1.0]
  random_strings = ["a", "b"]

  # Write some trajectories.
  # There are 3x3x2x2 (36) trajectories, and of them 3x2x2 (12) are done.
  for epoch in epochs:
    for env_id in env_ids:
      for temperature in temperatures:
        for random_string in random_strings:
          traj = trajectory.Trajectory(time_steps=[
              time_step.TimeStep(
                  observation=epoch,
                  done=(epoch == 0),
                  raw_reward=1.0,
                  processed_reward=1.0,
                  action=env_id,
                  info={})
          ])
          trajectory_file_name = trajectory.TRAJECTORY_FILE_FORMAT.format(
              epoch=epoch, env_id=env_id, temperature=temperature,
              r=random_string)
          with gfile.GFile(
              os.path.join(output_dir, trajectory_file_name), "w") as f:
            trajectory._get_pickle_module().dump(traj, f)

  # Load everything and check.
  bt = trajectory.BatchTrajectory.load_from_directory(output_dir)
  self.assertIsInstance(bt, trajectory.BatchTrajectory)
  self.assertEqual(36, bt.num_completed_trajectories)
  self.assertEqual(36, bt.batch_size)

  bt = trajectory.BatchTrajectory.load_from_directory(output_dir, epoch=0)
  self.assertEqual(12, bt.num_completed_trajectories)
  self.assertEqual(12, bt.batch_size)

  # Ask for 100 trajectories; only 12 exist, so with max_tries=0 this fails.
  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=0, n_trajectories=100, max_tries=0)
  self.assertIsNone(bt)

  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=0, temperature=0.5)
  self.assertEqual(6, bt.num_completed_trajectories)
  self.assertEqual(6, bt.batch_size)

  bt = trajectory.BatchTrajectory.load_from_directory(output_dir, epoch=1)
  self.assertEqual(12, bt.num_completed_trajectories)
  self.assertEqual(12, bt.batch_size)

  # Constraints cannot be satisfied without up-sampling.
  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=1, n_trajectories=100, up_sample=False, max_tries=0)
  self.assertIsNone(bt)

  # Constraints can be satisfied with up-sampling.
  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=1, n_trajectories=100, up_sample=True, max_tries=0)
  self.assertEqual(100, bt.num_completed_trajectories)
  self.assertEqual(100, bt.batch_size)

  bt = trajectory.BatchTrajectory.load_from_directory(
      output_dir, epoch=1, n_trajectories=10)
  self.assertEqual(10, bt.num_completed_trajectories)
  self.assertEqual(10, bt.batch_size)

  gfile.rmtree(output_dir)
def run(workdir, data, strategy, architecture, n_layers, n_hiddens,
        activation, dropout_rate, l2_penalty, w_init_name, b_init_name,
        optimizer_name, learning_rate, n_epochs, epochs_between_checkpoints,
        init_stddev, cnn_stride, reduce_learningrate=False, verbosity=0):
  """Runs the whole training procedure."""
  data_tr, data_te, dataset_info = data
  n_outputs = dataset_info['num_classes']

  with strategy.scope():
    optimizer = tf.keras.optimizers.get(optimizer_name)
    optimizer.learning_rate = learning_rate

    w_init = tf.keras.initializers.get(w_init_name)
    if w_init_name.lower() in ['truncatednormal', 'randomnormal']:
      w_init.stddev = init_stddev
    b_init = tf.keras.initializers.get(b_init_name)
    if b_init_name.lower() in ['truncatednormal', 'randomnormal']:
      b_init.stddev = init_stddev
    w_reg = tf.keras.regularizers.l2(l2_penalty) if l2_penalty > 0 else None

    if architecture in ('cnn', 'cnnbn'):
      model = build_cnn(n_layers, n_hiddens, n_outputs, dropout_rate,
                        activation, cnn_stride, w_reg, w_init, b_init,
                        architecture == 'cnnbn')
    elif architecture == 'fcn':
      model = build_fcn(n_layers, n_hiddens, n_outputs, dropout_rate,
                        activation, w_reg, w_init, b_init, False)
    else:
      assert False, 'Unknown architecture: %s' % architecture

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy', 'mse', 'sparse_categorical_crossentropy'])

    # Force the model to set input shapes and init weights.
    for x, _ in data_tr:
      model.predict(x)
      if verbosity:
        model.summary()
      break

  ckpt = tf.train.Checkpoint(
      step=optimizer.iterations, optimizer=optimizer, model=model)
  ckpt_dir = os.path.join(workdir, 'temporary-ckpt')
  ckpt_manager = tf.train.CheckpointManager(ckpt, ckpt_dir, max_to_keep=3)
  if ckpt_manager.latest_checkpoint:
    logging.info('restoring checkpoint: %s', ckpt_manager.latest_checkpoint)
    print('restoring from %s' % ckpt_manager.latest_checkpoint)
    with strategy.scope():
      ckpt.restore(ckpt_manager.latest_checkpoint)
    info = restore_results(os.path.join(workdir, '.intermediate-results.json'))
    print(info, flush=True)
  else:
    info = {
        'steps': 0,
        'start_time': time.time(),
        'train_loss': dict(),
        'train_accuracy': dict(),
        'test_loss': dict(),
        'test_accuracy': dict(),
    }
    info.update(_get_workunit_params())  # Add command line parameters.

  logger = None
  starting_epoch = len(info['train_loss'])
  cur_epoch = starting_epoch
  for cur_epoch in range(starting_epoch, n_epochs):
    if reduce_learningrate and cur_epoch == n_epochs - (n_epochs // 10):
      optimizer.learning_rate = learning_rate / 10
    elif reduce_learningrate and cur_epoch == n_epochs - 2:
      optimizer.learning_rate = learning_rate / 100

    # Train until we reach the criterion or get NaNs.
    try:
      # Always keep checkpoints for the first few epochs. We evaluate first
      # and train afterwards so we have the at-init data.
      if cur_epoch < 4 or (cur_epoch % epochs_between_checkpoints) == 0:
        eval_model(model, data_tr, data_te, info, logger, cur_epoch, workdir)

      model.fit(data_tr, epochs=1, verbose=verbosity)
      ckpt_manager.save()
      store_results(info, os.path.join(workdir, '.intermediate-results.json'))

      dt = time.time() - info['start_time']
      logging.info('epoch %d (%3.2fs)', cur_epoch, dt)
    except tf.errors.InvalidArgumentError as e:
      # We got NaN in the loss; most likely gradients resulted in NaNs.
      logging.info(str(e))
      info['status'] = 'NaN'
      logging.info('Stop training because NaNs encountered')
      break

  eval_model(model, data_tr, data_te, info, logger, cur_epoch + 1, workdir)
  store_results(info, os.path.join(workdir, 'results.json'))

  # We don't need the temporary checkpoints anymore.
  gfile.rmtree(os.path.join(workdir, 'temporary-ckpt'))
  gfile.remove(os.path.join(workdir, '.intermediate-results.json'))
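# Minimal sketches of the result-(de)serialization helpers used in run()
# (assumption: results are plain JSON written through gfile, so the same code
# works for local paths and GCS paths alike).
import json

def store_results(info, json_path):
  # Persist the metrics dict as JSON.
  with gfile.GFile(json_path, 'w') as f:
    json.dump(info, f)

def restore_results(json_path):
  # Load a previously stored metrics dict.
  with gfile.GFile(json_path, 'r') as f:
    return json.load(f)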