def test_end_to_end(self):
    with open("keepsake.yaml", "w") as f:
        f.write('repository: "file://.keepsake"')
    with open("foo.txt", "w") as f:
        f.write("foo")
    with open("bar.txt", "w") as f:
        f.write("bar")

    experiment = keepsake.init(path=".", params={"myint": 10, "myfloat": 0.1})

    with open("bar.txt", "w") as f:
        f.write("barrrr")
    experiment.checkpoint(path="bar.txt", metrics={"value": 123.45})

    experiment = keepsake.experiments.get(experiment.id)
    self.assertEqual(10, experiment.params["myint"])
    self.assertEqual(0.1, experiment.params["myfloat"])
    self.assertEqual(123.45, experiment.checkpoints[0].metrics["value"])

    foo = experiment.checkpoints[0].open("foo.txt")
    self.assertEqual("foo", foo.read().decode("utf-8"))
    bar = experiment.checkpoints[0].open("bar.txt")
    self.assertEqual("barrrr", bar.read().decode("utf-8"))

    # plot() needs a plotting backend; with none installed it should raise
    with self.assertRaises(ImportError):
        experiment.plot("value")
def test_real_example(self) -> None:
    experiment = keepsake.init()
    checkpoints = [
        (10000, 0.42, 1.34),
        (20000, 0.56, 0.17),
        (30000, 0.59363, 0.10),
        (40000, 0.58, 0.076),
        (50000, 0.61, 0.06),
        (60000, 0.61, 0.04),
        (70000, 0.61, 0.04),
        (80000, 0.61, 0.03),
        (90000, 0.62, 0.02),
        (100000, 0.61, 0.02),
    ]
    for step, eval_score, train_score in checkpoints:
        experiment.checkpoint(
            step=step,
            metrics={"eval": eval_score, "train": train_score},
            primary_metric=("eval", "maximize"),
        )
    self.assertEqual(
        train_model.checkpoints_to_delete(experiment),
        [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],
    )
    experiment.delete()
def test_only_one_checkpoint(self) -> None:
    experiment = keepsake.init(params={})
    experiment.checkpoint(
        step=100,
        metrics={"eval_execution_f1": 0.7},
        primary_metric=("eval_execution_f1", "maximize"),
    )
    self.assertEqual(train_model.checkpoints_to_delete(experiment), [])
    experiment.delete()
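# The two tests above pin down the deletion policy: keep the checkpoint with
# the best primary metric plus the most recent one, delete the rest. A minimal
# sketch of such a policy (an illustration consistent with these tests, not
# necessarily train_model's actual implementation):
def checkpoints_to_delete(experiment):
    checkpoints = experiment.checkpoints
    if len(checkpoints) <= 1:
        return []
    best = experiment.best()  # keepsake selects this via primary_metric
    keep_ids = {best.id, checkpoints[-1].id}
    return [c.step for c in checkpoints if c.id not in keep_ids]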
def test_project_repository_version(temp_workdir):
    with open("keepsake.yaml", "w") as f:
        f.write("repository: file://.keepsake")
    experiment = keepsake.init()

    expected = """{"version":1}"""
    with open(".keepsake/repository.json") as f:
        assert f.read() == expected

    # no error on second init
    experiment = keepsake.init()
    with open(".keepsake/repository.json") as f:
        # repository.json shouldn't have changed
        assert f.read() == expected

    with open(".keepsake/repository.json", "w") as f:
        f.write("""{"version":2}""")
    with pytest.raises(IncompatibleRepositoryVersion):
        keepsake.init()
def test_is_running(temp_workdir):
    with open("keepsake.yaml", "w") as f:
        f.write("repository: file://.keepsake/")

    experiment = keepsake.init()
    heartbeat_path = f".keepsake/metadata/heartbeats/{experiment.id}.json"
    assert wait(lambda: os.path.exists(heartbeat_path),
                timeout_seconds=10,
                sleep_seconds=0.01)

    # Check whether experiment is running after heartbeats are started
    assert experiment.is_running()

    # Heartbeats stopped
    experiment.stop()
    assert not experiment.is_running()
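# wait() in these tests has the same signature as the `waiting` package's
# wait(). If that dependency is unavailable, a minimal stand-in (an
# assumption, not the project's actual helper) could be:
import time

def wait(predicate, timeout_seconds=5, sleep_seconds=0.01):
    # Poll predicate until it returns truthy or the deadline passes.
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(sleep_seconds)
    raise TimeoutError(f"condition not met within {timeout_seconds} seconds")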
def test_integration(self) -> None:
    experiment = keepsake.init()
    checkpoints = [
        (10000, 0.42, 1.34),
        (20000, 0.56, 0.17),
        (30000, 0.59363, 0.10),
        (40000, 0.58, 0.076),
        (50000, 0.61, 0.06),
        (60000, 0.61, 0.04),
        (70000, 0.61, 0.04),
        (80000, 0.61, 0.03),
        (90000, 0.62, 0.02),
        (100000, 0.61, 0.02),
    ]
    for step, eval_score, train_score in checkpoints:
        experiment.checkpoint(
            step=step,
            metrics={"eval": eval_score, "train": train_score},
            primary_metric=("eval", "maximize"),
        )
    self.assertEqual(
        train_model.checkpoints_to_delete(experiment),
        [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        # Steps 10000-80000 are expected to be deleted...
        for step in range(10000, 90000, 10000):
            self._make_checkpoint_helper(tmpdir, step)
        # ...while the best (90000) and latest (100000) should survive.
        files = self._make_checkpoint_helper(tmpdir, 90000)
        files += self._make_checkpoint_helper(tmpdir, 100000)

        for step in train_model.checkpoints_to_delete(experiment):
            train_model.delete_checkpoint(tmpdir, step)
        self.assertEqual(set(os.listdir(tmpdir)), set(files))

    experiment.delete()
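# Hypothetical sketches of the helpers exercised in test_integration. In the
# real code, _make_checkpoint_helper is a method on the test class and
# delete_checkpoint lives in train_model; the "ckpt-<step>.*" file layout is
# an assumption made for illustration only.
import glob

def _make_checkpoint_helper(model_dir, step):
    # Create placeholder files mimicking one step's TF checkpoint; return
    # the file names so the test can assert what should survive deletion.
    files = ["ckpt-{}.index".format(step),
             "ckpt-{}.data-00000-of-00001".format(step)]
    for name in files:
        open(os.path.join(model_dir, name), "w").close()
    return files

def delete_checkpoint(model_dir, step):
    # Remove every file belonging to the given step's checkpoint.
    for path in glob.glob(os.path.join(model_dir, "ckpt-{}.*".format(step))):
        os.remove(path)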
def train(learning_rate, num_epochs):
    # highlight-start
    # Create an "experiment". This represents a run of your training script.
    # It saves the training code at the given path and any hyperparameters.
    experiment = keepsake.init(
        path=".",
        params={
            "learning_rate": learning_rate,
            "num_epochs": num_epochs,
        },
    )
    # highlight-end

    print("Downloading data set...")
    iris = load_iris()
    train_features, val_features, train_labels, val_labels = train_test_split(
        iris.data,
        iris.target,
        train_size=0.8,
        test_size=0.2,
        random_state=0,
        stratify=iris.target,
    )
    train_features = torch.FloatTensor(train_features)
    val_features = torch.FloatTensor(val_features)
    train_labels = torch.LongTensor(train_labels)
    val_labels = torch.LongTensor(val_labels)

    torch.manual_seed(0)
    model = nn.Sequential(
        nn.Linear(4, 15),
        nn.ReLU(),
        nn.Linear(15, 3),
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(train_features)
        loss = criterion(outputs, train_labels)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            model.eval()
            output = model(val_features)
            acc = (output.argmax(1) == val_labels).float().sum() / len(val_labels)

        print("Epoch {}, train loss: {:.3f}, validation accuracy: {:.3f}".format(
            epoch, loss.item(), acc))

        torch.save(model, "model.pth")
        # highlight-start
        # Create a checkpoint within the experiment.
        # This saves the metrics at that point, and makes a copy of the file
        # or directory given, which could be weights and any other artifacts.
        experiment.checkpoint(
            path="model.pth",
            step=epoch,
            metrics={"loss": loss.item(), "accuracy": acc},
            primary_metric=("loss", "minimize"),
        )
        # highlight-end
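# Example invocation of the tutorial's train() function; the hyperparameter
# values are illustrative, not prescribed by the docs:
if __name__ == "__main__":
    train(learning_rate=0.01, num_epochs=100)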
def main(unused_argv: Any) -> None:
    tf.logging.info("Saving model checkpoints and results to " + FLAGS.model_dir)
    global_seed(42)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train`, `do_eval` must be True.")

    config = model_config.load_config(FLAGS.config)

    if FLAGS.do_train:
        tf.logging.info("Training with train filenames: " +
                        str(FLAGS.training_filename))

    # Training allows noisy examples so do not use clean output vocab
    model_fn = model_builder.build_model_fn(config,
                                            FLAGS.output_vocab_filepath,
                                            clean_output_vocab_path="")

    # region training
    if FLAGS.do_train:
        # for keepsake CLI (helps track experiment results)
        experiment = keepsake.init(
            params={
                "learning_rate": config.training_options.optimizer_learning_rate,
                "batch_size": config.training_options.batch_size,
                "training_steps": config.training_options.training_steps,
                "eval_batch_size": FLAGS.eval_batch_size,
                "training_data": FLAGS.training_filename,
                "eval_data": FLAGS.eval_filename,
            },
        )

        train_input_fn = input_pipeline.create_training_input_fn(
            config,
            FLAGS.tf_examples_dir,
            [name for name in FLAGS.training_filename if name],
        )
        train_features, train_labels = train_input_fn()
        train_model = model_fn(train_features, train_labels,
                               tf.estimator.ModeKeys.TRAIN)
        tf.get_variable_scope().reuse_variables()

        inference_config = inference.Config(
            FLAGS.eval_dataset_name,
            FLAGS.eval_splits.split(","),
            FLAGS.output_vocab_filepath,
            FLAGS.clean_output_vocab_filepath,
            FLAGS.eval_beam_size,
            FLAGS.using_abstract_sql,
            FLAGS.database_directory,
            FLAGS.empty_database_directory,
            FLAGS.original_data_directory,
            model_config.load_config(FLAGS.config),
        )

        saver = tf.train.Saver(max_to_keep=None)
        global_step = 0
        checkpoint = checkpoint_path(FLAGS.model_dir, global_step)
        validation_query_cache: Dict[str, Any] = {}

        with tf.Session() as init_sess:
            init_sess.run(tf.global_variables_initializer())
            saver.save(init_sess, checkpoint)

        while global_step < config.training_options.training_steps:
            # region training loop
            with tf.Session() as train_sess:
                tf.logging.info(
                    "Training from step %s to step %s",
                    global_step,
                    global_step + FLAGS.steps_between_saves,
                )
                saver.restore(train_sess, checkpoint)

                train_losses = []
                for step in range(FLAGS.steps_between_saves):
                    _, train_loss = train_sess.run(
                        [train_model.train_op, train_model.loss])
                    train_losses.append(train_loss)
                    if step % 100 == 0:
                        tf.logging.info(
                            "Step %s's training loss: %s",
                            global_step + step,
                            train_loss,
                        )
                train_loss = statistics.mean(train_losses)

                global_step += FLAGS.steps_between_saves
                checkpoint = checkpoint_path(FLAGS.model_dir, global_step)
                saver.save(train_sess, checkpoint)
            # endregion

            # region eval loop
            tf.logging.info("Evaluating checkpoint %s", checkpoint)
            examples = inference.load_tf_examples(
                os.path.join(FLAGS.tf_examples_dir, FLAGS.eval_filename))
            random.shuffle(examples)

            tf.logging.info("Running inference on %s", FLAGS.eval_filename)
            predictions = inference.inference(
                examples,
                checkpoint,
                inference_config,
            )
            examples_to_execute = get_examples_to_execute(
                predictions, inference_config)

            # Only update cache when it's empty
            should_update_cache = len(validation_query_cache) == 0
            # Scholar is the only case-insensitive dataset
            case_sensitive = "scholar" not in FLAGS.eval_dataset_name.lower()
            results, validation_query_cache = official_evaluation.execute_predictions(
                instructions=examples_to_execute,
                cache_dict=validation_query_cache,
                case_sensitive=case_sensitive,
                verbose=False,
                update_cache=should_update_cache,
            )
            metrics = official_evaluation.aggregate_metrics(
                results, FLAGS.use_empty_tables)
tf.logging.info("Validation Results:\n\tExecution F1: %s", metrics.execution_f1) # endregion experiment.checkpoint( step=global_step, metrics={ "train_loss": train_loss, "eval_execution_f1": metrics.execution_f1, "eval_string_match": metrics.string_same, }, primary_metric=("eval_execution_f1", "maximize"), ) # region disk management for step in checkpoints_to_delete(experiment): assert ( step != global_step ), f"Can't delete step {step}; need it for next training epoch starting at step {global_step}" print(f"Deleting checkpoint {step}") delete_checkpoint(FLAGS.model_dir, step)
def test_init_and_checkpoint(temp_workdir):
    with open("keepsake.yaml", "w") as f:
        f.write("repository: file://.keepsake/")

    with open("train.py", "w") as fh:
        fh.write("print(1 + 1)")
    with open("README.md", "w") as fh:
        fh.write("Hello")

    # basic experiment
    experiment = keepsake.init(path=".",
                               params={"learning_rate": 0.002},
                               disable_heartbeat=True)
    experiment_tar_path = ".keepsake/experiments/{}.tar.gz".format(experiment.id)
    wait(
        lambda: os.path.exists(experiment_tar_path),
        timeout_seconds=5,
        sleep_seconds=0.01,
    )
    time.sleep(0.1)  # wait for file to be written

    assert len(experiment.id) == 64
    with open(".keepsake/metadata/experiments/{}.json".format(experiment.id)) as fh:
        metadata = json.load(fh)
    assert metadata["id"] == experiment.id
    assert metadata["params"] == {"learning_rate": 0.002}
    assert metadata["host"] == ""
    assert metadata["user"] != ""
    # FIXME: this is broken https://github.com/replicate/keepsake/issues/492
    assert metadata["config"]["repository"].startswith("file://")
    assert metadata["command"] != ""
    assert metadata["path"] == "."
    assert metadata["python_version"] != ""
    assert len(metadata["python_packages"]) > 0
    assert metadata["keepsake_version"] != ""

    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(experiment_tar_path) as tar:
            tar.extractall(tmpdir)
        assert (open(os.path.join(tmpdir, experiment.id, "train.py")).read()
                == "print(1 + 1)")
        assert os.path.exists(os.path.join(tmpdir, experiment.id, "README.md"))

    # checkpoint with a file
    with open("weights", "w") as fh:
        fh.write("1.2kg")
    checkpoint = experiment.checkpoint(path="weights",
                                       step=1,
                                       metrics={"validation_loss": 0.123})
    checkpoint_tar_path = ".keepsake/checkpoints/{}.tar.gz".format(checkpoint.id)
    wait(
        lambda: os.path.exists(checkpoint_tar_path),
        timeout_seconds=5,
        sleep_seconds=0.01,
    )
    time.sleep(0.1)  # wait for file to be written

    assert len(checkpoint.id) == 64
    with open(".keepsake/metadata/experiments/{}.json".format(experiment.id)) as fh:
        metadata = json.load(fh)
    assert len(metadata["checkpoints"]) == 1
    checkpoint_metadata = metadata["checkpoints"][0]
    assert checkpoint_metadata["id"] == checkpoint.id
    assert checkpoint_metadata["step"] == 1
    assert checkpoint_metadata["metrics"] == {"validation_loss": 0.123}

    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(checkpoint_tar_path) as tar:
            tar.extractall(tmpdir)
        assert open(os.path.join(tmpdir, checkpoint.id, "weights")).read() == "1.2kg"
        assert not os.path.exists(os.path.join(tmpdir, checkpoint.id, "train.py"))

    # checkpoint with a directory
    os.mkdir("data")
    with open("data/weights", "w") as fh:
        fh.write("1.3kg")
    checkpoint = experiment.checkpoint(path="data",
                                       step=1,
                                       metrics={"validation_loss": 0.123})
    checkpoint_tar_path = ".keepsake/checkpoints/{}.tar.gz".format(checkpoint.id)
    wait(
        lambda: os.path.exists(checkpoint_tar_path),
        timeout_seconds=5,
        sleep_seconds=0.01,
    )
    time.sleep(0.1)  # wait for file to be written

    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(checkpoint_tar_path) as tar:
            tar.extractall(tmpdir)
        assert (open(os.path.join(tmpdir, checkpoint.id, "data/weights")).read()
                == "1.3kg")
        assert not os.path.exists(os.path.join(tmpdir, checkpoint.id, "train.py"))

    # checkpoint with no path
    checkpoint = experiment.checkpoint(path=None,
                                       step=1,
                                       metrics={"validation_loss": 0.123})
    # wait in case async process tries to create a path anyway
    time.sleep(0.5)
    with open(".keepsake/metadata/experiments/{}.json".format(experiment.id)) as fh:
        metadata = json.load(fh)
    assert metadata["checkpoints"][-1]["id"] == checkpoint.id
    assert not os.path.exists(".keepsake/checkpoints/{}.tar.gz".format(checkpoint.id))

    # experiment with file
    experiment = keepsake.init(path="train.py",
                               params={"learning_rate": 0.002},
                               disable_heartbeat=True)
    experiment_tar_path = ".keepsake/experiments/{}.tar.gz".format(experiment.id)
    wait(
        lambda: os.path.exists(experiment_tar_path),
        timeout_seconds=5,
        sleep_seconds=0.01,
    )
    time.sleep(0.1)  # wait for file to be written
    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(experiment_tar_path) as tar:
            tar.extractall(tmpdir)
        assert (open(os.path.join(tmpdir, experiment.id, "train.py")).read()
                == "print(1 + 1)")
        assert not os.path.exists(os.path.join(tmpdir, experiment.id, "README.md"))

    # experiment with no path!
    experiment = keepsake.init(path=None,
                               params={"learning_rate": 0.002},
                               disable_heartbeat=True)
    # wait in case async process tries to create a path anyway
    time.sleep(0.5)
    with open(".keepsake/metadata/experiments/{}.json".format(experiment.id)) as fh:
        metadata = json.load(fh)
    assert metadata["id"] == experiment.id
    assert metadata["params"] == {"learning_rate": 0.002}
    assert not os.path.exists(".keepsake/experiments/{}.tar.gz".format(experiment.id))
def test_init_without_config_file(temp_workdir):
    with pytest.raises(ConfigNotFound):
        keepsake.init()
def test_init_with_config_file(temp_workdir):
    with open("keepsake.yaml", "w") as f:
        f.write("repository: file://.keepsake/")
    experiment = keepsake.init()
    assert isinstance(experiment, Experiment)
    experiment.stop()
def on_pretrain_routine_start(self, trainer, pl_module):
    self.experiment = keepsake.init(path=".", params=self.params)
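# For context, a minimal PyTorch Lightning callback wrapping the hook above
# might look like this. The KeepsakeCallback name, constructor, and the
# on_validation_epoch_end logic are illustrative assumptions, not a
# published API:
from pytorch_lightning import Callback

class KeepsakeCallback(Callback):
    def __init__(self, params=None):
        self.params = params or {}
        self.experiment = None

    def on_pretrain_routine_start(self, trainer, pl_module):
        # Start a keepsake experiment just before training begins.
        self.experiment = keepsake.init(path=".", params=self.params)

    def on_validation_epoch_end(self, trainer, pl_module):
        # Record whatever scalar metrics the trainer has accumulated.
        if self.experiment is not None:
            metrics = {k: float(v) for k, v in trainer.callback_metrics.items()}
            self.experiment.checkpoint(path=None,
                                       step=trainer.current_epoch,
                                       metrics=metrics)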