def fn_trainable(config, checkpoint_dir=None):
    # Restore training state from a checkpoint if one was provided.
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint.json"), "rt") as fp:
            state = json.load(fp)
    else:
        state = {"internal_iter": 0}

    for i in range(state["internal_iter"], config["max_iterations"]):
        state["internal_iter"] = i
        time.sleep(config["sleep_time"])

        # Periodically write the state to a Tune-managed checkpoint directory.
        if i % config["checkpoint_freq"] == 0:
            with tune.checkpoint_dir(step=i) as cd:
                with open(os.path.join(cd, "checkpoint.json"), "wt") as fp:
                    json.dump(state, fp)

        tune.report(
            score=i * 10 * config["score_multiplied"],
            internal_iter=state["internal_iter"],
        )
def _on_step(self):
    # Keep normalization statistics in sync between the training and eval envs.
    sync_envs_normalization(self.training_env, self.eval_env)
    episode_rewards, episode_lengths = evaluate_policy(
        self.model,
        self.eval_env,
        n_eval_episodes=self.n_eval_episodes,
        render=False,
        deterministic=self.deterministic,
        return_episode_rewards=True,
    )
    episode_reward_mean, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
    mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
    report(
        episode_reward_mean=episode_reward_mean,
        std_reward=std_reward,
        mean_ep_length=mean_ep_length,
        std_ep_length=std_ep_length,
    )
def tune_function(config, checkpoint_dir=None):
    trainer = Trainer(
        backend=backend_config,
        num_workers=num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker,
    )
    trainer.start()
    iterator = trainer.run_iterator(
        train_func, config, dataset=dataset, checkpoint=checkpoint_dir
    )
    # Report the first worker's results to Tune after every iteration.
    for results in iterator:
        first_worker_results = results[0]
        tune.report(**first_worker_results)
    trainer.shutdown()
def train(config, checkpoint=None):
    step = 0
    if checkpoint:
        with open(checkpoint) as f:
            step = json.loads(f.read())["timestep"]

    for timestep in range(step, 100):
        v = np.tanh(float(timestep) / config.get("width", 1))
        v *= config.get("height", 1)

        if timestep % 3 == 0:
            checkpoint_dir = tune.make_checkpoint_dir(step=timestep)
            path = os.path.join(checkpoint_dir, "checkpoint")
            with open(path, "w") as f:
                f.write(json.dumps({"timestep": timestep}))
            tune.save_checkpoint(path)

        # Here we use `episode_reward_mean`, but you can also report other
        # objectives such as loss or accuracy.
        tune.report(episode_reward_mean=v)
def eval_single_epoch(model: torch.nn.Module,
                      loss_function: torch.nn.Module,
                      data_loader: torch.utils.data.DataLoader):
    # switch to evaluate mode
    model.eval()
    accuracy_total = 0
    for data in data_loader:
        X, y = data
        X, y = X.to(device), y.to(device)
        output = model(X)  # .view(-1, 4096)
        loss = loss_function(output, y)
        accuracy_total += accuracy(y, output)
    accuracy_avg = 100.0 * accuracy_total / len(data_loader.dataset)

    # Report the epoch accuracy to Tune.
    tune.report(mean_accuracy=accuracy_avg)
    return {'Eval epoch accuracy': accuracy_avg}
def MockTrainingFuncSync(config, checkpoint_dir=None):
    iter = 0
    if checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
        with open(checkpoint_path, "rb") as fp:
            a, iter = pickle.load(fp)
    a = config["a"]  # Use the new hyperparameter if perturbed.

    while True:
        iter += 1
        with tune.checkpoint_dir(step=iter) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
            with open(checkpoint_path, "wb") as fp:
                pickle.dump((a, iter), fp)
        # Score gets better every iteration.
        time.sleep(1)
        tune.report(mean_accuracy=iter + a, a=a)
def train(config):
    import torch
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mode = config["mode"]
    net = Net(mode).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    optimizer = hvd.DistributedOptimizer(optimizer)

    num_steps = 5
    print(hvd.size())
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # To ensure consistent initialization across slots, broadcast the
    # parameters and optimizer state from rank 0.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    start = time.time()
    x_max = config["x_max"]
    for step in range(1, num_steps + 1):
        features = torch.Tensor(np.random.rand(1) * 2 * x_max - x_max).to(device)
        if mode == "square":
            labels = sq(features)
        else:
            labels = qu(features)
        optimizer.zero_grad()
        outputs = net(features)
        loss = torch.nn.MSELoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        time.sleep(0.1)
        tune.report(loss=loss.item())
    total = time.time() - start
    print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.")
def train_convnet(config, checkpoint_dir=None):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If checkpoint_dir is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if checkpoint_dir:
        print("Loading from checkpoint.")
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint["model_state_dict"])
        step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from tune.
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                # Then create a checkpoint file in this directory.
                path = os.path.join(checkpoint_dir, "checkpoint")
                # Save state to checkpoint file.
                # No need to save optimizer for SGD.
                torch.save(
                    {
                        "step": step,
                        "model_state_dict": model.state_dict(),
                        "mean_accuracy": acc,
                    },
                    path,
                )
        step += 1
        tune.report(mean_accuracy=acc)
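# --- Hypothetical scheduler sketch (assumption, not part of the original code) ---
# `train_convnet` above checkpoints and reports every step, the pattern commonly
# paired with Population Based Training. This is a minimal illustrative setup;
# the mutation ranges, perturbation_interval, and num_samples are placeholders,
# not values taken from the source.
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=5,  # matches the checkpoint frequency used above
    hyperparam_mutations={
        "lr": tune.uniform(0.001, 0.1),
        "momentum": [0.8, 0.9, 0.99],
    },
)

analysis = tune.run(
    train_convnet,
    metric="mean_accuracy",
    mode="max",
    scheduler=pbt_scheduler,
    num_samples=4,
    config={"lr": 0.01, "momentum": 0.9},
)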
def training_function(config):
    master = A3CMaster()
    master.learning_rate = config["learning_rate"]
    master.beta = config["beta"]
    master.gamma = config["gamma"]
    '''worker = A3CWorker(master.master_model, master.optimizer, 0, master.folder,
                         master.beta, master.gamma,
                         opponent_model_path=master.opponent_model_path)
    worker.run()'''
    """a3c_workers = [A3CWorker(master.master_model, master.optimizer, worker_id,
                                master.folder, master.beta, master.gamma,
                                opponent_model_path=master.opponent_model_path)
                      for worker_id in range(2)]
    for i, worker in enumerate(a3c_workers):
        worker.start()
    [worker.join() for worker in a3c_workers]"""
    master.train()
    agent_test = AgentTest(master, RandomPlayer(), 0)
    reward = agent_test.play()
    # del a3c_workers
    del master
    tune.report(reward=reward)
def train(config, checkpoint_dir=None):
    step = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            step = json.loads(f.read())["timestep"]

    for timestep in range(step, 100):
        v = np.tanh(float(timestep) / config.get("width", 1))
        v *= config.get("height", 1)

        # Checkpoint the state of the training every 3 steps.
        # Note that this is only required for certain schedulers.
        if timestep % 3 == 0:
            with tune.checkpoint_dir(step=timestep) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                with open(path, "w") as f:
                    f.write(json.dumps({"timestep": timestep}))

        # Here we use `episode_reward_mean`, but you can also report other
        # objectives such as loss or accuracy.
        tune.report(episode_reward_mean=v)
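# --- Hypothetical launch sketch (assumption, not part of the original code) ---
# One way the checkpointing trainable `train` above could be started with the
# legacy Ray Tune function API. The search-space bounds and num_samples are
# illustrative placeholders, not values taken from the source.
from ray import tune

analysis = tune.run(
    train,
    config={
        "width": tune.uniform(0.1, 20.0),
        "height": tune.uniform(-100.0, 100.0),
    },
    metric="episode_reward_mean",
    mode="max",
    num_samples=10,
)
print("Best config found:", analysis.best_config)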
def function_trainable(config):
    num_iters = int(config["num_iters"])
    sleep_time = config["sleep_time"]
    score = config["score"]

    checkpoint_iters = config["checkpoint_iters"]
    checkpoint_size_b = config["checkpoint_size_b"]
    checkpoint_num_items = checkpoint_size_b // 8  # np.float64

    for i in range(num_iters):
        if (checkpoint_iters >= 0 and checkpoint_size_b > 0
                and i % checkpoint_iters == 0):
            with tune.checkpoint_dir(step=i) as checkpoint_dir:
                checkpoint_file = os.path.join(checkpoint_dir, "bogus.ckpt")
                checkpoint_data = np.random.uniform(
                    0, 1, size=checkpoint_num_items)
                with open(checkpoint_file, "wb") as fp:
                    pickle.dump(checkpoint_data, fp)

        tune.report(score=i + score)
        time.sleep(sleep_time)
def train_figgie(config, checkpoint_dir=None):
    with open(r"/home/jmd6724/Documents/Figgie/ann/training_data.pickle", 'rb') as file:
        all_data = pickle.load(file)
    # Use the first 90% of the data for training and the rest for testing.
    point = int(9 / 10 * len(all_data))
    train_data = FiggieDataSet(all_data[:point])
    test_data = FiggieDataSet(all_data[point:])
    train_set = DataLoader(train_data, batch_size=64, shuffle=True)
    test_set = DataLoader(test_data, batch_size=64, shuffle=False)

    model = Net(config['l1'], config['l2'], config['l3'])
    loss_function = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=config['lr'])

    for epoch in range(10):
        train(model, optimizer, loss_function, train_set)
        acc = test(model, test_set)
        tune.report(mean_accuracy=acc)
        if epoch % 5 == 0:
            torch.save(
                model.state_dict(),
                r"/home/jmd6724/Documents/Figgie/ann/model_{}_{}_{}_{}.pth".format(
                    config['l1'], config['l2'], config['l3'], acc))
def _run_experiment(self, config, hyperopt_dict):
    trial_id = tune.get_trial_id()
    gpus_ids = ray.get_gpu_ids()
    if gpus_ids:
        gpus = ",".join(str(id) for id in gpus_ids)
    else:
        gpus = None

    modified_config = substitute_parameters(
        copy.deepcopy(hyperopt_dict["config"]), config)

    hyperopt_dict["config"] = modified_config
    hyperopt_dict["experiment_name"] = f'{hyperopt_dict["experiment_name"]}_{trial_id}'
    hyperopt_dict["gpus"] = gpus

    train_stats, eval_stats = run_experiment(**hyperopt_dict)
    metric_score = self.get_metric_score(train_stats, eval_stats)

    tune.report(
        parameters=str(config),
        metric_score=metric_score,
        training_stats=str(train_stats),
        eval_stats=str(eval_stats))
def worker_function(inner_ex_config, config):
    """
    Combines the experiment config and the auto-generated Ray config, and runs
    an iteration of inner_ex on that combined config.

    :param inner_ex_config: The current values of the inner experiment config,
        including any modifications made in a macro_experiment config update
    :param config: Config generated by Ray Tune
    :return:
    """
    from inner_experiment import inner_ex

    # Run inner_ex on the "base" config merged with the Ray experiment config.
    inner_ex_dict = dict(inner_ex_config)
    merged_config = update(inner_ex_dict, config)
    # This will create an observer in the Tune trial directory, meaning that
    # inner experiment configs will be saved at <trial.log_dir>/1
    observer = FileStorageObserver.create(tune.get_trial_dir())
    inner_ex.observers.append(observer)
    ret_val = inner_ex.run(config_updates=merged_config)
    tune.report(accuracy=ret_val.result)
def train(config, checkpoint_dir=None):
    start = i = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint.json"), "rt") as fp:
            state = json.load(fp)
            start = state["step"] + 1

    for i in range(start, start + 10):
        with tune.checkpoint_dir(i) as d:
            with open(os.path.join(d, "checkpoint.json"), "wt") as fp:
                json.dump({"step": i}, fp)
        tune.report(step=i)

    # These indicators will tell us if all trials saved their
    # checkpoints.
    with open(f"/cluster/shared/indicator.{tune.get_trial_id()}", "wt") as fp:
        fp.write("")

    # We continue training (without saving checkpoints) to make sure
    # that Tune's result handling is triggered (so that the
    # FailOnIndicator callback is invoked).
    time.sleep(6)
    tune.report(step=i + 1)
    time.sleep(6)

    if start == 0:
        # If this is the first round, we just sleep for some time
        # to make sure that the driver exits first (via the
        # FailOnIndicator).
        tune.report(step=i + 2)
        time.sleep(120)
def train_func(config):
    train_data = ray.get(data_id)
    val_data = ray.get(validation_data_id)
    config = convert_bayes_configs(config).copy()

    if not isinstance(model_builder, ModelBuilder):
        raise ValueError("You must input a ModelBuilder instance for model_builder")
    trial_model = model_builder.build(config)
    # No need to call build since it is called the first time fit_eval is called.
    # callbacks = [TuneCallback(tune_reporter)]

    # Fit the model.
    best_reward = None
    for i in range(1, 101):
        result = trial_model.fit_eval(
            data=train_data,
            validation_data=val_data,
            mc=mc,
            metric=metric,
            **config)
        reward = result
        checkpoint_filename = "best.ckpt"

        # Save the checkpoint when the reward improves.
        mode = Evaluator.get_metric_mode(metric)
        if mode == "max":
            has_best_reward = best_reward is None or reward > best_reward
        else:
            has_best_reward = best_reward is None or reward < best_reward
        if has_best_reward:
            best_reward = reward
            trial_model.save(checkpoint_filename)
            # Save to HDFS.
            if remote_dir is not None:
                put_ckpt_hdfs(remote_dir, checkpoint_filename)

        report_dict = {
            "training_iteration": i,
            metric: reward,
            "checkpoint": checkpoint_filename,
            "best_" + metric: best_reward,
        }
        tune.report(**report_dict)
def ray_fit(config):
    val_log_liks = []
    splitter = KFold(n_splits=n_splits)
    for (train_ind, val_ind) in splitter.split(X=train_context, y=train_inputs):
        train_inputs_, train_context_ = train_inputs[train_ind], train_context[train_ind]
        val_inputs_, val_context_ = train_inputs[val_ind], train_context[val_ind]

        flow = cls(
            inputs_size=self.inputs_size,
            context_size=self.context_size,
            device=self.device,
            context_normalization=self.context_normalization,
            inputs_normalization=self.inputs_normalization,
            cat_context=self.cat_context,
            **config)
        flow.fit(train_inputs_, train_context_, False)
        val_log_liks.append(flow.log_prob(val_inputs_, val_context_).mean())

    tune.report(log_lik=np.mean(val_log_liks))
def post_epoch_actions(trainer_instance: Engine):
    # Evaluate the model on the validation set.
    evaluator.run(val_loader)
    state_val_metrics = evaluator.state.metrics
    current_epoch: int = trainer_instance.state.epoch

    with tune.checkpoint_dir(current_epoch) as local_checkpoint_dir:
        # Save model, optimizer, trainer, and evaluator checkpoints.
        path = os.path.join(local_checkpoint_dir, "checkpoint")
        torch.save(
            (model.state_dict(), optimizer.state_dict(),
             trainer_instance.state_dict(), evaluator.state_dict()),
            path)

    # Report validation scores to Ray Tune.
    report_dict: dict = {
        **state_val_metrics,
        "done": current_epoch == epochs,
    }
    tune.report(**report_dict)
def _do_eval(self):
    results = self._func()

    if results:
        assert isinstance(
            results, dict
        ), "Eval function must return a dict. Got {} instead.".format(results)

        flattened_results = flatten_results_dict(results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v)
                )

    # Remove extra memory cache of the main process due to evaluation.
    torch.cuda.empty_cache()

    self.step += 1

    # Here we save a checkpoint. It is automatically registered with
    # Ray Tune and will potentially be passed as the `checkpoint_dir`
    # parameter in future iterations.
    with tune.checkpoint_dir(step=self.step) as checkpoint_dir:
        additional_state = {"iteration": int(self.trainer.iter)}
        Checkpointer(
            # Assume you want to save checkpoints together with logs/statistics
            self.trainer.model,
            checkpoint_dir,
            save_to_disk=True,
            optimizer=self.trainer.optimizer,
            scheduler=self.trainer.scheduler,
        ).save(name="checkpoint", **additional_state)

    metrics = dict(
        r1=results['Rank-1'],
        map=results['mAP'],
        score=(results['Rank-1'] + results['mAP']) / 2)
    tune.report(**metrics)
def train_breast_cancer(config):
    # Load dataset
    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    # Split into train and test set
    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.25)
    # Build input matrices for XGBoost
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)
    # Train the classifier
    bst = xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        callbacks=[XGBCallback])
    # Predict labels for the test set
    preds = bst.predict(test_set)
    pred_labels = np.rint(preds)
    # Return prediction accuracy
    accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels)
    tune.report(mean_accuracy=accuracy, done=True)
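# --- Hypothetical search-space sketch (assumption, not part of the original code) ---
# One way `train_breast_cancer` above could be tuned. The XGBoost parameters shown
# (objective, eval_metric, max_depth, min_child_weight, eta) are standard XGBoost
# options; the ranges and num_samples here are illustrative placeholders.
from ray import tune

search_space = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
    "max_depth": tune.randint(1, 9),
    "min_child_weight": tune.choice([1, 2, 3]),
    "eta": tune.loguniform(1e-4, 1e-1),
}

analysis = tune.run(
    train_breast_cancer,
    config=search_space,
    metric="mean_accuracy",
    mode="max",
    num_samples=10,
)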
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)
    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)
    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
                experiment_name, experiment_id, checkpoint_num, checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        # Only needed once at the start to save the imported model.
        chk_freq = 1

    while True:
        result = trainer.train()
        tune.report(**result)
        if trainer._iteration % chk_freq == 0:
            with tune.checkpoint_dir(step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
def train_mnist(config):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    model.to(device)

    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])

    for _i in range(10):
        train(model, optimizer, train_loader, device=device)
        acc = test(model, test_loader, device=device)

        # When using WandbLogger, the metrics reported to Tune are also logged
        # in the W&B dashboard.
        tune.report(mean_accuracy=acc)

        # @wandb_mixin enables logging custom metrics using wandb.log().
        error_rate = 100 * (1 - acc)
        wandb.log({"error_rate": error_rate})
def nas_report(study, trial):
    best_session = study.best_trials[0]
    print("Trial stats (#{}): Loss={} Accuracy={}".format(
        trial.number, *(list(best_session.values))))
    print("Best params so far (#{}): {}".format(
        best_session.number, best_session.params))
    finished_trials = list(
        filter(lambda trial: trial.state.is_finished(), study.trials))

    # Load the best trial's checkpoint and re-save it under the current trial's step.
    model_state = {}
    with tune.checkpoint_dir(step=best_session.number) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        model_state = torch.load(path)
    with tune.checkpoint_dir(step=trial.number) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        torch.save((best_session.params, model_state), path)

    result_zip = zip(["loss", "accuracy"], list(best_session.values))
    results = {p: v for p, v in result_zip}
    tune.report(**results)
def run_parameterised_experiment(config):
    # Hyperparameters
    trial_dir = tune.get_trial_dir()
    problem, method, other_config = config["main_params"]
    n_workers = config["n_workers"]

    experiment = CartpoleExperiment()
    experiment.nn_path = other_config["folder"]  # nn_paths_cartpole[other_config["nn_path"]]
    experiment.tau = other_config["tau"]
    if other_config["template"] == 2:  # octagon
        experiment.analysis_template = Experiment.octagon(experiment.env_input_size)
    elif other_config["template"] == 0:  # box
        experiment.analysis_template = Experiment.box(experiment.env_input_size)
    else:
        _, template = experiment.get_template(1)
        experiment.analysis_template = template  # standard
    experiment.n_workers = n_workers
    experiment.show_progressbar = False
    experiment.show_progress_plot = False
    # experiment.use_rounding = False
    experiment.save_dir = trial_dir
    experiment.update_progress_fn = update_progress

    elapsed_seconds, safe, max_t = experiment.run_experiment()

    # Encode the safety outcome: 1 = safe, -1 = unsafe, 0 = unknown.
    if safe is None:
        safe_value = 0
    elif safe:
        safe_value = 1
    else:
        safe_value = -1
    tune.report(elapsed_seconds=elapsed_seconds, safe=safe_value, max_t=max_t, done=True)
def ntk_experiment(config=None, checkpoint_dir=None, n_inputs=100, n_inits=100,
                   repetitions=1000, input_chunk_size=50):
    """Compute the error of the LeCun/NTK initialization empirically."""
    logging.basicConfig(level=logging.INFO)
    n_chunks = max(1, n_inputs // input_chunk_size)
    n_inputs = input_chunk_size * n_chunks
    with np_random_seed():
        data_all = np.random.randn(n_inputs, 1)

    for init in range(n_inits):
        exp = Experiment()
        print("GPUs", tf.config.experimental.list_physical_devices("GPU"))
        print("GPU", tf.test.gpu_device_name())

        for chunk in range(n_chunks):
            data = data_all[chunk * input_chunk_size:(chunk + 1) * input_chunk_size]
            out = exp.model_correct.predict(data)
            delta = exp.compute_error(data, repetitions=repetitions)

            for inp in range(input_chunk_size):
                tune.report({
                    'input': data[inp, 0],
                    'out': out[inp, 0],
                    'delta_mean': np.mean(delta[inp]),
                    'delta_std': np.std(delta[inp]),
                    'n_init': init,
                    'n_inp': inp + chunk * input_chunk_size,
                    'inp_chunk': chunk
                })
        del exp
def train_cifar_100(config):
    config = fill_config(config)

    # Data Setup
    train_loader, val_loader = get_data_loaders(round(config['batch_size']))

    # Model Setup
    model = models.resnet18()
    model.fc = nn.Linear(512, 100, bias=True)
    model = model.to(DEVICE)

    # Optimizer
    optimizer = optim.SGD(
        model.parameters(),
        lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"]
    )

    # LR Scheduler
    scheduler = optim.lr_scheduler.StepLR(
        optimizer, round(config['step']), gamma=cf.SCHEDULER_GAMMA)

    # Loss Criterion
    criterion = nn.CrossEntropyLoss()

    while True:
        train_acc = train(model, optimizer, criterion, train_loader)
        val_acc = test(model, val_loader)
        scheduler.step()

        # Send the current training result back to Tune.
        print('[log] time: ', time() - START_TIME)
        print('[log] ram: ', psutil.virtual_memory().used / (1024 ** 3) - START_RAM)
        print('[log] val_acc: ', val_acc)
        print('[log] train_acc: ', train_acc)
        tune.report(mean_accuracy=val_acc, train_acc=train_acc)
def mnist_pt_objective(config):
    model = NumberNet(config)
    trainer = pl.Trainer(max_epochs=config['epochs'], gpus=1, auto_select_gpus=True)
    trainer.fit(model)
    trainer.test(model)
    tune.report(test_loss=model.test_loss)

    # Evaluate adversarial robustness with Foolbox.
    fmodel = fb.PyTorchModel(model, bounds=(0, 1))
    images, labels = fb.utils.samples(
        fmodel, dataset='mnist', batchsize=config['batch_size'])
    clean_accuracy = fb.utils.accuracy(fmodel, images, labels)
    attack = fb.attacks.SaltAndPepperNoiseAttack()
    epsilons = [
        0.0, 0.0002, 0.0005, 0.0008, 0.001, 0.0015, 0.002, 0.003,
        0.01, 0.1, 0.3, 0.5, 1.0,
    ]
    raw_advs, clipped_advs, success = attack(fmodel, images, labels, epsilons=epsilons)
    robust_accuracy = 1 - success.cpu().numpy().astype(float).flatten().mean(axis=-1)
    tune.report(robust_acc=robust_accuracy)
    return robust_accuracy
def test(net):
    env = gym.make(env_name)
    performance = []
    for _ in range(20):
        obs = env.reset()
        next_obs = None
        reward = 0
        total_reward = 0
        done = False
        while not done:
            if next_obs is not None:
                obs = next_obs
            obs = torch.tensor(obs).float()
            action = action_decide(net, obs)
            next_obs, reward, done, info = env.step(action)
            total_reward += reward
            # env.render()
            if done:
                performance.append(total_reward)
    performance = mean(performance)
    tune.report(reward_avg=performance)
def train(self, epochs, global_step=0):
    for epoch in range(global_step, epochs + global_step):
        self.model.train()
        all_probs = []
        all_labels = []
        running_loss = 0.0
        for data, labels in self.train_loader:
            data, labels = data.to(self.device), labels.to(self.device)
            if self.after_load_cb:
                data = self.after_load_cb(data)
            self.optimizer.zero_grad()
            outputs = self.model(data)
            loss = self.loss_fn(outputs, labels)
            running_loss += loss.item()
            probs = F.softmax(outputs, dim=1)
            all_probs.append(probs.cpu().detach().numpy())
            all_labels.append(labels.cpu().numpy())
            loss.backward()
            self.optimizer.step()

        all_probs = np.concatenate(all_probs)
        all_labels = np.concatenate(all_labels)
        train_metrics = self.calc_metrics(
            all_probs, all_labels,
            running_loss / len(self.train_loader.dataset), "train")
        val_metrics = self.evaluate()
        self.save(epoch)
        metrics = {**train_metrics, **val_metrics}
        tune.report(**metrics)
def _do_eval(self):
    results = self._func()

    if results:
        assert isinstance(
            results, dict
        ), "Eval function must return a dict. Got {} instead.".format(results)

        flattened_results = flatten_results_dict(results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v))

    # Remove extra memory cache of the main process due to evaluation.
    torch.cuda.empty_cache()

    self.step += 1

    # Here we save a checkpoint. It is automatically registered with
    # Ray Tune and will potentially be passed as the `checkpoint_dir`
    # parameter in future iterations.
    with tune.checkpoint_dir(step=self.step) as checkpoint_dir:
        additional_state = {"epoch": int(self.trainer.epoch)}
        # Change the save dir so that Tune can find the checkpoint.
        self.trainer.checkpointer.save_dir = checkpoint_dir
        self.trainer.checkpointer.save(name="checkpoint", **additional_state)

    metrics = dict(
        r1=results["Rank-1"],
        map=results["mAP"],
        score=(results["Rank-1"] + results["mAP"]) / 2)
    tune.report(**metrics)