def train(config, checkpoint_dir=None):
    if checkpoint_dir:
        count = sum(
            "checkpoint-" in path for path in os.listdir(checkpoint_dir))
        assert count == 1, os.listdir(checkpoint_dir)

    for step in range(20):
        with tune.checkpoint_dir(step=step) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint-{}".format(step))
            open(path, "a").close()
        tune.report(test=step)
def __call__(
        self,
        epoch: int,
        sym: mxnet.symbol.Symbol,
        arg: Dict[str, np.ndarray],
        aux: Dict[str, np.ndarray],
):
    if epoch % self._frequency != 0:
        return
    with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
        save_checkpoint(
            os.path.join(checkpoint_dir, self._filename), epoch, sym, arg, aux)
def train(config, checkpoint_dir=None):
    itr = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "ckpt.log"), "r") as f:
            itr = int(f.read()) + 1

    for i in range(itr, config["max_iter"]):
        with tune.checkpoint_dir(step=i) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "ckpt.log")
            with open(checkpoint_path, "w") as f:
                f.write(str(i))
        tune.report(test=i, training_iteration=i)
def nas_report(study, trial):
    best_session = study.best_trials[0]
    print("Trial stats (#{}): Loss={} Accuracy={}".format(
        trial.number, *(list(best_session.values))))
    print("Best params so far (#{}): {}".format(best_session.number,
                                                best_session.params))
    finished_trials = list(
        filter((lambda trial: trial.state.is_finished()), study.trials))

    model_state = {}
    with tune.checkpoint_dir(step=best_session.number) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        model_state = torch.load(path)

    with tune.checkpoint_dir(step=trial.number) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        torch.save((best_session.params, model_state), path)

    result_zip = zip(["loss", "accuracy"], list(best_session.values))
    results = {p: v for p, v in result_zip}
    tune.report(**results)
def save_state(self):
    with tune.checkpoint_dir(step=self.global_step) as checkpoint_dir:
        self.args.output_dir = checkpoint_dir
        # This is the directory name that Huggingface requires.
        output_dir = os.path.join(
            self.args.output_dir,
            f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")
        self.save_model(output_dir)
        if self.is_world_master():
            torch.save(self.current_optimizer.state_dict(),
                       os.path.join(output_dir, "optimizer.pt"))
            torch.save(self.current_scheduler.state_dict(),
                       os.path.join(output_dir, "scheduler.pt"))
def write_checkpoint(self, checkpoint: Dict):
    # Store the checkpoint_id in the file so that the Tune trial can be
    # resumed after failure or cancellation.
    checkpoint[TUNE_CHECKPOINT_ID] = self._latest_checkpoint_id
    # If inside a Tune Trainable, then checkpoint with Tune.
    with tune.checkpoint_dir(step=self._latest_checkpoint_id) as \
            checkpoint_dir:
        path = Path(checkpoint_dir)
        # Use a standard file name so that we know which file to load
        # the checkpoint from.
        file_path = path.joinpath(TUNE_CHECKPOINT_FILE_NAME)
        with file_path.open("wb") as f:
            cloudpickle.dump(checkpoint, f)
def train_mnist(config,
                start_model=None,
                checkpoint_dir=None,
                num_epochs=10,
                use_gpus=False,
                data_fn=None,
                day=0):
    # Create model
    use_cuda = use_gpus and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = ConvNet(layer_size=config["layer_size"]).to(device)

    # Create optimizer
    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])

    # Load checkpoint, or load start model if no checkpoint has been
    # passed and a start model is specified
    load_dir = None
    if checkpoint_dir:
        load_dir = checkpoint_dir
    elif start_model:
        load_dir = start_model

    if load_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(load_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # Get full training datasets
    train_dataset, validation_dataset = data_fn(day=day)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config["batch_size"], shuffle=True)

    validation_loader = torch.utils.data.DataLoader(
        validation_dataset, batch_size=config["batch_size"], shuffle=True)

    for i in range(num_epochs):
        train(model, optimizer, train_loader, device)
        acc = test(model, validation_loader, device)

        if i == num_epochs - 1:
            with tune.checkpoint_dir(step=i) as checkpoint_dir:
                torch.save((model.state_dict(), optimizer.state_dict()),
                           os.path.join(checkpoint_dir, "checkpoint"))
            tune.report(mean_accuracy=acc, done=True)
        else:
            tune.report(mean_accuracy=acc)
def dcgan_train(config, checkpoint_dir=None):
    step = 0
    use_cuda = config.get("use_gpu") and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    netD = Discriminator().to(device)
    netD.apply(weights_init)
    netG = Generator().to(device)
    netG.apply(weights_init)
    criterion = nn.BCELoss()
    optimizerD = optim.Adam(
        netD.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))
    optimizerG = optim.Adam(
        netG.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataloader = get_data_loader()

    if checkpoint_dir is not None:
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        netD.load_state_dict(checkpoint["netDmodel"])
        netG.load_state_dict(checkpoint["netGmodel"])
        optimizerD.load_state_dict(checkpoint["optimD"])
        optimizerG.load_state_dict(checkpoint["optimG"])
        step = checkpoint["step"]

        if "netD_lr" in config:
            for param_group in optimizerD.param_groups:
                param_group["lr"] = config["netD_lr"]
        if "netG_lr" in config:
            for param_group in optimizerG.param_groups:
                param_group["lr"] = config["netG_lr"]

    while True:
        lossG, lossD, is_score = train(netD, netG, optimizerG, optimizerD,
                                       criterion, dataloader, step, device,
                                       config["mnist_model_ref"])
        step += 1
        with tune.checkpoint_dir(step=step) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save(
                {
                    "netDmodel": netD.state_dict(),
                    "netGmodel": netG.state_dict(),
                    "optimD": optimizerD.state_dict(),
                    "optimG": optimizerG.state_dict(),
                    "step": step,
                }, path)

        tune.report(lossg=lossG, lossd=lossD, is_score=is_score)
def _handle(self, logs: Dict, when: str = None):
    self._counter[when] += 1

    if isinstance(self._frequency, list):
        index = self._on.index(when)
        freq = self._frequency[index]
    else:
        freq = self._frequency

    if self._counter[when] % freq == 0:
        with tune.checkpoint_dir(step=self._cp_count) as checkpoint_dir:
            self.model.save(
                os.path.join(checkpoint_dir, self._filename), overwrite=True)
        self._cp_count += 1
def train(self, train_dataloader, eval_dataloader):
    if not os.path.exists(self.tmp_path):
        os.makedirs(self.tmp_path)
    metrics = {}
    metrics['accuracy'] = []
    metrics['loss'] = []
    lr = self.config['learning_rate']
    for epoch in range(self.config['max_epoch']):
        self.model, avg_loss = self.run(train_dataloader, self.model,
                                        self.config['learning_rate'],
                                        self.config['clip'])
        self._logger.info(
            '==>Train Epoch:{:4d} Loss:{:.5f} learning_rate:{}'.format(
                epoch, avg_loss, lr))
        # eval stage
        avg_eval_acc, avg_eval_loss = self._valid_epoch(
            eval_dataloader, self.model)
        self._logger.info('==>Eval Acc:{:.5f} Eval Loss:{:.5f}'.format(
            avg_eval_acc, avg_eval_loss))
        metrics['accuracy'].append(avg_eval_acc)
        metrics['loss'].append(avg_eval_loss)
        if self.config['hyper_tune']:
            # use ray tune to checkpoint
            with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                self.save_model(path)
            # ray tune uses the loss to determine which params are best
            tune.report(loss=avg_eval_loss, accuracy=avg_eval_acc)
        else:
            save_name_tmp = 'ep_' + str(epoch) + '.m'
            torch.save(self.model.state_dict(),
                       self.tmp_path + save_name_tmp)
        # the scheduler reduces the learning rate based on avg_eval_acc
        self.scheduler.step(avg_eval_acc)
        # early stop if the current learning rate falls below a threshold
        lr = self.optimizer.param_groups[0]['lr']
        if lr < self.config['early_stop_lr']:
            break
    if not self.config['hyper_tune'] and self.config['load_best_epoch']:
        best = np.argmax(metrics['accuracy'])  # isn't this the best epoch?
        load_name_tmp = 'ep_' + str(best) + '.m'
        self.model.load_state_dict(
            torch.load(self.tmp_path + load_name_tmp))
    # remove the temporary directory created above
    for rt, dirs, files in os.walk(self.tmp_path):
        for name in files:
            remove_path = os.path.join(rt, name)
            os.remove(remove_path)
    os.rmdir(self.tmp_path)
def checkpoint(progress_tracker, save_path):
    with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
        checkpoint_model = os.path.join(checkpoint_dir, "model")
        # shutil.copytree(save_path, checkpoint_model)
        # Note: A previous implementation used shutil.copytree()
        # however, this copying method is non atomic
        if not os.path.isdir(checkpoint_model):
            copy_id = uuid.uuid4()
            tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
            assert os.path.exists(save_path)
            shutil.copytree(save_path, tmp_dst)
            try:
                os.rename(tmp_dst, checkpoint_model)
            except Exception:
                shutil.rmtree(tmp_dst)
def checkpoint_tune(self, epoch_info=None):
    """Checkpoint, possibly with tune."""
    if epoch_info is None:
        epoch_info = self.epoch_info

    if do_tune:
        with tune.checkpoint_dir(step=self.epochs) as checkpoint_dir:
            ckpt = self.checkpoint(checkpoint_dir)
            epoch_info['checkpoint_tune'] = ckpt
            epoch_info['checkpoint_size'] = os.path.getsize(ckpt)
    else:
        ckpt_dir = os.path.join(base_dir,
                                "checkpoint%05d" % epoch_info['epochs'])
        os.makedirs(ckpt_dir, exist_ok=True)
        self.checkpoint(ckpt_dir)
        logging.info(f"Checkpoint available: {ckpt_dir}")
def train(config, checkpoint_dir=None):
    restored = bool(checkpoint_dir)
    itr = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "ckpt.log"), "r") as f:
            itr = int(f.read()) + 1

    for i in range(itr, 10):
        if i == 5 and not restored:
            raise Exception("try to fail me")
        # `tune.checkpoint_dir` requires a step; use the current iteration.
        with tune.checkpoint_dir(step=i) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "ckpt.log")
            with open(checkpoint_path, "w") as f:
                f.write(str(i))
        tune.report(test=i, training_iteration=i)
def experiment(config, checkpoint_dir=None):
    """Experiment for hyperparameter search."""
    learner_obj = learner()()
    learner_obj.aggregator.callback = callback
    learner_obj.fit()
    with tune.checkpoint_dir(
            step=learner_obj.aggregator.epochs) as checkpoint_dir:
        predictions_path = os.path.join(checkpoint_dir, "predictions.csv")
        df_learner_info(learner_obj).to_csv(predictions_path)
        logging.warning(f"Predictions saved to {predictions_path}")

        ckpt_path = os.path.join(checkpoint_dir, "learner_ckpt.pkl")
        state = learner_obj.__getstate__()
        with open(ckpt_path, 'wb') as f:
            pickle.dump(state, f)
        print(f"State saved to {ckpt_path}")
def trainable(config, checkpoint_dir=None):
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "chkpt"), "rb") as fp:
            step = pickle.load(fp)
    else:
        step = 0

    while step < 2:
        step += 1
        with tune.checkpoint_dir(step) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, "chkpt"), "wb") as fp:
                pickle.dump(step, fp)
        tune.report(**{
            "done": step >= 2,
            "iter": step,
            "id": config["id"]
        })
def checkpoint(progress_tracker, save_path):
    def ignore_dot_files(src, files):
        return [f for f in files if f.startswith(".")]

    with tune.checkpoint_dir(
            step=progress_tracker.tune_checkpoint_num) as checkpoint_dir:
        checkpoint_model = os.path.join(checkpoint_dir, "model")
        # Atomic copying of the checkpoints
        if not os.path.isdir(checkpoint_model):
            copy_id = uuid.uuid4()
            tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
            assert os.path.exists(save_path)
            shutil.copytree(save_path, tmp_dst, ignore=ignore_dot_files)
            try:
                os.rename(tmp_dst, checkpoint_model)
            except Exception:
                shutil.rmtree(tmp_dst)
def on_epoch_end(self, trainer, pl_module):
    results = {
        remove_postfix(k, '_epoch'): v
        for k, v in trainer.logged_metrics.items()
        if (k.startswith('train_') or k.startswith('val_'))
        and not k.endswith('_step')
    }
    results['mean_loss'] = results.get('val_loss', results['train_loss'])
    if 'val_accuracy' in results:
        results['mean_accuracy'] = results['val_accuracy']
    # Checkpointing should be done *before* reporting
    # https://docs.ray.io/en/master/tune/api_docs/trainable.html
    with tune.checkpoint_dir(step=trainer.current_epoch) as checkpoint_dir:
        trainer.save_checkpoint(
            os.path.join(checkpoint_dir, f"{type(pl_module).__name__}.ckpt"))
    tune.report(**results)
def MockTrainingFunc(config, checkpoint_dir=None):
    iter = 0
    a = config["a"]
    b = config["b"]

    if checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "model.mock")
        with open(checkpoint_path, "rb") as fp:
            a, b, iter = pickle.load(fp)

    while True:
        iter += 1
        with tune.checkpoint_dir(step=iter) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "model.mock")
            with open(checkpoint_path, "wb") as fp:
                pickle.dump((a, b, iter), fp)
        tune.report(mean_accuracy=(a - iter) * b)
def test_train(config, checkpoint_dir=None):
    state = {"hi": 1, "iter": 0}
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "ckpt.pkl"), "rb") as fp:
            state = pickle.load(fp)

    for i in range(4):
        state["iter"] += 1
        with tune.checkpoint_dir(step=state["iter"]) as dir:
            with open(os.path.join(dir, "ckpt.pkl"), "wb") as fp:
                pickle.dump(state, fp)
        tune.report(
            **{
                "timesteps_this_iter": 1,
                "metric": state["iter"],
                "done": state["iter"] > 3
            })
def load(self):
    if tune.is_session_enabled():
        with tune.checkpoint_dir(
                step=self.trainer.state.epoch) as checkpoint_dir:
            p = os.path.join(checkpoint_dir, "checkpoint.pt")
    else:
        file_name = "best_checkpoint.pt"
        p = os.path.join(self.job_dir, file_name)
    if not os.path.exists(p):
        self.logger.info(
            "Checkpoint {} does not exist, starting a new engine".format(p))
        return
    self.logger.info("Loading saved checkpoint {}".format(p))
    checkpoint = torch.load(p)
    self.network.load_state_dict(checkpoint["model"])
    self.optimizer.load_state_dict(checkpoint["optimizer"])
    self.trainer.state = checkpoint["engine"]
def fn_trainable(config, checkpoint_dir=None):
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint.json"),
                  "rt") as fp:
            state = json.load(fp)
    else:
        state = {"internal_iter": 0}

    for i in range(state["internal_iter"], config["max_iterations"]):
        state["internal_iter"] = i
        time.sleep(config["sleep_time"])

        if i % config["checkpoint_freq"] == 0:
            with tune.checkpoint_dir(step=i) as cd:
                with open(os.path.join(cd, "checkpoint.json"), "wt") as fp:
                    json.dump(state, fp)

        tune.report(
            score=i * 10 * config["score_multiplied"],
            internal_iter=state["internal_iter"])
def train_func(config, checkpoint_dir=None):
    start = 0
    width, height = config["width"], config["height"]

    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            state = json.loads(f.read())
            start = state["step"] + 1

    for step in range(start, 100):
        intermediate_score = evaluation_fn(step, width, height)

        # Obtain a checkpoint directory
        with tune.checkpoint_dir(step=step) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            with open(path, "w") as f:
                f.write(json.dumps({"step": step}))

        tune.report(iterations=step, mean_loss=intermediate_score)
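A minimal launch sketch for a function trainable such as `train_func` above, using Ray Tune's legacy `tune.run` API; the search space, metric settings, and `num_samples` below are illustrative assumptions, not part of the original snippet.

# Sketch, assuming the legacy (pre-2.0) Ray Tune function API; the config
# values and num_samples are illustrative. `train_func` and `evaluation_fn`
# are the definitions from the snippet above.
from ray import tune

analysis = tune.run(
    train_func,
    config={
        # Illustrative search space; train_func only reads "width" and "height".
        "width": tune.uniform(0, 20),
        "height": tune.uniform(-100, 100),
    },
    metric="mean_loss",
    mode="min",
    num_samples=4,
)
print("Best config found:", analysis.best_config)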
def on_epoch_end(self, trainer, progress_tracker, save_path):
    if trainer.is_coordinator():
        with tune.checkpoint_dir(
                step=progress_tracker.epoch) as checkpoint_dir:
            checkpoint_model = os.path.join(checkpoint_dir, 'model')
            shutil.copytree(save_path, checkpoint_model)

        train_stats, eval_stats = (progress_tracker.train_metrics,
                                   progress_tracker.vali_metrics)
        stats = eval_stats or train_stats
        metric_score = tune_executor.get_metric_score_from_eval_stats(
            stats)[-1]
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats=json.dumps(eval_stats, cls=NumpyEncoder))
def train(config, checkpoint_dir=None):
    step = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            step = json.loads(f.read())["timestep"]

    for timestep in range(step, 100):
        v = np.tanh(float(timestep) / config.get("width", 1))
        v *= config.get("height", 1)

        if timestep % 3 == 0:
            with tune.checkpoint_dir(step=timestep) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                with open(path, "w") as f:
                    f.write(json.dumps({"timestep": timestep}))

        # Here we use `episode_reward_mean`, but you can also report other
        # objectives such as loss or accuracy.
        tune.report(episode_reward_mean=v)
def MockTrainingFuncSync(config, checkpoint_dir=None):
    iter = 0

    if checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
        with open(checkpoint_path, "rb") as fp:
            a, iter = pickle.load(fp)

    a = config["a"]  # Use the new hyperparameter if perturbed.

    while True:
        iter += 1
        with tune.checkpoint_dir(step=iter) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
            with open(checkpoint_path, "wb") as fp:
                pickle.dump((a, iter), fp)
        # Score gets better every iteration.
        time.sleep(1)
        tune.report(mean_accuracy=iter + a, a=a)
def train_convnet(config, checkpoint_dir=None):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If checkpoint_dir is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if checkpoint_dir:
        print("Loading from checkpoint.")
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint["model_state_dict"])
        step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from tune.
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                # Then create a checkpoint file in this directory.
                path = os.path.join(checkpoint_dir, "checkpoint")
                # Save state to checkpoint file.
                # No need to save optimizer for SGD.
                torch.save(
                    {
                        "step": step,
                        "model_state_dict": model.state_dict(),
                        "mean_accuracy": acc,
                    },
                    path,
                )
        step += 1
        tune.report(mean_accuracy=acc)
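A possible follow-up sketch (not from the snippet above) showing how `train_convnet` might be launched and its best checkpoint reloaded afterwards; it assumes the legacy Ray 1.x `tune.run` / `ExperimentAnalysis` API, where `best_checkpoint` is a filesystem path (newer Ray versions return a `Checkpoint` object instead), and the config and stop criterion are illustrative.

# Sketch, assuming Ray 1.x; config values are illustrative assumptions.
from ray import tune
import os
import torch

analysis = tune.run(
    train_convnet,
    config={"lr": tune.loguniform(1e-4, 1e-1), "momentum": 0.9},
    metric="mean_accuracy",
    mode="max",
    stop={"training_iteration": 20},  # train_convnet otherwise loops forever
)
# Assumption: on Ray 1.x this is the path to the best trial's checkpoint dir,
# i.e. the directory produced by tune.checkpoint_dir() in train_convnet.
best_checkpoint_dir = analysis.best_checkpoint
state = torch.load(os.path.join(best_checkpoint_dir, "checkpoint"))
model = ConvNet()
model.load_state_dict(state["model_state_dict"])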
def function_trainable(config):
    num_iters = int(config["num_iters"])
    sleep_time = config["sleep_time"]
    score = config["score"]

    checkpoint_iters = config["checkpoint_iters"]
    checkpoint_size_b = config["checkpoint_size_b"]
    checkpoint_num_items = checkpoint_size_b // 8  # np.float64

    for i in range(num_iters):
        if checkpoint_iters >= 0 and checkpoint_size_b > 0 and \
                i % checkpoint_iters == 0:
            with tune.checkpoint_dir(step=i) as dir:
                checkpoint_file = os.path.join(dir, "bogus.ckpt")
                checkpoint_data = np.random.uniform(
                    0, 1, size=checkpoint_num_items)
                with open(checkpoint_file, "wb") as fp:
                    pickle.dump(checkpoint_data, fp)

        tune.report(score=i + score)
        time.sleep(sleep_time)
def post_epoch_actions(trainer_instance: Engine):
    # evaluate model on validation set
    evaluator.run(val_loader)
    state_val_metrics = evaluator.state.metrics

    current_epoch: int = trainer_instance.state.epoch
    with tune.checkpoint_dir(current_epoch) as local_checkpoint_dir:
        # save model, optimizer and trainer checkpoints
        path = os.path.join(local_checkpoint_dir, "checkpoint")
        torch.save((model.state_dict(), optimizer.state_dict(),
                    trainer_instance.state_dict(), evaluator.state_dict()),
                   path)

    # report validation scores to ray-tune
    report_dict: dict = {
        **state_val_metrics,
        "done": current_epoch == epochs
    }
    tune.report(**report_dict)
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)
    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".
            format(experiment_name, experiment_id, checkpoint_num,
                   checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        # Only needed once at the start, to save the imported model.
        chk_freq = 1

    while True:
        result = trainer.train()
        tune.report(**result)
        if trainer._iteration % chk_freq == 0:
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
def _do_eval(self):
    results = self._func()

    if results:
        assert isinstance(
            results, dict
        ), "Eval function must return a dict. Got {} instead.".format(results)

        flattened_results = flatten_results_dict(results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v)
                )

    # Remove extra memory cache of main process due to evaluation
    torch.cuda.empty_cache()

    self.step += 1

    # Here we save a checkpoint. It is automatically registered with
    # Ray Tune and will potentially be passed as the `checkpoint_dir`
    # parameter in future iterations.
    with tune.checkpoint_dir(step=self.step) as checkpoint_dir:
        additional_state = {"iteration": int(self.trainer.iter)}
        Checkpointer(
            # Assume you want to save checkpoints together with logs/statistics
            self.trainer.model,
            checkpoint_dir,
            save_to_disk=True,
            optimizer=self.trainer.optimizer,
            scheduler=self.trainer.scheduler,
        ).save(name="checkpoint", **additional_state)

    metrics = dict(
        r1=results['Rank-1'],
        map=results['mAP'],
        score=(results['Rank-1'] + results['mAP']) / 2)
    tune.report(**metrics)