def train(config, checkpoint_dir=None):
    if checkpoint_dir:
        count = sum(
            "checkpoint-" in path for path in os.listdir(checkpoint_dir))
        assert count == 1, os.listdir(checkpoint_dir)

    for step in range(20):
        with tune.checkpoint_dir(step=step) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint-{}".format(step))
            open(path, "a").close()
        tune.report(test=step)
def __call__(
        self,
        epoch: int,
        sym: mxnet.symbol.Symbol,
        arg: Dict[str, np.ndarray],
        aux: Dict[str, np.ndarray],
):
    if epoch % self._frequency != 0:
        return
    with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
        save_checkpoint(
            os.path.join(checkpoint_dir, self._filename), epoch, sym, arg, aux)
def train(config, checkpoint_dir=None):
    itr = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "ckpt.log"), "r") as f:
            itr = int(f.read()) + 1

    for i in range(itr, config["max_iter"]):
        with tune.checkpoint_dir(step=i) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "ckpt.log")
            with open(checkpoint_path, "w") as f:
                f.write(str(i))
        tune.report(test=i, training_iteration=i)
def nas_report(study, trial):
    best_session = study.best_trials[0]
    print("Trial stats (#{}): Loss={} Accuracy={}".format(
        trial.number, *(list(best_session.values))))
    print("Best params so far (#{}): {}".format(best_session.number,
                                                best_session.params))
    finished_trials = list(
        filter((lambda trial: trial.state.is_finished()), study.trials))

    model_state = {}
    with tune.checkpoint_dir(step=best_session.number) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        model_state = torch.load(path)

    with tune.checkpoint_dir(step=trial.number) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "checkpoint")
        torch.save((best_session.params, model_state), path)

    result_zip = zip(["loss", "accuracy"], list(best_session.values))
    results = {p: v for p, v in result_zip}
    tune.report(**results)
def save_state(self):
    with tune.checkpoint_dir(step=self.global_step) as checkpoint_dir:
        self.args.output_dir = checkpoint_dir
        # This is the directory name that Huggingface requires.
        output_dir = os.path.join(
            self.args.output_dir,
            f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")
        self.save_model(output_dir)
        if self.is_world_master():
            torch.save(self.current_optimizer.state_dict(),
                       os.path.join(output_dir, "optimizer.pt"))
            torch.save(self.current_scheduler.state_dict(),
                       os.path.join(output_dir, "scheduler.pt"))
def write_checkpoint(self, checkpoint: Dict):
    # Store the checkpoint_id in the file so that the Tune trial can be
    # resumed after failure or cancellation.
    checkpoint[TUNE_CHECKPOINT_ID] = self._latest_checkpoint_id
    # If inside a Tune Trainable, then checkpoint with Tune.
    with tune.checkpoint_dir(step=self._latest_checkpoint_id) as \
            checkpoint_dir:
        path = Path(checkpoint_dir)
        # Use a standard file name so that we know which file to load
        # the checkpoint from.
        file_path = path.joinpath(TUNE_CHECKPOINT_FILE_NAME)
        with file_path.open("wb") as f:
            cloudpickle.dump(checkpoint, f)
def train_mnist(config,
                start_model=None,
                checkpoint_dir=None,
                num_epochs=10,
                use_gpus=False,
                data_fn=None,
                day=0):
    # Create model
    use_cuda = use_gpus and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = ConvNet(layer_size=config["layer_size"]).to(device)

    # Create optimizer
    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])

    # Load checkpoint, or load start model if no checkpoint has been
    # passed and a start model is specified
    load_dir = None
    if checkpoint_dir:
        load_dir = checkpoint_dir
    elif start_model:
        load_dir = start_model

    if load_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(load_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # Get full training datasets
    train_dataset, validation_dataset = data_fn(day=day)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config["batch_size"], shuffle=True)

    validation_loader = torch.utils.data.DataLoader(
        validation_dataset, batch_size=config["batch_size"], shuffle=True)

    for i in range(num_epochs):
        train(model, optimizer, train_loader, device)
        acc = test(model, validation_loader, device)

        if i == num_epochs - 1:
            with tune.checkpoint_dir(step=i) as checkpoint_dir:
                torch.save((model.state_dict(), optimizer.state_dict()),
                           os.path.join(checkpoint_dir, "checkpoint"))
            tune.report(mean_accuracy=acc, done=True)
        else:
            tune.report(mean_accuracy=acc)
def dcgan_train(config, checkpoint_dir=None):
    step = 0
    use_cuda = config.get("use_gpu") and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    netD = Discriminator().to(device)
    netD.apply(weights_init)
    netG = Generator().to(device)
    netG.apply(weights_init)
    criterion = nn.BCELoss()
    optimizerD = optim.Adam(
        netD.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))
    optimizerG = optim.Adam(
        netG.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataloader = get_data_loader()

    if checkpoint_dir is not None:
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        netD.load_state_dict(checkpoint["netDmodel"])
        netG.load_state_dict(checkpoint["netGmodel"])
        optimizerD.load_state_dict(checkpoint["optimD"])
        optimizerG.load_state_dict(checkpoint["optimG"])
        step = checkpoint["step"]

        if "netD_lr" in config:
            for param_group in optimizerD.param_groups:
                param_group["lr"] = config["netD_lr"]
        if "netG_lr" in config:
            for param_group in optimizerG.param_groups:
                param_group["lr"] = config["netG_lr"]

    while True:
        lossG, lossD, is_score = train(netD, netG, optimizerG, optimizerD,
                                       criterion, dataloader, step, device,
                                       config["mnist_model_ref"])
        step += 1
        with tune.checkpoint_dir(step=step) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save(
                {
                    "netDmodel": netD.state_dict(),
                    "netGmodel": netG.state_dict(),
                    "optimD": optimizerD.state_dict(),
                    "optimG": optimizerG.state_dict(),
                    "step": step,
                }, path)

        tune.report(lossg=lossG, lossd=lossD, is_score=is_score)
def _handle(self, logs: Dict, when: str = None):
    self._counter[when] += 1

    if isinstance(self._frequency, list):
        index = self._on.index(when)
        freq = self._frequency[index]
    else:
        freq = self._frequency

    if self._counter[when] % freq == 0:
        with tune.checkpoint_dir(step=self._cp_count) as checkpoint_dir:
            self.model.save(
                os.path.join(checkpoint_dir, self._filename), overwrite=True)
        self._cp_count += 1
def train(self, train_dataloader, eval_dataloader):
    if not os.path.exists(self.tmp_path):
        os.makedirs(self.tmp_path)
    metrics = {}
    metrics['accuracy'] = []
    metrics['loss'] = []
    lr = self.config['learning_rate']
    for epoch in range(self.config['max_epoch']):
        self.model, avg_loss = self.run(train_dataloader, self.model,
                                        self.config['learning_rate'],
                                        self.config['clip'])
        self._logger.info(
            '==>Train Epoch:{:4d} Loss:{:.5f} learning_rate:{}'.format(
                epoch, avg_loss, lr))
        # eval stage
        avg_eval_acc, avg_eval_loss = self._valid_epoch(
            eval_dataloader, self.model)
        self._logger.info('==>Eval Acc:{:.5f} Eval Loss:{:.5f}'.format(
            avg_eval_acc, avg_eval_loss))
        metrics['accuracy'].append(avg_eval_acc)
        metrics['loss'].append(avg_eval_loss)
        if self.config['hyper_tune']:
            # use ray tune to checkpoint
            with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                self.save_model(path)
            # ray tune uses the loss to determine which params are best
            tune.report(loss=avg_eval_loss, accuracy=avg_eval_acc)
        else:
            save_name_tmp = 'ep_' + str(epoch) + '.m'
            torch.save(self.model.state_dict(),
                       self.tmp_path + save_name_tmp)
        # the scheduler reduces the learning rate based on avg_eval_acc
        self.scheduler.step(avg_eval_acc)
        # early stop if the current learning rate falls below a threshold
        lr = self.optimizer.param_groups[0]['lr']
        if lr < self.config['early_stop_lr']:
            break
    if not self.config['hyper_tune'] and self.config['load_best_epoch']:
        best = np.argmax(metrics['accuracy'])  # isn't this the best epoch?
        load_name_tmp = 'ep_' + str(best) + '.m'
        self.model.load_state_dict(
            torch.load(self.tmp_path + load_name_tmp))
    # remove the temporary directory created above
    for rt, dirs, files in os.walk(self.tmp_path):
        for name in files:
            remove_path = os.path.join(rt, name)
            os.remove(remove_path)
    os.rmdir(self.tmp_path)
def checkpoint(progress_tracker, save_path):
    with tune.checkpoint_dir(step=progress_tracker.epoch) as checkpoint_dir:
        checkpoint_model = os.path.join(checkpoint_dir, "model")
        # shutil.copytree(save_path, checkpoint_model)
        # Note: A previous implementation used shutil.copytree()
        # however, this copying method is non atomic
        if not os.path.isdir(checkpoint_model):
            copy_id = uuid.uuid4()
            tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
            assert os.path.exists(save_path)
            shutil.copytree(save_path, tmp_dst)
            try:
                os.rename(tmp_dst, checkpoint_model)
            except Exception:
                shutil.rmtree(tmp_dst)
def checkpoint_tune(self, epoch_info=None):
    """Checkpoint, possibly with tune."""
    if epoch_info is None:
        epoch_info = self.epoch_info

    if do_tune:
        with tune.checkpoint_dir(step=self.epochs) as checkpoint_dir:
            ckpt = self.checkpoint(checkpoint_dir)
            epoch_info['checkpoint_tune'] = ckpt
            epoch_info['checkpoint_size'] = os.path.getsize(ckpt)
    else:
        ckpt_dir = os.path.join(base_dir,
                                "checkpoint%05d" % epoch_info['epochs'])
        os.makedirs(ckpt_dir, exist_ok=True)
        self.checkpoint(ckpt_dir)
        logging.info(f"Checkpoint available: {ckpt_dir}")
def train(config, checkpoint_dir=None):
    restored = bool(checkpoint_dir)
    itr = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "ckpt.log"), "r") as f:
            itr = int(f.read()) + 1

    for i in range(itr, 10):
        if i == 5 and not restored:
            raise Exception("try to fail me")
        # `tune.checkpoint_dir` requires a step; use the current iteration.
        with tune.checkpoint_dir(step=i) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "ckpt.log")
            with open(checkpoint_path, "w") as f:
                f.write(str(i))
        tune.report(test=i, training_iteration=i)
def experiment(config, checkpoint_dir=None):
    """Experiment for hyperparameter search."""
    learner_obj = learner()()
    learner_obj.aggregator.callback = callback
    learner_obj.fit()
    with tune.checkpoint_dir(
            step=learner_obj.aggregator.epochs) as checkpoint_dir:
        predictions_path = os.path.join(checkpoint_dir, "predictions.csv")
        df_learner_info(learner_obj).to_csv(predictions_path)
        logging.warning(f"Predictions saved to {predictions_path}")

        ckpt_path = os.path.join(checkpoint_dir, "learner_ckpt.pkl")
        state = learner_obj.__getstate__()
        with open(ckpt_path, 'wb') as f:
            pickle.dump(state, f)
        print(f"State saved to {ckpt_path}")
def trainable(config, checkpoint_dir=None):
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "chkpt"), "rb") as fp:
            step = pickle.load(fp)
    else:
        step = 0

    while step < 2:
        step += 1
        with tune.checkpoint_dir(step) as checkpoint_dir:
            with open(os.path.join(checkpoint_dir, "chkpt"), "wb") as fp:
                pickle.dump(step, fp)
        tune.report(**{
            "done": step >= 2,
            "iter": step,
            "id": config["id"]
        })
def checkpoint(progress_tracker, save_path):
    def ignore_dot_files(src, files):
        return [f for f in files if f.startswith(".")]

    with tune.checkpoint_dir(
            step=progress_tracker.tune_checkpoint_num) as checkpoint_dir:
        checkpoint_model = os.path.join(checkpoint_dir, "model")
        # Atomic copying of the checkpoints
        if not os.path.isdir(checkpoint_model):
            copy_id = uuid.uuid4()
            tmp_dst = f"{checkpoint_model}.{copy_id}.tmp"
            assert os.path.exists(save_path)
            shutil.copytree(save_path, tmp_dst, ignore=ignore_dot_files)
            try:
                os.rename(tmp_dst, checkpoint_model)
            except Exception:
                shutil.rmtree(tmp_dst)
def on_epoch_end(self, trainer, pl_module):
    results = {
        remove_postfix(k, '_epoch'): v
        for k, v in trainer.logged_metrics.items()
        if (k.startswith('train_') or k.startswith('val_'))
        and not k.endswith('_step')
    }
    results['mean_loss'] = results.get('val_loss', results['train_loss'])
    if 'val_accuracy' in results:
        results['mean_accuracy'] = results['val_accuracy']
    # Checkpointing should be done *before* reporting
    # https://docs.ray.io/en/master/tune/api_docs/trainable.html
    with tune.checkpoint_dir(step=trainer.current_epoch) as checkpoint_dir:
        trainer.save_checkpoint(
            os.path.join(checkpoint_dir, f"{type(pl_module).__name__}.ckpt"))
    tune.report(**results)
def MockTrainingFunc(config, checkpoint_dir=None):
    iter = 0
    a = config["a"]
    b = config["b"]

    if checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "model.mock")
        with open(checkpoint_path, "rb") as fp:
            a, b, iter = pickle.load(fp)

    while True:
        iter += 1
        with tune.checkpoint_dir(step=iter) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "model.mock")
            with open(checkpoint_path, "wb") as fp:
                pickle.dump((a, b, iter), fp)
        tune.report(mean_accuracy=(a - iter) * b)
def test_train(config, checkpoint_dir=None):
    state = {"hi": 1, "iter": 0}
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "ckpt.pkl"), "rb") as fp:
            state = pickle.load(fp)

    for i in range(4):
        state["iter"] += 1
        with tune.checkpoint_dir(step=state["iter"]) as dir:
            with open(os.path.join(dir, "ckpt.pkl"), "wb") as fp:
                pickle.dump(state, fp)
        tune.report(
            **{
                "timesteps_this_iter": 1,
                "metric": state["iter"],
                "done": state["iter"] > 3
            })
def load(self):
    if tune.is_session_enabled():
        with tune.checkpoint_dir(
                step=self.trainer.state.epoch) as checkpoint_dir:
            p = os.path.join(checkpoint_dir, "checkpoint.pt")
    else:
        file_name = "best_checkpoint.pt"
        p = os.path.join(self.job_dir, file_name)
    if not os.path.exists(p):
        self.logger.info(
            "Checkpoint {} does not exist, starting a new engine".format(p))
        return
    self.logger.info("Loading saved checkpoint {}".format(p))
    checkpoint = torch.load(p)
    self.network.load_state_dict(checkpoint["model"])
    self.optimizer.load_state_dict(checkpoint["optimizer"])
    self.trainer.state = checkpoint["engine"]
def fn_trainable(config, checkpoint_dir=None):
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint.json"),
                  "rt") as fp:
            state = json.load(fp)
    else:
        state = {"internal_iter": 0}

    for i in range(state["internal_iter"], config["max_iterations"]):
        state["internal_iter"] = i
        time.sleep(config["sleep_time"])

        if i % config["checkpoint_freq"] == 0:
            with tune.checkpoint_dir(step=i) as cd:
                with open(os.path.join(cd, "checkpoint.json"), "wt") as fp:
                    json.dump(state, fp)

        tune.report(
            score=i * 10 * config["score_multiplied"],
            internal_iter=state["internal_iter"])
def train_func(config, checkpoint_dir=None):
    start = 0
    width, height = config["width"], config["height"]

    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            state = json.loads(f.read())
            start = state["step"] + 1

    for step in range(start, 100):
        intermediate_score = evaluation_fn(step, width, height)

        # Obtain a checkpoint directory
        with tune.checkpoint_dir(step=step) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            with open(path, "w") as f:
                f.write(json.dumps({"step": step}))

        tune.report(iterations=step, mean_loss=intermediate_score)
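A minimal launch sketch for a function trainable such as `train_func` above, using Ray Tune's legacy `tune.run` API; the search space, metric settings, and `num_samples` below are illustrative assumptions, not part of the original snippet.

# Sketch, assuming the legacy (pre-2.0) Ray Tune function API; the config
# values and num_samples are illustrative. `train_func` and `evaluation_fn`
# are the definitions from the snippet above.
from ray import tune

analysis = tune.run(
    train_func,
    config={
        # Illustrative search space; train_func only reads "width" and "height".
        "width": tune.uniform(0, 20),
        "height": tune.uniform(-100, 100),
    },
    metric="mean_loss",
    mode="min",
    num_samples=4,
)
print("Best config found:", analysis.best_config)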
def on_epoch_end(self, trainer, progress_tracker, save_path):
    if trainer.is_coordinator():
        with tune.checkpoint_dir(
                step=progress_tracker.epoch) as checkpoint_dir:
            checkpoint_model = os.path.join(checkpoint_dir, 'model')
            shutil.copytree(save_path, checkpoint_model)

        train_stats, eval_stats = (progress_tracker.train_metrics,
                                   progress_tracker.vali_metrics)
        stats = eval_stats or train_stats
        metric_score = tune_executor.get_metric_score_from_eval_stats(
            stats)[-1]
        tune.report(
            parameters=json.dumps(config, cls=NumpyEncoder),
            metric_score=metric_score,
            training_stats=json.dumps(train_stats, cls=NumpyEncoder),
            eval_stats=json.dumps(eval_stats, cls=NumpyEncoder))
def train(config, checkpoint_dir=None):
    step = 0
    if checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint")) as f:
            step = json.loads(f.read())["timestep"]

    for timestep in range(step, 100):
        v = np.tanh(float(timestep) / config.get("width", 1))
        v *= config.get("height", 1)

        if timestep % 3 == 0:
            with tune.checkpoint_dir(step=timestep) as checkpoint_dir:
                path = os.path.join(checkpoint_dir, "checkpoint")
                with open(path, "w") as f:
                    f.write(json.dumps({"timestep": timestep}))

        # Here we use `episode_reward_mean`, but you can also report other
        # objectives such as loss or accuracy.
        tune.report(episode_reward_mean=v)
def MockTrainingFuncSync(config, checkpoint_dir=None):
    iter = 0

    if checkpoint_dir:
        checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
        with open(checkpoint_path, "rb") as fp:
            a, iter = pickle.load(fp)

    a = config["a"]  # Use the new hyperparameter if perturbed.

    while True:
        iter += 1
        with tune.checkpoint_dir(step=iter) as checkpoint_dir:
            checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
            with open(checkpoint_path, "wb") as fp:
                pickle.dump((a, iter), fp)
        # Score gets better every iteration.
        time.sleep(1)
        tune.report(mean_accuracy=iter + a, a=a)
def train_convnet(config, checkpoint_dir=None):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If checkpoint_dir is not None, then we are resuming from a checkpoint.
    # Load model state and iteration step from checkpoint.
    if checkpoint_dir:
        print("Loading from checkpoint.")
        path = os.path.join(checkpoint_dir, "checkpoint")
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint["model_state_dict"])
        step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # First get the checkpoint directory from tune.
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                # Then create a checkpoint file in this directory.
                path = os.path.join(checkpoint_dir, "checkpoint")
                # Save state to checkpoint file.
                # No need to save optimizer for SGD.
                torch.save(
                    {
                        "step": step,
                        "model_state_dict": model.state_dict(),
                        "mean_accuracy": acc,
                    },
                    path,
                )
        step += 1
        tune.report(mean_accuracy=acc)
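A possible follow-up sketch (not from the snippet above) showing how `train_convnet` might be launched and its best checkpoint reloaded afterwards; it assumes the legacy Ray 1.x `tune.run` / `ExperimentAnalysis` API, where `best_checkpoint` is a filesystem path (newer Ray versions return a `Checkpoint` object instead), and the config and stop criterion are illustrative.

# Sketch, assuming Ray 1.x; config values are illustrative assumptions.
from ray import tune
import os
import torch

analysis = tune.run(
    train_convnet,
    config={"lr": tune.loguniform(1e-4, 1e-1), "momentum": 0.9},
    metric="mean_accuracy",
    mode="max",
    stop={"training_iteration": 20},  # train_convnet otherwise loops forever
)
# Assumption: on Ray 1.x this is the path to the best trial's checkpoint dir,
# i.e. the directory produced by tune.checkpoint_dir() in train_convnet.
best_checkpoint_dir = analysis.best_checkpoint
state = torch.load(os.path.join(best_checkpoint_dir, "checkpoint"))
model = ConvNet()
model.load_state_dict(state["model_state_dict"])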
def function_trainable(config):
    num_iters = int(config["num_iters"])
    sleep_time = config["sleep_time"]
    score = config["score"]

    checkpoint_iters = config["checkpoint_iters"]
    checkpoint_size_b = config["checkpoint_size_b"]
    checkpoint_num_items = checkpoint_size_b // 8  # np.float64

    for i in range(num_iters):
        if checkpoint_iters >= 0 and checkpoint_size_b > 0 and \
                i % checkpoint_iters == 0:
            with tune.checkpoint_dir(step=i) as dir:
                checkpoint_file = os.path.join(dir, "bogus.ckpt")
                checkpoint_data = np.random.uniform(
                    0, 1, size=checkpoint_num_items)
                with open(checkpoint_file, "wb") as fp:
                    pickle.dump(checkpoint_data, fp)

        tune.report(score=i + score)
        time.sleep(sleep_time)
def post_epoch_actions(trainer_instance: Engine):
    # evaluate model on validation set
    evaluator.run(val_loader)
    state_val_metrics = evaluator.state.metrics

    current_epoch: int = trainer_instance.state.epoch
    with tune.checkpoint_dir(current_epoch) as local_checkpoint_dir:
        # save model, optimizer and trainer checkpoints
        path = os.path.join(local_checkpoint_dir, "checkpoint")
        torch.save((model.state_dict(), optimizer.state_dict(),
                    trainer_instance.state_dict(), evaluator.state_dict()),
                   path)

    # report validation scores to ray-tune
    report_dict: dict = {
        **state_val_metrics,
        "done": current_epoch == epochs
    }
    tune.report(**report_dict)
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)
    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".
            format(experiment_name, experiment_id, checkpoint_num,
                   checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        # Only needed once at the start, to save the imported model.
        chk_freq = 1

    while True:
        result = trainer.train()
        tune.report(**result)
        if trainer._iteration % chk_freq == 0:
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
def _do_eval(self):
    results = self._func()

    if results:
        assert isinstance(
            results, dict
        ), "Eval function must return a dict. Got {} instead.".format(results)

        flattened_results = flatten_results_dict(results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v)
                )

    # Remove extra memory cache of main process due to evaluation
    torch.cuda.empty_cache()

    self.step += 1

    # Here we save a checkpoint. It is automatically registered with
    # Ray Tune and will potentially be passed as the `checkpoint_dir`
    # parameter in future iterations.
    with tune.checkpoint_dir(step=self.step) as checkpoint_dir:
        additional_state = {"iteration": int(self.trainer.iter)}
        Checkpointer(
            # Assume you want to save checkpoints together with logs/statistics
            self.trainer.model,
            checkpoint_dir,
            save_to_disk=True,
            optimizer=self.trainer.optimizer,
            scheduler=self.trainer.scheduler,
        ).save(name="checkpoint", **additional_state)

    metrics = dict(
        r1=results['Rank-1'],
        map=results['mAP'],
        score=(results['Rank-1'] + results['mAP']) / 2)
    tune.report(**metrics)