def after_iteration(self, model, epoch, evals_log):
    """Log the finished iteration to the fault-tolerance manager.

    The ray.get blocks until the manager has recorded this iteration,
    so its state is up to date before the next one starts.  An optional
    artificial delay simulates slow training; rank 0 also advances the
    manager's boost-round counter.
    """
    rank = get_actor_rank()
    # Block until the manager has recorded this iteration.
    ray.get(self.ft_manager.log_iteration.remote(rank, epoch))
    if self.training_delay > 0:
        time.sleep(self.training_delay)
    if rank == 0:
        ray.get(self.ft_manager.inc_boost_round.remote(rank))
Beispiel #2
0
        def after_iteration(self, model, epoch, evals_log):
            """Push (epoch, timestamp) for the target rank; fail once.

            At ``fail_iteration`` the target rank exits with status 1.
            ``die_lock_file`` acts as a marker so the failure is only
            triggered a single time across restarts.
            """
            on_target_rank = get_actor_rank() == actor_rank
            if on_target_rank:
                put_queue((epoch, time.time()))
            if (on_target_rank
                    and epoch == fail_iteration
                    and not os.path.exists(die_lock_file)):
                # Create the marker file first so a restarted actor
                # does not fail again.
                with open(die_lock_file, "wt") as lock:
                    lock.write("")
                time.sleep(2)
                import sys
                print(f"Testing: Rank {get_actor_rank()} will now fail.")
                sys.exit(1)
Beispiel #3
0
        def after_iteration(self, model, epoch, evals_log):
            """Push (epoch, timestamp) for the target rank; die hard once.

            At ``fail_iteration`` the target rank kills its own process
            with signal 9 (SIGKILL).  ``die_lock_file`` is the one-shot
            marker preventing a repeat after restart.
            """
            on_target_rank = get_actor_rank() == actor_rank
            if on_target_rank:
                put_queue((epoch, time.time()))
            should_die = (on_target_rank
                          and epoch == fail_iteration
                          and not os.path.exists(die_lock_file))
            if should_die:
                # Get PID
                pid = os.getpid()
                print(f"Killing process: {pid}")
                # Write the marker before dying so the kill fires once.
                with open(die_lock_file, "wt") as marker:
                    marker.write("")

                time.sleep(2)
                print(f"Testing: Rank {get_actor_rank()} will now die.")
                os.kill(pid, 9)
 def before_iteration(self, model, epoch, evals_log):
     """Kill this actor's process when the fault-tolerance manager says so.

     Queries ``ft_manager.should_die`` for this rank; on a positive
     answer, sends SIGKILL to the current process.
     """
     if not ray.get(self.ft_manager.should_die.remote(get_actor_rank())):
         return
     pid = os.getpid()
     print(f"Killing process: {pid}")
     print(f"Rank {get_actor_rank()} will now die.")
     time.sleep(1)
     os.kill(pid, 9)
     time.sleep(10)  # Don't continue training, just die
Beispiel #5
0
    def after_iteration(self, model, epoch, evals_log):
        """At the configured iteration, exactly one designated rank dies.

        The shared ``_state`` actor coordinates so that only the first
        rank to win ``set_failed`` kills its own process (SIGKILL); every
        other rank returns without side effects.
        """
        if epoch != self._iteration:
            return
        rank = get_actor_rank()
        if rank not in self._ranks:
            return
        if ray.get(self._state.has_failed.remote(self._id)):
            return
        if not ray.get(self._state.set_failed.remote(self._id)):
            # Another rank is already about to fail
            return

        pid = os.getpid()
        print(f"Killing process: {pid} for actor rank {rank}")
        time.sleep(1)
        os.kill(pid, 9)
Beispiel #6
0
 def after_iteration(self, model, epoch: int, evals_log: Dict):
     """Rank 0 enqueues a deferred checkpoint-creation callable."""
     if get_actor_rank() != 0:
         return
     put_queue(lambda: self._create_checkpoint(
         model, epoch, self._filename, self._frequency))
Beispiel #7
0
 def after_iteration(self, model, epoch: int, evals_log: Dict):
     """Rank 0 builds the metrics dict and enqueues a tune.report call."""
     if get_actor_rank() != 0:
         return
     metrics = self._get_report_dict(evals_log)
     put_queue(lambda: tune.report(**metrics))
Beispiel #8
0
 def after_iteration(self, model, epoch: int, evals_log: Dict):
     """Rank 0 delegates to the checkpoint and report sub-callbacks.

     Checkpoint runs first, then report — same order the callers rely on.
     """
     if get_actor_rank() != 0:
         return
     for sub_callback in (self._checkpoint, self._report):
         sub_callback.after_iteration(model, epoch, evals_log)
Beispiel #9
0
 def before_iteration(self, model, epoch, evals_log):
     """Rank 3 prints its progress; every rank enqueues the world size."""
     rank = get_actor_rank()
     if rank == 3:
         print(f"[Rank {get_actor_rank()}] I am at iteration {epoch}")
     put_queue(get_world_size())
Beispiel #10
0
 def __call__(self, env):
     """Rank 0 turns the evaluation results into a dict and enqueues a
     deferred tune.report call."""
     if get_actor_rank() != 0:
         return
     results = dict(env.evaluation_result_list)
     put_queue(lambda: tune.report(**results))
Beispiel #11
0
 def after_iteration(self, model, epoch, evals_log):
     """Report this actor's rank via stdout and the result queue."""
     rank = get_actor_rank()
     print(f"My rank: {get_actor_rank()}")
     put_queue(("rank", rank))
Beispiel #12
0
 def callback(env):
     """Print this actor's rank and push it onto the result queue."""
     rank = get_actor_rank()
     print(f"My rank: {get_actor_rank()}")
     put_queue(("rank", rank))