def testTrainingContinuationKilled(self):
    """Training must still deliver all 20 rounds when one actor dies.

    ``xgboost_ray.main._shutdown`` is replaced by a mock whose side effect
    records the actor list it was called with, so the final actor state
    can be checked once training has returned.
    """
    extra_results = {}
    captured = {}

    def record_actors(actors, *args, **kwargs):
        # Keep a copy of the actors handed to the mocked shutdown call.
        captured["actors"] = actors.copy()
        # DEFAULT tells the mock to fall back to its regular return value.
        return DEFAULT

    with patch("xgboost_ray.main._shutdown", side_effect=record_actors):
        dtrain = RayDMatrix(self.x, self.y)
        kill_callbacks = [_kill_callback(self.die_lock_file)]
        ray_params = RayParams(max_actor_restarts=1, num_actors=2)
        bst = train(
            self.params,
            dtrain,
            callbacks=kill_callbacks,
            num_boost_round=20,
            ray_params=ray_params,
            additional_results=extra_results)

    self.assertEqual(20, get_num_trees(bst))

    # Predict locally with plain xgboost and compare against the labels.
    local_matrix = xgb.DMatrix(self.x)
    pred_y = bst.predict(local_matrix)
    self.assertSequenceEqual(list(self.y), list(pred_y))
    print(f"Got correct predictions: {pred_y}")

    # Training is expected to end with two working actors.
    final_actors = captured["actors"]
    self.assertTrue(final_actors[0])
    self.assertTrue(final_actors[1])

    # Two workers finished, so N=32
    self.assertEqual(extra_results["total_n"], 32)
def testTrainingContinuationElasticMultiKilled(self):
    """Elastic training must still report 20 boost rounds after two failures.

    Two kill callbacks are registered: one for actor rank 0 at iteration 6
    and one for actor rank 1 at iteration 14, each with its own lock file.
    """
    # Root logger at DEBUG (numeric level 10).
    logging.getLogger().setLevel(logging.DEBUG)

    extra_results = {}

    dtrain = RayDMatrix(self.x, self.y)
    first_kill = _kill_callback(
        self.die_lock_file, fail_iteration=6, actor_rank=0)
    second_kill = _kill_callback(
        self.die_lock_file_2, fail_iteration=14, actor_rank=1)
    ray_params = RayParams(
        max_actor_restarts=2,
        num_actors=2,
        elastic_training=True,
        max_failed_actors=2)

    bst = train(
        self.params,
        dtrain,
        callbacks=[first_kill, second_kill],
        num_boost_round=20,
        ray_params=ray_params,
        additional_results=extra_results)

    self.assertEqual(20, get_num_trees(bst))

    # Predict locally with plain xgboost and compare against the labels.
    local_matrix = xgb.DMatrix(self.x)
    pred_y = bst.predict(local_matrix)
    self.assertSequenceEqual(list(self.y), list(pred_y))
    print(f"Got correct predictions: {pred_y}")
def testTrainPredict(self, init=True, remote=None, **ray_param_dict):
    """Train with an evaluation set, then predict through ``predict``.

    Args:
        init: When true, call ``ray.init`` with 2 CPUs and no GPUs first.
        remote: Forwarded as ``_remote`` to both ``train`` and ``predict``.
        **ray_param_dict: Extra keyword arguments for every ``RayParams``
            built here (in addition to ``num_actors=2``).
    """
    if init:
        ray.init(num_cpus=2, num_gpus=0)

    train_matrix = RayDMatrix(self.x, self.y)
    eval_history = {}

    bst = train(
        self.params,
        train_matrix,
        num_boost_round=38,
        ray_params=RayParams(num_actors=2, **ray_param_dict),
        evals=[(train_matrix, "dtrain")],
        evals_result=eval_history,
        _remote=remote)

    self.assertEqual(get_num_trees(bst), 38)
    # The evaluation set was registered under the name "dtrain".
    self.assertTrue("dtrain" in eval_history)

    # Predict on the features only and compare against the labels.
    feature_matrix = RayDMatrix(self.x)
    pred_y = predict(
        bst,
        feature_matrix,
        ray_params=RayParams(num_actors=2, **ray_param_dict),
        _remote=remote)
    self.assertSequenceEqual(list(self.y), list(pred_y))
def testTrainingContinuationElasticKilledRestarted(self):
    """Elastic training should continue after an actor died and restart it.

    A remote ``FaultToleranceManager`` is asked to schedule a kill for
    rank 0 at boost round 6 and a delayed return for rank 1 between boost
    rounds 12 and 21. ``xgboost_ray.main._shutdown`` is mocked so the
    final actor list can be inspected.
    """
    # Root logger at DEBUG (numeric level 10).
    logging.getLogger().setLevel(logging.DEBUG)

    manager = FaultToleranceManager.remote()
    manager.schedule_kill.remote(rank=0, boost_round=6)
    manager.delay_return.remote(
        rank=1, start_boost_round=12, end_boost_round=21)

    delayed_loading = DelayedLoadingCallback(
        manager, reload_data=True, sleep_time=0.1)
    die_on_schedule = DieCallback(manager, training_delay=0.25)

    extra_results = {}
    captured = {}

    def record_actors(actors, *args, **kwargs):
        # Keep a copy of the actors handed to the mocked shutdown call.
        captured["actors"] = actors.copy()
        # DEFAULT tells the mock to fall back to its regular return value.
        return DEFAULT

    with patch("xgboost_ray.main._shutdown", side_effect=record_actors):
        dtrain = RayDMatrix(self.x, self.y)
        ray_params = RayParams(
            max_actor_restarts=1,
            num_actors=2,
            elastic_training=True,
            max_failed_actors=1,
            distributed_callbacks=[delayed_loading])
        bst = train(
            self.params,
            dtrain,
            callbacks=[die_on_schedule],
            num_boost_round=20,
            ray_params=ray_params,
            additional_results=extra_results)

    self.assertEqual(20, get_num_trees(bst))

    # Predict locally with plain xgboost and compare against the labels.
    local_matrix = xgb.DMatrix(self.x)
    pred_y = bst.predict(local_matrix)
    self.assertSequenceEqual(list(self.y), list(pred_y))
    print(f"Got correct predictions: {pred_y}")

    # First actor gets recreated, so both slots must be populated.
    final_actors = captured["actors"]
    self.assertTrue(final_actors[0])
    self.assertTrue(final_actors[1])

    # Both workers finished, so n=32
    self.assertEqual(extra_results["total_n"], 32)
def testTrainingContinuationElasticKilledRestarted(self):
    """Elastic training should continue after an actor died and restart it.

    Uses a kill callback at iteration 6 followed by two sleep callbacks
    (15 seconds at iteration 7, 5 seconds at iteration 9).
    ``xgboost_ray.main._shutdown`` is mocked so the final actor list can
    be inspected.
    """
    # Root logger at DEBUG (numeric level 10).
    logging.getLogger().setLevel(logging.DEBUG)

    extra_results = {}
    captured = {}

    def record_actors(actors, *args, **kwargs):
        # Keep a copy of the actors handed to the mocked shutdown call.
        captured["actors"] = actors.copy()
        # DEFAULT tells the mock to fall back to its regular return value.
        return DEFAULT

    with patch("xgboost_ray.main._shutdown", side_effect=record_actors):
        dtrain = RayDMatrix(self.x, self.y)
        training_callbacks = [
            _kill_callback(self.die_lock_file, fail_iteration=6),
            _sleep_callback(sleep_iteration=7, sleep_seconds=15),
            _sleep_callback(sleep_iteration=9, sleep_seconds=5),
        ]
        ray_params = RayParams(
            max_actor_restarts=1,
            num_actors=2,
            elastic_training=True,
            max_failed_actors=1)
        bst = train(
            self.params,
            dtrain,
            callbacks=training_callbacks,
            num_boost_round=20,
            ray_params=ray_params,
            additional_results=extra_results)

    self.assertEqual(20, get_num_trees(bst))

    # Predict locally with plain xgboost and compare against the labels.
    local_matrix = xgb.DMatrix(self.x)
    pred_y = bst.predict(local_matrix)
    self.assertSequenceEqual(list(self.y), list(pred_y))
    print(f"Got correct predictions: {pred_y}")

    # First actor gets recreated, so both slots must be populated.
    final_actors = captured["actors"]
    self.assertTrue(final_actors[0])
    self.assertTrue(final_actors[1])

    # Both workers finished, so n=32
    self.assertEqual(extra_results["total_n"], 32)
def testTrainPredict(self,
                     init=True,
                     remote=None,
                     softprob=False,
                     **ray_param_dict):
    """Train with an evaluation set, then predict through ``predict``.

    Args:
        init: When true, call ``ray.init`` with 2 CPUs and no GPUs first.
        remote: Forwarded as ``_remote`` to both ``train`` and ``predict``.
        softprob: When true, train on a copy of ``self.params`` whose
            objective is ``"multi:softprob"`` and reduce the predictions
            with ``argmax`` before comparing them to the labels.
        **ray_param_dict: Extra keyword arguments for every ``RayParams``
            built here (in addition to ``num_actors=2``).
    """
    if init:
        ray.init(num_cpus=2, num_gpus=0)

    train_matrix = RayDMatrix(self.x, self.y)

    if softprob:
        # Work on a copy so the shared ``self.params`` stays untouched.
        params = self.params.copy()
        params["objective"] = "multi:softprob"
    else:
        params = self.params

    eval_history = {}
    bst = train(
        params,
        train_matrix,
        num_boost_round=38,
        ray_params=RayParams(num_actors=2, **ray_param_dict),
        evals=[(train_matrix, "dtrain")],
        evals_result=eval_history,
        _remote=remote)

    self.assertEqual(get_num_trees(bst), 38)
    # The evaluation set was registered under the name "dtrain".
    self.assertTrue("dtrain" in eval_history)

    feature_matrix = RayDMatrix(self.x)
    pred_y = predict(
        bst,
        feature_matrix,
        ray_params=RayParams(num_actors=2, **ray_param_dict),
        _remote=remote)

    if softprob:
        # Expect one prediction column per distinct label, then pick the
        # highest-scoring column of each row as the predicted class.
        self.assertEqual(pred_y.shape[1], len(np.unique(self.y)))
        pred_y = np.argmax(pred_y, axis=1)

    self.assertSequenceEqual(list(self.y), list(pred_y))
def testTrainingContinuationElasticKilled(self):
    """Elastic training should continue after one actor died.

    ``RXGB_ELASTIC_RESTART_DISABLED`` is set to ``"1"`` only for the
    duration of the ``train`` call, and the test expects the first actor
    slot to stay empty afterwards. ``xgboost_ray.main._shutdown`` is
    mocked so the final actor list can be inspected.
    """
    # Root logger at DEBUG (numeric level 10).
    logging.getLogger().setLevel(logging.DEBUG)

    additional_results = {}
    keep_actors = {}

    def keep(actors, *args, **kwargs):
        # Keep a copy of the actors handed to the mocked shutdown call.
        keep_actors["actors"] = actors.copy()
        # DEFAULT tells the mock to fall back to its regular return value.
        return DEFAULT

    # patch.dict restores os.environ on exit, so the restart-disabled flag
    # cannot leak into tests that run later in the same process.
    with patch.dict(os.environ, {"RXGB_ELASTIC_RESTART_DISABLED": "1"}), \
            patch("xgboost_ray.main._shutdown") as mocked:
        mocked.side_effect = keep
        bst = train(
            self.params,
            RayDMatrix(self.x, self.y),
            callbacks=[_kill_callback(self.die_lock_file)],
            num_boost_round=20,
            ray_params=RayParams(
                max_actor_restarts=1,
                num_actors=2,
                elastic_training=True,
                max_failed_actors=1),
            additional_results=additional_results)

    self.assertEqual(20, get_num_trees(bst))

    # Predict locally with plain xgboost and compare against the labels.
    x_mat = xgb.DMatrix(self.x)
    pred_y = bst.predict(x_mat)
    self.assertSequenceEqual(list(self.y), list(pred_y))
    print(f"Got correct predictions: {pred_y}")

    actors = keep_actors["actors"]
    # First actor does not get recreated
    self.assertIsNone(actors[0])
    self.assertTrue(actors[1])

    # Only one worker finished, so n=16
    self.assertEqual(additional_results["total_n"], 16)