def _map_to_lgb_ray_params(params: Dict[str, Any]) -> "RayParams":
    """Translate a generic trainer parameter dict into lightgbm_ray RayParams.

    Args:
        params: Generic training parameters. Recognized keys:
            - ``"num_workers"``: mapped to ``num_actors``.
            - ``"resources_per_worker"``: a dict whose optional ``"CPU"`` and
              ``"GPU"`` entries map to ``cpus_per_actor`` and
              ``gpus_per_actor`` respectively.
            All other keys are ignored.

    Returns:
        A ``lightgbm_ray.RayParams`` instance (note: NOT a dict — the
        original annotation was incorrect) with
        ``allow_less_than_two_cpus`` enabled.
    """
    from lightgbm_ray import RayParams

    mapped: Dict[str, Any] = {}
    for key, value in params.items():
        if key == "num_workers":
            mapped["num_actors"] = value
        elif key == "resources_per_worker":
            if "CPU" in value:
                mapped["cpus_per_actor"] = value["CPU"]
            if "GPU" in value:
                mapped["gpus_per_actor"] = value["GPU"]

    ray_params = RayParams(**mapped)
    # lightgbm_ray normally insists on >= 2 CPUs per actor; relax that so
    # small test clusters can still schedule training actors.
    ray_params.allow_less_than_two_cpus = True
    return ray_params
from lightgbm_ray import RayParams from ray.util.lightgbm.release_test_util import ( train_ray, FailureState, FailureInjection, TrackingCallback, ) if __name__ == "__main__": ray.init(address="auto") failure_state = FailureState.remote() ray_params = RayParams(max_actor_restarts=2, num_actors=4, cpus_per_actor=4, gpus_per_actor=0) _, additional_results, _ = train_ray( path="/data/classification.parquet", num_workers=None, num_boost_rounds=100, num_files=200, regression=False, use_gpu=False, ray_params=ray_params, lightgbm_params=None, callbacks=[ TrackingCallback(), FailureInjection(id="first_fail", state=failure_state,
from ray.util.lightgbm.release_test_util import train_ray if __name__ == "__main__": addr = os.environ.get("RAY_ADDRESS") job_name = os.environ.get("RAY_JOB_NAME", "train_small") if addr.startswith("anyscale://"): ray.init(address=addr, job_name=job_name) else: ray.init(address="auto") output = os.environ["TEST_OUTPUT_JSON"] ray_params = RayParams( elastic_training=False, max_actor_restarts=2, num_actors=4, cpus_per_actor=4, gpus_per_actor=0, ) start = time.time() @ray.remote(num_cpus=0) def train(): os.environ["TEST_OUTPUT_JSON"] = output train_ray( path="/data/classification.parquet", num_workers=4, num_boost_rounds=100, num_files=25, regression=False,
lightgbm_params=config, ) if __name__ == "__main__": search_space = { "eta": tune.loguniform(1e-4, 1e-1), "subsample": tune.uniform(0.5, 1.0), "max_depth": tune.randint(1, 9) } ray.init(address="auto") ray_params = RayParams(elastic_training=False, max_actor_restarts=2, num_actors=32, cpus_per_actor=1, gpus_per_actor=0) start = time.time() analysis = tune.run(tune.with_parameters(train_wrapper, ray_params=ray_params), config=search_space, num_samples=4, resources_per_trial=ray_params.get_tune_resources()) taken = time.time() - start result = { "time_taken": taken, "trial_states": dict(Counter([trial.status for trial in analysis.trials]))
# Create Classification version of target variable df['goodquality'] = [1 if x >= 6 else 0 for x in df['quality']] X = df.drop(['quality', 'goodquality'], axis=1) y = df['goodquality'] print(df['goodquality'].value_counts()) # Normalize feature variables X_features = X X = StandardScaler().fit_transform(X) # Splitting the data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0) model = RayLGBMClassifier( # n_jobs=4, # In LightGBM-Ray, n_jobs sets the number of actors random_state=42) start = time.time() model.fit(X=X_train, y=y_train, ray_params=RayParams(num_actors=3)) print(f"executed LightGBM in {time.time() - start}") y_pred = model.predict(X_test) #converting probabilities into 0 or 1 for i in range(len(y_pred)): if y_pred[i] >= .5: # setting threshold to .5 y_pred[i] = 1 else: y_pred[i] = 0 print(classification_report(y_test, y_pred))
def train_ray(path,
              num_workers,
              num_boost_rounds,
              num_files=0,
              regression=False,
              use_gpu=False,
              ray_params=None,
              lightgbm_params=None,
              **kwargs):
    """Train a LightGBM model with lightgbm_ray on parquet data.

    Args:
        path: Directory (or file) containing the training parquet data.
        num_workers: Number of Ray actors to shard the data across.
        num_boost_rounds: Number of boosting rounds.
        num_files: If truthy, select exactly this many parquet files from
            ``path`` (the file list is repeated if fewer files exist).
        regression: If True train a regression objective, else binary
            classification.
        use_gpu: Train on GPUs; uses a device quantile matrix when ``cupy``
            is importable.
        ray_params: Optional ``RayParams``; a default is built if omitted.
        lightgbm_params: Extra LightGBM config merged into the defaults.
        **kwargs: Forwarded to ``lightgbm_ray.train``.

    Returns:
        Tuple of ``(model, additional_results, seconds_taken)``.

    Raises:
        ValueError: If ``path`` does not exist, or if ``num_files`` is
            requested but no parquet files are found under ``path``.
    """
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        raise ValueError(f"Path does not exist: {path}")

    if num_files:
        files = sorted(glob.glob(f"{path}/**/*.parquet"))
        # BUGFIX: with zero matches the doubling loop below would spin
        # forever ([] + [] never grows) — fail fast instead.
        if not files:
            raise ValueError(f"No .parquet files found under: {path}")
        # Repeat the file list until at least num_files entries exist.
        while num_files > len(files):
            files = files + files
        path = files[0:num_files]

    # Prefer a device quantile matrix on GPU, but only when cupy is present.
    use_device_matrix = False
    if use_gpu:
        try:
            import cupy  # noqa: F401

            use_device_matrix = True
        except ImportError:
            use_device_matrix = False

    if use_device_matrix:
        dtrain = RayDeviceQuantileDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)
    else:
        dtrain = RayDMatrix(
            path,
            num_actors=num_workers,
            label="labels",
            ignore=["partition"],
            filetype=RayFileType.PARQUET)

    config = {"device": "cpu" if not use_gpu else "gpu"}
    if not regression:
        # Classification
        config.update({
            "objective": "binary",
            "metric": ["binary_logloss", "binary_error"],
        })
    else:
        # Regression
        config.update({
            "objective": "regression",
            "metric": ["l2", "rmse"],
        })
    if lightgbm_params:
        config.update(lightgbm_params)

    start = time.time()
    evals_result = {}
    additional_results = {}
    bst = train(
        config,
        dtrain,
        evals_result=evals_result,
        additional_results=additional_results,
        num_boost_round=num_boost_rounds,
        ray_params=ray_params or RayParams(
            max_actor_restarts=2,
            num_actors=num_workers,
            cpus_per_actor=2,
            gpus_per_actor=0 if not use_gpu else 1),
        evals=[(dtrain, "train")],
        **kwargs)
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    out_file = os.path.expanduser(
        "~/benchmark_{}.lgbm".format("cpu" if not use_gpu else "gpu"))
    bst.booster_.save_model(out_file)

    # Report the final value of the primary metric from the last round.
    print("Final training error: {:.4f}".format(
        evals_result["train"]["binary_error" if not regression else "rmse"]
        [-1]))
    return bst, additional_results, taken