import os
import sys

import numpy as np

sys.path.insert(0, '../datasets')
import misc

N_SAMPLES = np.array([1e5, 1e6, 1e7], dtype=int)
N_FEATURES = np.array([1, 5, 10], dtype=int)

# Create the list of datasets to benchmark on
if dataset == 'random':
    array_data = [misc.generate_samples(ns, nf, RND_SEED)
                  for ns in N_SAMPLES
                  for nf in N_FEATURES]
elif dataset == 'cover_type':
    array_data = [misc.load_cover_type(RND_SEED)]
elif dataset == 'higgs':
    # Select a subset of samples
    array_data = [misc.load_higgs(random_state=RND_SEED, n_samples=ns)
                  for ns in N_SAMPLES]
else:
    raise ValueError("The dataset is not known. The possible choices are:"
                     " 'random', 'cover_type', and 'higgs'.")

# Save only the timings for the moment
res_lgbm = [[data[0].shape, p, misc.bench(bench_lgbm, data, n=n_try, **p)]
            for p in params_list
            for data in array_data]

# Create the results directory if it does not exist yet
if not os.path.exists(store_dir):
    os.makedirs(store_dir)
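# ``misc.bench`` is a repository helper whose implementation is not shown in
# this excerpt. Below is a minimal sketch of what such a timing helper could
# look like, assuming it runs the benchmarked function ``n`` times and reports
# the best wall-clock time (the exact signature and return value here are
# assumptions, not the repository's actual code):
import time


def bench(func, data, n=10, **params):
    """Call ``func(data, **params)`` ``n`` times; return the best timing."""
    timings = []
    for _ in range(n):
        start = time.time()
        func(data, **params)
        timings.append(time.time() - start)
    return min(timings)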
import sys

import lightgbm as lgb
import numpy as np
import yaml
from sklearn.model_selection import ParameterGrid

sys.path.insert(0, '../datasets')
from misc import load_higgs

configuration_path = "../params_benchmark/parameters_higgs.conf"
config_name = 'lightgbm'

# Load the LightGBM parameters from the configuration file
with open(configuration_path, 'r') as stream:
    params = yaml.safe_load(stream)[config_name]

# ParameterGrid expects each value to be a list of candidate values
params = {key: (value if isinstance(value, list) else [value])
          for key, value in params.items()}
params_grid = list(ParameterGrid(params))
# Keep the combination of parameters with a maximum depth of 8
params_selected = [d for d in params_grid if d['max_depth'] == 8][0]

N_SAMPLES = 1e7
data = load_higgs(random_state=42, n_samples=int(N_SAMPLES))

# Extract the parameter which belongs to the dataset rather than the model
max_bin = params_selected.pop('max_bin')
lgbm_training = lgb.Dataset(data[0], label=data[1],
                            params={'max_bin': max_bin})

n_est = params_selected.pop('n_estimators')

# Derive the number of leaves from the maximum depth
params_selected['num_leaves'] = np.power(2, params_selected['max_depth'] - 1)
# Do not limit the depth of the trees
params_selected['max_depth'] = -1

gbm = lgb.train(params_selected, lgbm_training, num_boost_round=n_est)
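# As a quick sanity check, the trained booster can be scored with
# scikit-learn. This is only an illustration: it evaluates on the training
# data itself (no held-out split is made above), so the score is optimistic,
# and it assumes the configured objective is binary classification so that
# ``predict`` returns probabilities.
from sklearn.metrics import roc_auc_score

y_proba = gbm.predict(data[0])
print('Training ROC AUC: {:.4f}'.format(roc_auc_score(data[1], y_proba)))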