def load_data_cache(use_test=False):
    """Load the airlines splits, one-hot encode the labels and scale the inputs.

    Args:
        use_test: When true, the training and validation splits are
            concatenated to form the training set and the test split is
            returned in the validation slot. When false, the test split
            returned by the loader is discarded.

    Returns:
        ``(X_train, y_train), (X_valid, y_valid)`` after preprocessing. Both
        preprocessors are fitted on the training data only and then applied
        to the second split.
    """
    # A dedicated RandomState (seed 42) is handed to the loader.
    rng = np.random.RandomState(seed=42)

    if use_test:
        print("!!! USING TEST DATA !!!")

    # The loader is called with the same arguments in both modes; only the
    # way the three returned splits are combined differs.
    train_split, valid_split, test_split = airlines.load_data(
        random_state=rng, test_size=0.33, valid_size=0.33 * (1 - 0.33)
    )
    X_train, y_train = train_split
    X_valid, y_valid = valid_split

    if use_test:
        X_test, y_test = test_split
        # Train on train + valid, evaluate on the held-out test split.
        X_train = np.concatenate([X_train, X_valid])
        y_train = np.concatenate([y_train, y_valid])
        X_valid, y_valid = X_test, y_test

    # Labels: reshape to a single column, then one-hot encode.
    # NOTE(review): `preprocessing` is presumably sklearn.preprocessing and
    # `.toarray()` presumably densifies a sparse result -- confirm.
    label_encoder = preprocessing.OneHotEncoder()
    y_train_column = y_train.reshape(-1, 1)
    y_valid_column = y_valid.reshape(-1, 1)
    y_train = label_encoder.fit_transform(y_train_column).toarray()
    y_valid = label_encoder.transform(y_valid_column).toarray()

    # Inputs: scaler comes from `minmaxstdscaler()`, defined outside this block.
    input_scaler = minmaxstdscaler()
    X_train = input_scaler.fit_transform(X_train)
    X_valid = input_scaler.transform(X_valid)

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f"X_valid shape: {np.shape(X_valid)}")
    print(f"y_valid shape: {np.shape(y_valid)}")

    return (X_train, y_train), (X_valid, y_valid)
def test_config(config):
    """Fit a random forest built from ``config`` and print its accuracies.

    Args:
        config: Mapping of keyword arguments expanded into
            ``RandomForestClassifier`` alongside ``n_jobs=8`` and a seeded
            random state.

    The classifier is fitted on the training split and its ``score`` is
    printed for the training, validation and test splits.
    """
    import numpy as np
    from sklearn.utils import check_random_state
    from sklearn.ensemble import RandomForestClassifier
    from deephyper.benchmark.datasets import airlines as dataset

    # Fractions forwarded to the loader as test_size / valid_size.
    test_fraction = 0.33
    valid_fraction = (1 - test_fraction) * 0.33

    train, valid, test = dataset.load_data(
        random_state=np.random.RandomState(seed=42),
        test_size=test_fraction,
        valid_size=valid_fraction,
    )

    # The classifier gets its own random state, separate from the data one.
    model = RandomForestClassifier(
        n_jobs=8, random_state=check_random_state(42), **config
    )
    model.fit(*train)

    # Each split is an (X, y) pair, unpacked directly into score().
    acc_train, acc_valid, acc_test = [
        model.score(*subset) for subset in (train, valid, test)
    ]

    print(f"Accuracy on Training: {acc_train:.3f}")
    print(f"Accuracy on Validation: {acc_valid:.3f}")
    print(f"Accuracy on Testing: {acc_test:.3f}")
def load_data():
    """Load the training and validation splits of the dataset.

    Returns:
        ``(X_train, y_train), (X_valid, y_valid)``. The third split returned
        by ``dataset.load_data`` (the test set) is discarded.
    """
    # Fraction of the full dataset requested for the test split.
    test_fraction = 0.33
    # Fraction requested for the validation split.
    # NOTE(review): the original comment describes this as a proportion of the
    # data left after removing the test set, chosen so that test and validation
    # end up the same size -- confirm against dataset.load_data.
    valid_fraction = (1 - test_fraction) * 0.33

    # An explicit, locally seeded RandomState is passed so the same split is
    # produced on every call and the test set never mixes with the training or
    # validation data; no global seed is set here.
    train_split, valid_split, _ = dataset.load_data(
        random_state=np.random.RandomState(seed=42),
        test_size=test_fraction,
        valid_size=valid_fraction,
    )
    X_train, y_train = train_split
    X_valid, y_valid = valid_split

    # To speed up the search, the training data can be sub-sampled here;
    # "n_samples" controls the size of the new training data:
    # X_train, y_train = resample(X_train, y_train, n_samples=int(1e4))

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f"X_valid shape: {np.shape(X_valid)}")
    print(f"y_valid shape: {np.shape(y_valid)}")

    return (X_train, y_train), (X_valid, y_valid)
def test_load_data_airlines():
    """Load the airlines dataset and print the shape of every returned split."""
    from deephyper.benchmark.datasets import airlines
    import numpy as np

    # Labels paired positionally with the splits returned by the loader; the
    # trailing space in "test " is part of the printed text.
    subset_names = ["train", "valid", "test "]
    splits = airlines.load_data(random_state=42, summary=True)

    for (features, targets), subset_name in zip(splits, subset_names):
        features_shape = np.shape(features)
        targets_shape = np.shape(targets)
        print(
            f"X_{subset_name} shape: ",
            features_shape,
            f", y_{subset_name} shape: ",
            targets_shape,
        )
from dhproj.advanced_hpo.mapping import CLASSIFIERS
from deephyper.benchmark.datasets import airlines as dataset
from sklearn.utils import check_random_state

# Script: fit every classifier in CLASSIFIERS on the airlines training split
# and print its accuracy on the training, validation and test splits.

# Separate random states for the classifiers and for the data split.
# NOTE(review): rs_clf is a single object shared by every classifier in the
# loop below, so a classifier's randomness may depend on the ones fitted
# before it -- confirm this is intended.
rs_clf = check_random_state(42)
rs_data = check_random_state(42)

# Fractions forwarded to the loader as test_size / valid_size.
ratio_test = 0.33
ratio_valid = (1 - ratio_test) * 0.33

train, valid, test = dataset.load_data(
    random_state=rs_data, test_size=ratio_test, valid_size=ratio_valid
)

for clf_name, clf_class in CLASSIFIERS.items():
    print(clf_name)

    clf = clf_class(random_state=rs_clf)
    clf.fit(*train)

    # Each split is an (X, y) pair, unpacked directly into score().
    acc_train, acc_valid, acc_test = [
        clf.score(*subset) for subset in (train, valid, test)
    ]

    print(f"Accuracy on Training: {acc_train:.3f}")
    print(f"Accuracy on Validation: {acc_valid:.3f}")
    print(f"Accuracy on Testing: {acc_test:.3f}\n")
def load_data():
    """Return the training and validation splits of the airlines dataset.

    ``airlines.load_data`` is called with ``random_state=42``; the third
    element it returns is dropped.
    """
    splits = airlines.load_data(random_state=42)
    train_split, valid_split, _ = splits
    return train_split, valid_split