def load_data_cache(use_test=False, out_ohe=True):
    """Load and preprocess the "albert" dataset.

    The data is split into train/valid(/test), missing values are imputed
    with the column mean, non-categorical columns are MinMax+Std scaled,
    and labels are optionally one-hot encoded.

    :param use_test: when True, fold the validation split into training and
        return the test split in its place.
    :param out_ohe: when True, one-hot encode the outputs.
    :return: ``(X_train, y_train), (X_valid, y_valid), categorical_indicator``
    """
    test_size = 0.33
    valid_size = 0.33 * (1 - test_size)

    if use_test:
        print("!!! USING TEST DATA !!!")

    train, valid, test, categorical_indicator = albert.load_data(
        random_state=RANDOM_STATE,
        test_size=test_size,
        valid_size=valid_size,
        categoricals_to_integers=True,
    )
    X_train, y_train = train
    X_valid, y_valid = valid

    if use_test:
        # Merge the validation split into training and validate on the test split.
        X_train = np.concatenate([X_train, X_valid])
        y_train = np.concatenate([y_train, y_valid])
        X_valid, y_valid = test

    # Replace missing values with mean value
    # https://scikit-learn.org/stable/modules/impute.html
    print("Replacing missing values")
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    X_train = imputer.fit_transform(X_train)
    X_valid = imputer.transform(X_valid)

    # Min Max => Std scaler preprocessing for non categorical variables.
    # A fresh scaler is fitted per column, on the training data only.
    for col, (is_categorical, _) in enumerate(categorical_indicator):
        if is_categorical:
            continue
        column_scaler = minmaxstdscaler()
        X_train[:, col:col + 1] = column_scaler.fit_transform(X_train[:, col:col + 1])
        X_valid[:, col:col + 1] = column_scaler.transform(X_valid[:, col:col + 1])

    # One Hot Encoding of Outputs
    if out_ohe:
        output_encoder = preprocessing.OneHotEncoder()
        y_train = output_encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
        y_valid = output_encoder.transform(y_valid.reshape(-1, 1)).toarray()

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f"X_valid shape: {np.shape(X_valid)}")
    print(f"y_valid shape: {np.shape(y_valid)}")

    return (X_train, y_train), (X_valid, y_valid), categorical_indicator
def load_data_cache(use_test=False):
    """Load and preprocess the "airlines" dataset.

    Outputs are one-hot encoded and inputs are MinMax+Std scaled (fitted on
    the training split only).

    :param use_test: when True, fold the validation split into training and
        return the test split in its place.
    :return: ``(X_train, y_train), (X_valid, y_valid)``
    """
    # Random state
    random_state = np.random.RandomState(seed=42)

    if use_test:
        print("!!! USING TEST DATA !!!")

    train, valid, test = airlines.load_data(
        random_state=random_state, test_size=0.33, valid_size=0.33 * (1 - 0.33)
    )
    X_train, y_train = train
    X_valid, y_valid = valid

    if use_test:
        # Merge the validation split into training and validate on the test split.
        X_train = np.concatenate([X_train, X_valid])
        y_train = np.concatenate([y_train, y_valid])
        X_valid, y_valid = test

    output_encoder = preprocessing.OneHotEncoder()
    y_train = output_encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
    y_valid = output_encoder.transform(y_valid.reshape(-1, 1)).toarray()

    input_scaler = minmaxstdscaler()
    X_train = input_scaler.fit_transform(X_train)
    X_valid = input_scaler.transform(X_valid)

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f"X_valid shape: {np.shape(X_valid)}")
    print(f"y_valid shape: {np.shape(y_valid)}")

    return (X_train, y_train), (X_valid, y_valid)
def __init__(
    self,
    # NOTE(review): default estimator and preprocessor are built once at class
    # definition time and shared by every instance that relies on the defaults
    # — confirm this sharing is intended (fitting one instance fits them all).
    clf=KNeighborsClassifier(n_jobs=4),
    load_data_func=lambda: load_breast_cancer(return_X_y=True),
    preproc=minmaxstdscaler(),
    seed=42,
):
    """Classification baseline on breast-cancer data with a KNN classifier.

    All arguments are forwarded unchanged to the parent class.

    :param clf: the classifier to evaluate.
    :param load_data_func: zero-argument callable returning ``(X, y)``.
    :param preproc: input preprocessor (fit on train, applied to test).
    :param seed: random seed forwarded to the parent.
    """
    super().__init__(
        clf=clf, load_data_func=load_data_func, preproc=preproc, seed=seed
    )
def run(config: dict, load_data: callable) -> float:
    """Run function which can be used for AutoML classification.

    Loads the data, holds out a test split, scales the inputs, instantiates
    the classifier named in ``config`` and returns its test accuracy.

    Args:
        config (dict): hyperparameter configuration; must contain a
            ``"classifier"`` key naming an entry of ``CLASSIFIERS``, plus any
            hyperparameters for that classifier. Mutated in place: a
            ``"random_state"`` key is added.
        load_data (callable): a function returning data as Numpy arrays
            ``(X, y)``.

    Returns:
        float: accuracy on the held-out test split, or ``-1.0`` when the
        classifier could not be instantiated or fitted.
    """
    seed = 42
    config["random_state"] = seed

    X, y = load_data()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=seed
    )

    preproc = minmaxstdscaler()
    X_train = preproc.fit_transform(X_train)
    X_test = preproc.transform(X_test)

    mapping = CLASSIFIERS
    clf_class = mapping[config["classifier"]]

    # keep parameters possible for the current classifier
    sig = signature(clf_class)
    clf_allowed_params = list(sig.parameters.keys())
    clf_params = {
        k: v
        for k, v in config.items()
        if k in clf_allowed_params and not (v in ["nan", "NA"])
    }

    if "n_jobs" in clf_allowed_params:  # performance parameter
        clf_params["n_jobs"] = 8

    try:  # good practice to manage the fail value yourself...
        clf = clf_class(**clf_params)
        clf.fit(X_train, y_train)
        fit_is_complete = True
    # Fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt; only genuine runtime errors should map to failure.
    except Exception:
        fit_is_complete = False

    if fit_is_complete:
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
    else:
        acc = -1.0

    return acc
def __init__(
    self,
    # NOTE(review): default estimator and preprocessor are built once at class
    # definition time and shared by every instance that relies on the defaults
    # — confirm this sharing is intended (fitting one instance fits them all).
    clf=RandomForestRegressor(n_jobs=4, random_state=42),
    # NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and
    # removed in 1.2 — this default breaks on recent sklearn; verify pinning.
    load_data_func=lambda: load_boston(return_X_y=True),
    preproc=minmaxstdscaler(),
    seed=42,
):
    """Regression baseline on the Boston housing data with a random forest.

    All arguments are forwarded unchanged to the parent class.

    :param clf: the regressor to evaluate.
    :param load_data_func: zero-argument callable returning ``(X, y)``.
    :param preproc: input preprocessor (fit on train, applied to test).
    :param seed: random seed forwarded to the parent.
    """
    super().__init__(clf=clf, load_data_func=load_data_func, preproc=preproc, seed=seed)
def __init__(
    self,
    clf=None,
    load_data_func=lambda: load_breast_cancer(return_X_y=True),
    preproc=minmaxstdscaler(),
    seed=42,
):
    """Store the evaluation configuration on the instance.

    :param clf: the classifier to evaluate (may be None).
    :param load_data_func: zero-argument callable returning ``(X, y)``.
    :param preproc: input preprocessor (fit on train, applied to test).
    :param seed: random seed used by the evaluation.
    """
    # Plain attribute storage — no validation or copying is performed.
    self.seed = seed
    self.clf = clf
    self.preproc = preproc
    self.load_data_func = load_data_func
def load_data_cache():
    """Load and preprocess the Covertype dataset.

    Outputs are one-hot encoded and inputs are MinMax+Std scaled (fitted on
    the training split only).

    :return: ``(X_train, y_train), (X_valid, y_valid)``
    """
    # Random state
    random_state = np.random.RandomState(seed=42)

    train, valid, _ = covertype.load_data(random_state=random_state)
    X_train, y_train = train
    X_valid, y_valid = valid

    output_encoder = preprocessing.OneHotEncoder()
    y_train = output_encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
    y_valid = output_encoder.transform(y_valid.reshape(-1, 1)).toarray()

    input_scaler = minmaxstdscaler()
    X_train = input_scaler.fit_transform(X_train)
    X_valid = input_scaler.transform(X_valid)

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f"X_valid shape: {np.shape(X_valid)}")
    print(f"y_valid shape: {np.shape(y_valid)}")

    return (X_train, y_train), (X_valid, y_valid)
def run_autosklearn1(config: dict, load_data: callable) -> float:
    """Run function which can be used for AutoML classification.

    It has to be used with the ``deephyper.sklearn.classifier.problem_autosklearn1`` problem definition which corresponds to:

    .. code-block::

        Configuration space object:
            Hyperparameters:
                C, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                alpha, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                classifier, Type: Categorical, Choices: {RandomForest, Logistic, AdaBoost, KNeighbors, MLP, SVC, XGBoost}, Default: RandomForest
                gamma, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                kernel, Type: Categorical, Choices: {linear, poly, rbf, sigmoid}, Default: linear
                max_depth, Type: UniformInteger, Range: [2, 100], Default: 14, on log-scale
                n_estimators, Type: UniformInteger, Range: [1, 2000], Default: 45, on log-scale
                n_neighbors, Type: UniformInteger, Range: [1, 100], Default: 50
            Conditions:
                (C | classifier == 'Logistic' || C | classifier == 'SVC')
                (gamma | kernel == 'rbf' || gamma | kernel == 'poly' || gamma | kernel == 'sigmoid')
                (n_estimators | classifier == 'RandomForest' || n_estimators | classifier == 'AdaBoost')
                alpha | classifier == 'MLP'
                kernel | classifier == 'SVC'
                max_depth | classifier == 'RandomForest'
                n_neighbors | classifier == 'KNeighbors'

    Args:
        config (dict): an hyperparameter configuration ``dict`` corresponding to the ``deephyper.sklearn.classifier.problem_autosklearn1``.
        load_data (callable): a function returning data as Numpy arrays ``(X, y)``.

    Returns:
        float: returns the accuracy on the validation set.
    """
    seed = 42
    config["random_state"] = seed

    X, y = load_data()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=seed
    )

    preproc = minmaxstdscaler()
    X_train = preproc.fit_transform(X_train)
    X_test = preproc.transform(X_test)

    mapping = CLASSIFIERS
    clf_class = mapping[config["classifier"]]

    # keep parameters possible for the current classifier
    sig = signature(clf_class)
    clf_allowed_params = list(sig.parameters.keys())
    clf_params = {
        k: v
        for k, v in config.items()
        if k in clf_allowed_params and not (v in ["nan", "NA"])
    }

    if "n_jobs" in clf_allowed_params:  # performance parameter
        clf_params["n_jobs"] = 8

    try:  # good practice to manage the fail value yourself...
        clf = clf_class(**clf_params)
        clf.fit(X_train, y_train)
        fit_is_complete = True
    # Fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt; only genuine runtime errors should map to failure.
    except Exception:
        fit_is_complete = False

    if fit_is_complete:
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
    else:
        acc = -1.0

    return acc