def run(config_path, work_station):
    """Run an H2O random hyper-parameter search for every model family enabled in the config.

    Imports the configuration module named by ``config_path``, boots an H2O
    cluster, loads the training frame, then launches one
    ``random_h2o_model_search`` per enabled model family. Shuts the cluster
    down when every search has finished.

    Parameters
    ----------
    config_path : str
        Dotted module path passed to ``importlib.import_module``; the module
        must expose ``SAVE_DIR``, ``H2O_INIT_SETTINGS``, the ``INCLUDE_*``
        flags and the matching ``*_SETTINGS`` dicts.
    work_station : str
        Machine identifier forwarded to ``get_paths`` to resolve directories.
    """
    config = import_module(config_path)

    # Resolve working directories for this station.
    paths = get_paths(station=work_station)
    pkl_dir = paths['pkl_dir']
    h2o_rand_dir, log_dir = config.SAVE_DIR, paths['logs']

    # Get new logger
    logger = get_logger('H2oRandSearch', log_dir)

    meta = pd.read_pickle(pkl_dir + '/meta_df.pkl')
    h2o.init(**config.H2O_INIT_SETTINGS)
    logger.info("Started new H2o session " + str(h2o.cluster().cloud_name))
    credit_data = h2o.upload_file(pkl_dir + "/train_imp_na_df.csv")
    logger.info("Loaded data into cluster")

    # Grid searching parameters: predictors are every column except the target
    # and the meta-data columns; the target is made categorical for classification.
    X = set(credit_data.columns) - {'TARGET'} - set(meta.columns)
    Y = 'TARGET'
    credit_data[Y] = credit_data[Y].asfactor()

    # Shared keyword arguments merged into every model family's settings dict.
    data_info = {
        'X': X,
        'Y': Y,
        'training_frame': credit_data,
        'model_directory': h2o_rand_dir,
        'logger': logger,
        'configuration': config
    }
    # meta is only needed to compute X; release it before the long searches.
    del meta

    # One (flag, settings) pair per model family replaces six copy-pasted
    # if-blocks; order matches the original so searches run in the same sequence.
    searches = [
        ('INCLUDE_GBM', 'GBM_SETTINGS'),
        ('INCLUDE_XGB', 'XGB_SETTINGS'),
        ('INCLUDE_DEEP', 'DEEP_SETTINGS'),
        ('INCLUDE_RF', 'RF_SETTINGS'),
        ('INCLUDE_NAIVE_BAYES', 'NAI_BAYES_SETTINGS'),
        ('INCLUDE_GLM', 'GLM_SETTINGS'),
    ]
    for flag_name, settings_name in searches:
        if getattr(config, flag_name):
            settings = getattr(config, settings_name)
            # NOTE: update() mutates the config module's dict, exactly as the
            # original per-family blocks did.
            settings.update(data_info)
            random_h2o_model_search(**settings)

    logger.info("Completed search. Shutting down cluster " + str(h2o.cluster().cloud_name))
    h2o.cluster().shutdown()
""" TPOT is built on top of several existing Python libraries, including: NumPy, SciPy, scikit-learn, DEAP,update_checker, tqdm, stopit, pandas and xgboost. """ from tpot import TPOTClassifier from sklearn.model_selection import train_test_split import numpy as np from kaggleProjects.directory_table import get_paths seed = 321 paths = get_paths(station='Subgraph') data_dir, pkl_dir = paths['data_dir'], paths['pkl_dir'] train_df = np.load(pkl_dir + r'\train_df.npy') target = np.load(pkl_dir + r'\target.npy') predicting_df = np.load(pkl_dir + r'\predict_df.npy') # Create a validation set to check training performance X_train, X_valid, y_train, y_valid = train_test_split(train_df, target, test_size=0.1, random_state=seed, stratify=target[:, 0]) y_train = y_train[:, 0] tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, scoring='roc_auc', n_jobs=-1,
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from kaggleProjects.DefaultRisk.H2oRandSearch.Search_Configurations import config
from kaggleProjects.directory_table import get_paths
from kaggleProjects.logger_factory import get_logger


def load_models_from_dir(saved_models_dir):
    """Load every saved H2O model found under *saved_models_dir*.

    Each immediate sub-directory is expected to hold exactly one saved
    model file, which is loaded with ``h2o.load_model``. Returns the
    loaded models as a list.
    """
    loaded = []
    for sub_name in listdir(saved_models_dir):
        model_path = saved_models_dir + '/' + sub_name + '/'
        # First (and only expected) entry inside the model's directory.
        loaded.append(h2o.load_model(model_path + listdir(model_path)[0]))
    return loaded


# Import directories
station_paths = get_paths(station=config.WORK_STATION)
pkl_dir, submit_dir = station_paths['pkl_dir'], station_paths['submissions']
h2o_rand_dir, log_dir = station_paths['h2o_rand_search'], station_paths['logs']

# Initiate logger
logger = get_logger('ensemble', log_dir)

# Boot the cluster, then pull in every model produced by the random search.
h2o.init(**config.H2O_INIT_SETTINGS)
ensemble_models = load_models_from_dir(h2o_rand_dir)

# Load data
meta = pd.read_pickle(pkl_dir + '/meta_df.pkl')
logger.info("Started new H2o session " + str(h2o.cluster().cloud_name))
credit_data = h2o.upload_file(pkl_dir + "/train_imp_na_df.csv")
predict_me = h2o.upload_file(pkl_dir + '/test_imp_na_df.csv')
logger.info("Loaded data into cluster")