Code Example #1
from importlib import import_module

import h2o
import pandas as pd

from kaggleProjects.directory_table import get_paths
from kaggleProjects.logger_factory import get_logger

def run(config_path, work_station):
    config = import_module(config_path)

    # Resolve directory paths for this workstation
    paths = get_paths(station=work_station)
    data_dir, pkl_dir = paths['data_dir'], paths['pkl_dir']
    h2o_rand_dir, log_dir = config.SAVE_DIR, paths['logs']
    # Get new logger
    logger = get_logger('H2oRandSearch', log_dir)

    meta = pd.read_pickle(pkl_dir + '/meta_df.pkl')
    h2o.init(**config.H2O_INIT_SETTINGS)
    logger.info("Started new H2o session " + str(h2o.cluster().cloud_name))
    credit_data = h2o.upload_file(pkl_dir + "/train_imp_na_df.csv")
    logger.info("Loaded data into cluster")
    # Predictors are every column except the target and the metadata columns
    X = set(credit_data.columns) - {'TARGET'} - set(meta.columns)
    Y = 'TARGET'
    # Encode the target as a factor so H2O treats this as classification
    credit_data[Y] = credit_data[Y].asfactor()

    data_info = {
        'X': X,
        'Y': Y,
        'training_frame': credit_data,
        'model_directory': h2o_rand_dir,
        'logger': logger,
        'configuration': config
    }
    del meta

    if config.INCLUDE_GBM:
        config.GBM_SETTINGS.update(data_info)
        random_h2o_model_search(**config.GBM_SETTINGS)
    if config.INCLUDE_XGB:
        config.XGB_SETTINGS.update(data_info)
        random_h2o_model_search(**config.XGB_SETTINGS)
    if config.INCLUDE_DEEP:
        config.DEEP_SETTINGS.update(data_info)
        random_h2o_model_search(**config.DEEP_SETTINGS)
    if config.INCLUDE_RF:
        config.RF_SETTINGS.update(data_info)
        random_h2o_model_search(**config.RF_SETTINGS)
    if config.INCLUDE_NAIVE_BAYES:
        config.NAI_BAYES_SETTINGS.update(data_info)
        random_h2o_model_search(**config.NAI_BAYES_SETTINGS)
    if config.INCLUDE_GLM:
        config.GLM_SETTINGS.update(data_info)
        random_h2o_model_search(**config.GLM_SETTINGS)
    logger.info("Completed search. Shutting down cluster " + str(h2o.cluster().cloud_name))
    h2o.cluster().shutdown()
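
# A minimal usage sketch, assuming the script is run directly. The dotted
# module path matches the config module imported elsewhere in this project;
# the station name ('Subgraph') is the one used in the TPOT script below and
# may differ on other machines.
if __name__ == '__main__':
    run('kaggleProjects.DefaultRisk.H2oRandSearch.Search_Configurations.config',
        work_station='Subgraph')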
Code Example #2
"""
TPOT is built on top of several existing Python libraries, including:
NumPy, SciPy, scikit-learn, DEAP,update_checker, tqdm, stopit, pandas
and xgboost.
"""
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import numpy as np
from kaggleProjects.directory_table import get_paths

seed = 321

paths = get_paths(station='Subgraph')
data_dir, pkl_dir = paths['data_dir'], paths['pkl_dir']
train_df = np.load(pkl_dir + r'\train_df.npy')
target = np.load(pkl_dir + r'\target.npy')
predicting_df = np.load(pkl_dir + r'\predict_df.npy')

# Create a validation set to check training performance
X_train, X_valid, y_train, y_valid = train_test_split(train_df,
                                                      target,
                                                      test_size=0.1,
                                                      random_state=seed,
                                                      stratify=target[:, 0])

y_train = y_train[:, 0]
tpot = TPOTClassifier(generations=5,
                      population_size=20,
                      verbosity=2,
                      scoring='roc_auc',
                      n_jobs=-1,
                      # The original snippet is truncated here; random_state is
                      # an assumed final argument to make the call complete.
                      random_state=seed)
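
# A minimal sketch of how the search would typically be finished, assuming the
# standard TPOT workflow: fit on the training split, score on the held-out
# validation split, and export the best pipeline. The output file name is
# illustrative, not from the source.
tpot.fit(X_train, y_train)
print(tpot.score(X_valid, y_valid[:, 0]))
tpot.export('tpot_credit_pipeline.py')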
Code Example #3
from os import listdir

import h2o
import pandas as pd
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from kaggleProjects.DefaultRisk.H2oRandSearch.Search_Configurations import config
from kaggleProjects.directory_table import get_paths
from kaggleProjects.logger_factory import get_logger


def load_models_from_dir(saved_models_dir):
    # Each subdirectory holds a single saved H2O model; load the first file
    # found in each and collect the resulting model objects.
    models = []
    for model_dir in listdir(saved_models_dir):
        directory = saved_models_dir + '/' + model_dir + '/'
        models.append(h2o.load_model(directory + listdir(directory)[0]))
    return models


# Resolve directory paths for this workstation
paths = get_paths(station=config.WORK_STATION)
pkl_dir, submit_dir = paths['pkl_dir'], paths['submissions']
h2o_rand_dir, log_dir = paths['h2o_rand_search'], paths['logs']
# Initialize logger
logger = get_logger('ensemble', log_dir)

h2o.init(**config.H2O_INIT_SETTINGS)

ensemble_models = load_models_from_dir(h2o_rand_dir)
# Load data
meta = pd.read_pickle(pkl_dir + '/meta_df.pkl')
logger.info("Started new H2o session " + str(h2o.cluster().cloud_name))
credit_data = h2o.upload_file(pkl_dir + "/train_imp_na_df.csv")
predict_me = h2o.upload_file(pkl_dir + '/test_imp_na_df.csv')
logger.info("Loaded data into cluster")