def get_automl(project_name):
    """
    Retrieve information about an AutoML instance.

    :param str project_name: A string indicating the project_name of the automl instance to retrieve.
    :returns: A dictionary with the keys ``project_name``, ``leader`` and ``leaderboard``. ``leader`` is
        ``None`` when the leaderboard returned by the server lists no models.
    """
    automl_json = h2o.api("GET /99/AutoML/%s" % project_name)
    project_name = automl_json["project_name"]
    model_ids = [key["name"] for key in automl_json['leaderboard']['models']]

    # The first leaderboard entry is the leader. With an empty leaderboard there is no model id to
    # look up, so skip the h2o.get_model call (which expects a model id string) and report None.
    leader = h2o.get_model(model_ids[0]) if model_ids else None

    # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
    # Whatever happens while building the frame, the user's original progress setting is restored.
    is_progress = H2OJob.__PROGRESS_BAR__
    h2o.no_progress()
    try:
        # Parse leaderboard H2OTwoDimTable & return as an H2OFrame
        leaderboard_table = automl_json["leaderboard_table"]
        leaderboard = h2o.H2OFrame(
            leaderboard_table.cell_values,
            column_names=leaderboard_table.col_header)
    finally:
        if is_progress is True:
            h2o.show_progress()

    # Keep everything from index 1 onward (presumably drops the table's leading index column -- TODO confirm).
    leaderboard = leaderboard[1:]
    return {'project_name': project_name, "leader": leader, "leaderboard": leaderboard}
def _fetch(self):
    """
    Refresh ``self._leader_id`` and ``self._leaderboard`` from the AutoML REST endpoint.

    The endpoint is addressed with ``self.project_name``.

    :returns: ``True`` when the leaderboard lists at least one model, ``False`` otherwise.
    """
    response = h2o.api("GET /99/AutoML/" + self.project_name)
    model_names = [entry["name"] for entry in response['leaderboard']['models']]
    # The first leaderboard entry is treated as the leader; no entries means no leader yet.
    self._leader_id = model_names[0] if model_names is not None and len(model_names) > 0 else None

    # Silence the progress bar while the leaderboard frame is built (several bars at once confuse users),
    # then put the user's own setting back even if building the frame fails.
    progress_was_on = H2OJob.__PROGRESS_BAR__
    h2o.no_progress()
    try:
        # Turn the leaderboard H2OTwoDimTable into an H2OFrame
        table = response["leaderboard_table"]
        frame = h2o.H2OFrame(table.cell_values, column_names=table.col_header)
    finally:
        if progress_was_on is True:
            h2o.show_progress()

    self._leaderboard = frame[1:]
    return self._leader_id is not None
def _fetch(self):
    """
    Refresh ``self._leader_id`` and ``self._leaderboard`` from the AutoML REST endpoint.

    The endpoint is addressed with ``self._automl_key``.

    :returns: ``True`` when the leaderboard lists at least one model, ``False`` otherwise.
    """
    response = h2o.api("GET /99/AutoML/" + self._automl_key)
    model_names = [entry["name"] for entry in response['leaderboard']['models']]
    # The first leaderboard entry is treated as the leader; no entries means no leader yet.
    self._leader_id = model_names[0] if model_names is not None and len(model_names) > 0 else None

    # Silence the progress bar while the leaderboard frame is built (several bars at once confuse users),
    # then put the user's own setting back even if building the frame fails.
    progress_was_on = H2OJob.__PROGRESS_BAR__
    h2o.no_progress()
    try:
        # Turn the leaderboard H2OTwoDimTable into an H2OFrame
        table = response["leaderboard_table"]
        frame = h2o.H2OFrame(table.cell_values, column_names=table.col_header)
    finally:
        if progress_was_on is True:
            h2o.show_progress()

    self._leaderboard = frame[1:]
    return self._leader_id is not None
def _load_model(path, init=False):
    """
    Load the H2O model described by the ``h2o.yaml`` file found in ``path``.

    :param path: Directory holding ``h2o.yaml`` and the model file it names under ``model_file``.
    :param init: When true, call ``h2o.init`` with the optional ``init`` mapping from the YAML
        file and turn off H2O progress bars before loading.
    :returns: Whatever ``h2o.load_model`` returns for the model file.
    """
    model_dir = os.path.abspath(path)
    with open(os.path.join(model_dir, "h2o.yaml")) as conf_file:
        settings = yaml.safe_load(conf_file.read())

    if init:
        # The "init" section is optional; fall back to h2o.init() defaults without it.
        init_kwargs = {}
        if "init" in settings:
            init_kwargs = settings["init"]
        h2o.init(**init_kwargs)
        h2o.no_progress()

    model_file = os.path.join(model_dir, settings['model_file'])
    return h2o.load_model(model_file)
def __init__(self, song_df):
    """
    Start H2O, keep a sorted copy of the songs and prepare an (untrained) random forest.

    :param song_df: Table of songs. It is sorted by ``song_id`` and must offer ``sort_values`` and
        ``drop`` (presumably a pandas DataFrame -- TODO confirm) as well as the columns
        ``Song``, ``Artist`` and ``Album``, which are left out of the H2O frame.
    """
    h2o.init()
    h2o.no_progress()

    sorted_songs = song_df.sort_values('song_id')
    self.song_df = sorted_songs

    # Text columns are dropped before the data is handed to H2O.
    text_columns = ['Song', 'Artist', 'Album']
    self.song_df_h2o = h2o.H2OFrame(sorted_songs.drop(text_columns, axis=1))

    # Column names stored as the predictor list.
    self.X = [
        'Genre',
        'mode',
        'tempo',
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'speechiness',
        'valence',
    ]
    forest_settings = {'ntrees': 200, 'min_rows': 3}
    self.classifier = H2ORandomForestEstimator(**forest_settings)
def __init__(self,
             ip: str = '',
             port: str = '',
             settings_file_name: str = 'settings.ini'):
    """Read the settings file, copy its MAIN section onto the instance and connect to H2O.

    Every option of the ``MAIN`` section becomes an instance attribute of the same name.
    The connection then authenticates with ``self.login`` and ``self.password``, so those
    two names are expected to be available (presumably as MAIN options -- TODO confirm).

    :param ip: Address passed to ``h2o.connect``.
    :param port: Port passed to ``h2o.connect``.
    :param settings_file_name: Path of the INI file, read as UTF-8.
    """
    parser = configparser.ConfigParser()
    self.config = parser
    parser.read(settings_file_name, encoding='utf-8')

    for option_name, option_value in parser['MAIN'].items():
        setattr(self, option_name, option_value)

    credentials = (self.login, self.password)
    h2o.connect(ip=ip, port=port, auth=credentials, verbose=False)
    h2o.no_progress()
def h2ono_progress():
    """
    Python API test: h2o.no_progress()

    The real verification is done by a person reading the pyunit output file and checking that no
    progress bars show up. Here the command is considered fine when nothing at all is written to
    stdout while the test body runs.
    """
    try:
        # Capturing stdout this way only works with Python 3.
        captured = StringIO()
        sys.stdout = captured
        h2o.no_progress()
        run_test()
        # With progress display switched off nothing should have reached stdout.
        assert not captured.getvalue(), \
            "Nothing should have been printed, instead got " + captured.getvalue()
    finally:
        # Always hand the real stdout back.
        sys.stdout = sys.__stdout__
def h2ono_progress():
    """
    Python API test: h2o.no_progress()

    The real verification is done by a person reading the pyunit output file and checking that no
    progress bars show up. Here the command is considered fine when nothing is written to stdout
    while the test body runs.

    :raises AssertionError: if anything was written to stdout while stdout was redirected.
    """
    captured = StringIO()
    sys.stdout = captured  # redirect output
    try:
        h2o.no_progress()
        run_test()
    except Exception:
        # Deliberate best effort kept from the original test: the body may raise on Python 2,
        # and in that case the test still only checks that nothing was printed.
        pass
    finally:
        # Single place where the real stdout is restored, whatever happened above.
        sys.stdout = sys.__stdout__

    # getvalue() exists on both the Python 2 and Python 3 StringIO classes; the previously used
    # ``buf`` attribute does not exist on io.StringIO, so a failure surfaced as AttributeError.
    printed = captured.getvalue()
    assert printed == "", "Nothing should have been printed, instead got " + printed
def load(
    tag: t.Union[str, Tag],
    init_params: t.Optional[t.Dict[str, t.Any]] = None,
    model_store: "ModelStore" = Provide[BentoMLContainer.model_store],
) -> h2o.model.model_base.ModelBase:
    """
    Initialize H2O and load a saved model from the BentoML local modelstore.

    Args:
        tag (:code:`Union[str, Tag]`):
            Tag of a saved model in BentoML local modelstore.
        init_params (:code:`Dict[str, Union[str, Any]]`, `optional`, defaults to `None`):
            Keyword arguments forwarded to :code:`h2o.init`. A missing or empty value means
            :code:`h2o.init()` is called without arguments.
        model_store (:mod:`~bentoml._internal.models.store.ModelStore`, default to :mod:`BentoMLContainer.model_store`):
            BentoML modelstore, provided by DI Container.

    Returns:
        :obj:`h2o.model.model_base.ModelBase`: the model returned by :code:`h2o.load_model`
        for the path stored under the save namespace.

    Raises:
        BentoMLException: if the stored model was saved by a different module.

    Examples:

    .. code-block:: python

        import bentoml

        model = bentoml.h2o.load(tag, init_params=dict(port=54323))
    """  # noqa
    h2o.init(**(init_params or dict()))

    model = model_store.get(tag)
    accepted_modules = (MODULE_NAME, __name__)
    if model.info.module not in accepted_modules:
        raise BentoMLException(
            f"Model {tag} was saved with module {model.info.module}, failed loading with {MODULE_NAME}."
        )

    saved_path = model.path_of(SAVE_NAMESPACE)
    h2o.no_progress()
    return h2o.load_model(saved_path)
def _load_model(path, init=False):
    """
    Load the H2O model described by the ``h2o.yaml`` file found in ``path``.

    ``h2o.upload_model`` is used when the installed H2O offers it; otherwise a warning is issued
    and ``h2o.load_model`` is used instead.

    :param path: Directory holding ``h2o.yaml`` and the model file it names under ``model_file``.
    :param init: When true, call ``h2o.init`` with the optional ``init`` mapping from the YAML
        file and turn off H2O progress bars before loading.
    :returns: The loaded model object.
    """
    import h2o

    model_dir = os.path.abspath(path)
    with open(os.path.join(model_dir, "h2o.yaml")) as conf_file:
        settings = yaml.safe_load(conf_file.read())

    if init:
        # The "init" section is optional; fall back to h2o.init() defaults without it.
        init_kwargs = {}
        if "init" in settings:
            init_kwargs = settings["init"]
        h2o.init(**init_kwargs)
        h2o.no_progress()

    model_path = os.path.join(model_dir, settings["model_file"])

    if not hasattr(h2o, "upload_model"):
        # Older H2O: only load_model is available.
        warnings.warn(
            "If your cluster is remote, H2O may not load the model correctly. "
            "Please upgrade H2O version to a newer version")
        return h2o.load_model(model_path)

    return h2o.upload_model(model_path)
def _h2o_init(h2o_init_params):
    """
    Turn off H2O progress output and initialize H2O unless a cluster is already known.

    :param h2o_init_params: Keyword arguments for ``init``; ``None`` means no arguments.
    """
    no_progress()

    # ``cluster()`` returning something other than None means there is nothing left to do.
    if cluster() is not None:
        return

    init_kwargs = {} if h2o_init_params is None else h2o_init_params
    init(**init_kwargs)
def starth2o(h2oserver):
    """
    Initialize H2O at the given address, silence progress bars and clear the cluster.

    :param h2oserver: Value passed to ``h2o.init`` as ``ip``.
    """
    # Cap the memory at 28G; per the original note, all cores are used by default.
    init_settings = {"ip": h2oserver, "max_mem_size": "28G"}
    h2o.init(**init_settings)
    h2o.no_progress()
    # Start from a clean slate in case the cluster was already running.
    h2o.remove_all()
import seaborn as sns import h2o from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator from h2o.estimators.gbm import H2OGradientBoostingEstimator from h2o.estimators.random_forest import H2ORandomForestEstimator from h2o.grid.grid_search import H2OGridSearch from h2o.estimators.xgboost import H2OXGBoostEstimator from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator from h2o.estimators.glm import H2OGeneralizedLinearEstimator # import GLM models from h2o.grid.grid_search import H2OGridSearch #import xgboost as xgb h2o.init(max_mem_size='6G') # give h2o as much memory as possible h2o.no_progress() # turn off h2o progress bars # Definitions pd.set_option('display.float_format', lambda x: '%.3f' % x) #%matplotlib inline #njobs = 4 def get_type_lists(frame, rejects): """Creates lists of numeric and categorical variables. :param frame: The frame from which to determine types. :param rejects: Variable names not to be included in returned lists. :return: Tuple of lists for numeric and categorical variables in the frame. """ nums, cats = [], []
def makeDlModel(subOpt=None, xCol=None, yCol=None, inpData=None, modelKey=None):
    """
    Train an H2O AutoML model, or reload the newest saved one, and describe the outcome.

    A new model is trained when ``subOpt['isOverWrite']`` is truthy or when no saved model file
    matches the naming pattern for ``modelKey``; otherwise the first match in reverse-sorted
    order is imported. H2O is initialized once, tracked through ``subOpt['isInit']``, which this
    function sets to ``True``.

    :param subOpt: Mutable options mapping; the keys ``isInit`` and ``isOverWrite`` are read.
    :param xCol: Predictor column names; must offer ``copy()`` returning something with ``append``.
    :param yCol: Response column name.
    :param inpData: Table indexed with the list of predictor and response columns.
    :param modelKey: Identifier used inside the saved model file name.
    :returns: A dict with the keys ``msg``, ``dlModel``, ``saveModel`` and ``isExist``, or ``None``
        when any exception occurred (the exception is logged, not raised).
    """
    log.info('[START] {}'.format('makeDlModel'))

    result = None

    try:
        # Glob pattern matching every model file previously saved for this key.
        modelPattern = '{}/{}-{}-{}-{}-{}-{}.model'.format(
            globalVar['outPath'], serviceName, modelKey, 'final', 'h2o', 'act', '*')
        savedModels = sorted(glob.glob(modelPattern), reverse=True)

        # Work on predictors plus response without touching the caller's xCol.
        selectedCols = xCol.copy()
        selectedCols.append(yCol)
        modelData = inpData[selectedCols]

        if not subOpt['isInit']:
            h2o.init()
            h2o.no_progress()
            subOpt['isInit'] = True

        needTraining = (subOpt['isOverWrite']) or (len(savedModels) < 1)

        if not needTraining:
            # A saved model exists and overwriting was not requested: reuse it.
            saveModel = savedModels[0]
            log.info('[CHECK] saveModel : {}'.format(saveModel))
            fnlModel = h2o.import_mojo(saveModel)
        else:
            # No trained model available (or overwrite requested): split 7:3 into
            # training and validation data and run AutoML.
            trainData, validData = train_test_split(modelData, test_size=0.3)

            autoMl = H2OAutoML(max_models=20, max_runtime_secs=99999, balance_classes=True, seed=123)
            autoMl.train(
                x=xCol,
                y=yCol,
                training_frame=h2o.H2OFrame(trainData),
                validation_frame=h2o.H2OFrame(validData))
            fnlModel = autoMl.get_best_model()

            # Save the trained model under a file name stamped with today's date.
            saveModel = '{}/{}-{}-{}-{}-{}-{}.model'.format(
                globalVar['outPath'], serviceName, modelKey, 'final', 'h2o', 'act',
                datetime.now().strftime('%Y%m%d'))
            log.info('[CHECK] saveModel : {}'.format(saveModel))

            saveDir = os.path.dirname(saveModel)
            os.makedirs(saveDir, exist_ok=True)
            fnlModel.save_mojo(path=saveDir, filename=os.path.basename(saveModel), force=True)

        result = {
            'msg': 'succ',
            'dlModel': fnlModel,
            'saveModel': saveModel,
            'isExist': os.path.exists(saveModel)
        }

        return result

    except Exception as e:
        log.error('Exception : {}'.format(e))
        return result

    finally:
        # Always runs before the try/except construct is left.
        log.info('[END] {}'.format('makeDlModel'))
def __init__(self, training_frame=None, X=None, model=None, N=None,
             discretize=None, quantiles=None, seed=None, print_=None,
             top_n=None, intercept=None):
    """
    Store the explainer settings, apply defaults and switch off H2O progress bars.

    ``training_frame``, ``X`` and ``model`` are mandatory. Every other argument falls back to a
    default when it is ``None``: ``N`` -> 10000, ``discretize`` -> None, ``quantiles`` -> 4,
    ``seed`` -> 12345, ``print_`` -> True, ``top_n`` -> 5, ``intercept`` -> True.

    :raises ValueError: if ``training_frame``, ``X`` or ``model`` is ``None`` (checked in that order).
    """
    # mandatory arguments: reject a missing one before moving on to the next
    if training_frame is None:
        raise ValueError('Parameter training_frame must be defined.')
    self.training_frame = training_frame

    if X is None:
        raise ValueError('Parameter X must be defined.')
    self.X = X

    if model is None:
        raise ValueError('Parameter model must be defined.')
    self.model = model

    # optional arguments with their defaults
    self.N = 10000 if N is None else N
    self.discretize = discretize
    self.quantiles = 4 if quantiles is None else quantiles
    self.seed = 12345 if seed is None else seed
    self.print_ = True if print_ is None else print_
    self.top_n = 5 if top_n is None else top_n
    self.intercept = True if intercept is None else intercept

    # internal storage, empty until filled in elsewhere
    self.reason_code_values = None
    self.lime_r2 = None
    self.lime = None
    self.lime_pred = None
    self.bins_dict = {}

    # do not show h2o progress bars
    h2o.no_progress()
ntrees_opt = [5, 10, 15] max_depth_opt = [2, 3, 4] learn_rate_opt = [0.1, 0.2] hyper_parameters = {"ntrees": ntrees_opt, "max_depth":max_depth_opt, "learn_rate":learn_rate_opt} from h2o.grid.grid_search import H2OGridSearch gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters) gs.train(x=range(0,iris_df.ncol-1), y=iris_df.ncol-1, training_frame=iris_df, nfold=10) print gs.sort_by('logloss', increasing=True) # Pipeline from h2o.transforms.preprocessing import H2OScaler from sklearn.pipeline import Pipeline # Turn off h2o progress bars h2o.__PROGRESS_BAR__=False h2o.no_progress() # build transformation pipeline using sklearn's Pipeline and H2O transforms pipeline = Pipeline([("standardize", H2OScaler()), ("pca", H2OPCA(k=2)), ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))]) pipeline.fit(iris_df[:4],iris_df[4]) # Random CV using H2O and Scikit-learn from sklearn.grid_search import RandomizedSearchCV from h2o.cross_validation import H2OKFold from h2o.model.regression import h2o_r2_score from sklearn.metrics.scorer import make_scorer params = {"standardize__center": [True, False], # Parameters to test "standardize__scale": [True, False], "pca__k": [2,3],
Created on Feb 15, 2016 @author: molina ''' from h2o.estimators.gbm import H2OGradientBoostingEstimator from h2o.frame import H2OFrame import numpy import re from mb.modelbase import numpytoordereddict from ..mlutils.statistics import logpoissonpmf import h2o h2o.init() h2o.no_progress() class GBMPDN: def __init__(self, data, features, families="poisson", max_depth=10, iterations=1): self.data = data self.nD = data.shape[0] self.nF = data.shape[1] self.config = {"max_depth": max_depth, "iterations": iterations}
def user_identification():
    """Run the user-identification benchmark over a growing number of users (classes).

    Settings are read from 'h2o_different_nb_cls.cfg' and 'h2o.cfg' through get_config.
    For every user count from min_users to max_users (inclusive, in increments of step)
    the function draws n_folds random user subsets, hands each fold to
    fold_prediction_result together with the list of classification types, and averages
    the returned metrics and timings per classification type. The averages are logged and
    plotted; the plots and a copy of the configuration are written to a timestamped
    "H2O_..." output folder.

    Returns nothing; all results are emitted through the logger and as files.
    """
    cfg = get_config('h2o_different_nb_cls.cfg')
    h2o_cfg = get_config('h2o.cfg')
    h2o.init(nthreads=h2o_cfg.getint('h2o', 'nthreads'),
             max_mem_size=h2o_cfg.get('h2o', 'max_mem'))
    h2o.no_progress()
    h2o_seed = h2o_cfg.getint('h2o', 'seed')
    logger.info('intrusion_detection_synthetic')
    folder = cfg.get('data', 'path')
    # The output folder lives under 'output_path' when configured, else next to the data.
    if cfg.has_option('data', 'output_path'):
        output_folder = cfg.get('data', 'output_path')
        out_folder = path.join(output_folder, "H2O_" + timestamp())
    else:
        out_folder = path.join(folder, "H2O_" + timestamp())
    ensure_dir(out_folder)
    # Keep a copy of the configuration used for this run alongside the results.
    with open(path.join(out_folder, 'config.cfg'), 'w') as f:
        cfg.write(f)
    # Seeds numpy's global RNG, which drives the np.random.choice calls below.
    np.random.seed(cfg.getint('misc', 'random_seed'))
    # ignored columns, by name or prefix
    if cfg.has_option('data', 'ignored_columns'):
        ignored_columns = cfg.get('data', 'ignored_columns').split(',')
    else:
        ignored_columns = []
    if cfg.has_option('data', 'ignore_columns_starting_with'):
        ignore_columns_starting_with = cfg.get(
            'data', 'ignore_columns_starting_with').split(',')
    else:
        ignore_columns_starting_with = []
    # Filled in below but never read again inside this function.
    ignored_columns_reason = dict()
    n_folds = cfg.getint('data', 'n_folds')
    min_users = cfg.getint('data', 'min_users')
    max_users = cfg.getint('data', 'max_users')
    step = cfg.getint('data', 'step')
    train_frame = cfg.get('data', 'train')
    test_frame = cfg.get('data', 'test')
    logger.info('Out folder: %s' % out_folder)
    # The 'clusters' sub-folder is created here; nothing else in this function uses it.
    cluster_dir = path.join(out_folder, 'clusters')
    ensure_dir(cluster_dir)
    # print and check features on train set
    train_df = pd.read_csv(path.join(folder, train_frame))
    test_df = pd.read_csv(path.join(folder, test_frame))
    important_features = [
        'pc', 'http_count', 'session_length', 'session_end_hour',
        'http_avg_duration', 'email_count', 'session_start_minute',
        'session_start_hour', 'user'
    ]
    if set(important_features).issubset(set(train_df.columns)):
        print('OK')
    # Restrict both sets to the selected feature columns (which include 'user').
    train_df = train_df[important_features]
    test_df = test_df[important_features]
    column_types = get_h2o_column_types(list(train_df))
    unique_users = np.unique(train_df['user'].unique())
    logger.info('total unique users: %d' % unique_users.shape[0])
    # Columns whose name starts with one of the configured prefixes are ignored as well.
    for c in column_types:
        if ignore_columns_starting_with and c.startswith(
                tuple(ignore_columns_starting_with)):
            ignored_columns.append(c)
            ignored_columns_reason[c] = 'ignored by prefix'
    # 'user' is the prediction target, so it must not remain among the input columns.
    ignored_columns.append('user')
    ignored_columns.append('is_anomaly')
    columns_to_keep = [
        i for i in list(train_df.columns) if i not in ignored_columns
    ]
    # End of configuration
    # Preparation of the variables
    classification_types = [
        "meta_binary_tree_classifier", "random", "huffman", "balanced-tree",
        "meta-binary-tree-encoding", "standard-classifier"
    ]
    # Template: one empty list per classification type; deep-copied wherever a fresh
    # accumulator is needed so the lists are never shared.
    standard_dictionary = {}
    for i in classification_types:
        standard_dictionary[i] = []
    # metrics_dict[metric][classification] collects one averaged value per user count.
    metrics_dict = {}
    for metric in METRICS:
        metrics_dict[metric] = deepcopy(standard_dictionary)
    training_time_dict = deepcopy(standard_dictionary)
    prediction_time_dict = deepcopy(standard_dictionary)
    number_of_users = [i for i in range(min_users, max_users + 1, step)]
    logger.info("number_of_users = " + str(number_of_users))
    for n_users in number_of_users:
        logger.info("____________________________________")
        logger.info("DATA FOR %d CLASSES" % n_users)
        # Per-user-count accumulators: one entry is appended per fold.
        n_metrics_dict = {}
        for metric in METRICS:
            n_metrics_dict[metric] = deepcopy(standard_dictionary)
        n_train_time = deepcopy(standard_dictionary)
        n_predict_time = deepcopy(standard_dictionary)
        rf = RandomForest(seed=h2o_seed,
                          ntrees=cfg.getint('random_forest', 'ntrees'),
                          max_depth=cfg.getint('random_forest', 'max_depth'),
                          categorical_encoding=cfg.get('random_forest',
                                                       'categorical_encoding'),
                          nbins_cats=cfg.getint('random_forest', 'nbins_cats'),
                          histogram_type=cfg.get('random_forest',
                                                 'histogram_type'))
        for i in range(n_folds):
            logger.info("++++++++++++++++++++++++++++++++++")
            logger.info("Fold %d" % (i + 1))
            # Draw n_users distinct users and keep only their rows in both sets.
            fold_users = np.random.choice(unique_users, n_users, replace=False)
            x_train_fold = train_df.loc[train_df['user'].isin(
                fold_users)].reset_index(drop=True)
            x_test_fold = test_df.loc[test_df['user'].isin(
                fold_users)].reset_index(drop=True)
            y_train = x_train_fold['user']
            y_test = x_test_fold['user']
            x_train_fold = x_train_fold[columns_to_keep]
            x_test_fold = x_test_fold[columns_to_keep]
            # Redraw the users until the test fold contains at least one row.
            while len(y_test) == 0:
                fold_users = np.random.choice(unique_users,
                                              n_users,
                                              replace=False)
                x_train_fold = train_df.loc[train_df['user'].isin(
                    fold_users)].reset_index(drop=True)
                x_test_fold = test_df.loc[test_df['user'].isin(
                    fold_users)].reset_index(drop=True)
                y_train = x_train_fold['user']
                y_test = x_test_fold['user']
                x_train_fold = x_train_fold[columns_to_keep]
                x_test_fold = x_test_fold[columns_to_keep]
            # The three results are indexed below as [metric][classification] and
            # [classification]; 'train' and 'test' are used as timings.
            temp_metrics_dict, train, test = fold_prediction_result(
                x_train_fold, y_train, x_test_fold, y_test,
                classification_types, rf)
            for classification in classification_types:
                for metric in METRICS:
                    n_metrics_dict[metric][classification].append(
                        temp_metrics_dict[metric][classification])
                n_train_time[classification].append(train[classification])
                n_predict_time[classification].append(test[classification])
            # Clear the H2O cluster's objects before the next fold.
            h2o.remove_all()
        # Average the fold results for this user count and append them to the run totals.
        for classification in classification_types:
            logger.info("***___***___***___***___***___")
            logger.info("Average data for %s" % classification)
            for metric in METRICS:
                avg = np.average(n_metrics_dict[metric][classification])
                metrics_dict[metric][classification].append(avg)
                logger.info("Average %s for %s for %d users: %2.2f" %
                            (metric, classification, n_users, avg))
            avg = np.average(n_train_time[classification])
            training_time_dict[classification].append(avg)
            # The average is logged as whole minutes and remaining seconds.
            logger.info(
                "Average training time for %s for %d users : %d min %d s " %
                (classification, n_users, avg // 60, avg % 60))
            avg = np.average(n_predict_time[classification])
            prediction_time_dict[classification].append(avg)
            logger.info(
                "Average prediction time for %s for %d users : %d min %d s " %
                (classification, n_users, avg // 60, avg % 60))
    for metric in METRICS:
        logger.info("%s = %s" % (metric, metrics_dict[metric]))
    logger.info("training_time = " + str(training_time_dict))
    logger.info("prediction_time = " + str(prediction_time_dict))
    # Timings are added to metrics_dict so they are plotted like any other score.
    metrics_dict["training time (seconds)"] = training_time_dict
    metrics_dict["prediction time (seconds)"] = prediction_time_dict
    # Plot generation
    # One figure per classification type, one subplot per score, laid out in two columns.
    for classification in classification_types:
        fig = plt.figure(figsize=(15, 30))
        for i, score in enumerate(metrics_dict.keys()):
            ax = fig.add_subplot(len(set(metrics_dict)) // 2 + 1, 2, i + 1)
            ax.plot(number_of_users, metrics_dict[score][classification])
            plt.title("Classification %s for %s" % (score, classification))
            plt.xlabel("Number of classes")
            plt.ylabel(score)
        plt.savefig(path.join(out_folder, classification))
        plt.close(fig)
    # One combined figure: every classification type drawn on each score's subplot.
    fig = plt.figure(figsize=(15, 30))
    for i, score in enumerate(metrics_dict.keys()):
        ax = fig.add_subplot(len(set(metrics_dict)) // 2 + 1, 2, i + 1)
        for classification in classification_types:
            ax.plot(number_of_users, metrics_dict[score][classification])
        plt.title("Classification %s" % score)
        plt.legend(classification_types, loc="upper right")
        plt.xlabel("Number of classes")
        plt.ylabel(score)
    plt.savefig(path.join(out_folder, "all"))
    plt.close(fig)