def pca_arrests():
    """Check that every PCA implementation produces the same eigenvectors.

    A k=4 PCA is trained on the USArrests data once per ``pca_impl`` value.
    The eigenvector table of the first implementation serves as the reference
    and each later table is compared against it with
    ``pyunit_utils.assert_H2OTwoDimTable_equal``.
    """
    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print(
        "Testing to see whether the trained PCA are essentially the same using different implementation..."
    )

    implementations = (
        "MTJ_EVD_DENSEMATRIX",
        "MTJ_EVD_SYMMMATRIX",
        "MTJ_SVD_DENSEMATRIX",
        "JAMA",
    )
    reference = None
    for impl in implementations:
        print("Run PCA with implementation: " + impl)
        pca = H2OPCA(k=4, pca_impl=impl, seed=1234)
        pca.train(x=list(range(4)), training_frame=arrests)

        output = pca._model_json["output"]
        current = output["eigenvectors"]
        if reference is None:
            # First implementation: nothing to compare against yet.
            reference = current
            continue

        # Compare to see if they are fundamentally the same
        pyunit_utils.assert_H2OTwoDimTable_equal(
            reference,
            current,
            output["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
def glrm_arrests():
    """Compare the importance tables of PCA and GLRM on standardized USArrests.

    Both models are trained with k=4 and ``transform="STANDARDIZE"``; the GLRM
    uses quadratic loss, no regularization weight, SVD initialization and
    ``recover_svd=True``.  The raw cell values of the two importance tables
    must agree within 1e-4.
    """
    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    pca_model = H2OPCA(k=4, transform="STANDARDIZE")
    pca_model.train(x=list(range(4)), training_frame=arrests)
    pca_model.summary()
    pca_model.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_model = H2OGeneralizedLowRankEstimator(
        k=4,
        transform="STANDARDIZE",
        loss="Quadratic",
        gamma_x=0,
        gamma_y=0,
        init="SVD",
        recover_svd=True)
    glrm_model.train(x=arrests.names, training_frame=arrests)
    glrm_model.show()

    # compare table values and make sure they are the same between PCA and GLRM
    pca_cells = pca_model._model_json["output"]["importance"]._cell_values
    glrm_cells = glrm_model._model_json["output"]["importance"]._cell_values
    tables_match = pyunit_utils.equal_2d_tables(pca_cells, glrm_cells, tolerance=1e-4)
    assert tables_match, "PCA and GLRM variance metrics do not agree. Fix it please."

    sys.stdout.flush()
def screeplot_test():
    """Train a standardized k=4 PCA on AustraliaCoast and draw both scree plot types.

    ``server=True`` is forwarded to ``screeplot`` for the "barplot" and the
    "lines" variants.
    """
    plot_options = {'server': True}

    coast = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/AustraliaCoast.csv"))
    coast_pca = H2OPCA(k=4, transform="STANDARDIZE")
    coast_pca.train(x=list(range(8)), training_frame=coast)

    for plot_type in ("barplot", "lines"):
        coast_pca.screeplot(type=plot_type, **plot_options)
def pca_pubdev_4167_OOM():
    """
    Verify PCA on a customer dataset by comparing the GramSVD and Power methods.

    This pyunit is meant to be run by the customer to verify PCA operations; it
    is not a regular test because the customer data is not to be exposed.  A
    transform is picked at random, both models use every column with
    k = number of columns, and their importance and eigenvector tables are
    compared.
    """
    h2o.remove_all()

    # make sure we check all tranforms
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    # Nidhi: import may not work
    data_path = pyunit_utils.locate(
        "/Users/wendycwong/gitBackup/SDatasets/pubdev_4167_Avkash/m120K.tar")
    training_data = h2o.import_file(path=data_path)

    gram_model = H2OPCA(k=training_data.ncols, transform=chosen_transform)
    gram_model.train(x=list(range(training_data.ncols)),
                     training_frame=training_data)

    power_model = H2OPCA(k=training_data.ncols,
                         transform=chosen_transform,
                         pca_method="Power")
    power_model.train(x=list(range(training_data.ncols)),
                      training_frame=training_data)

    gram_output = gram_model._model_json["output"]
    power_output = power_model._model_json["output"]

    # compare singular values and stuff between power and GramSVD methods
    print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
    importance_headers = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_output["importance"],
                                             power_output["importance"],
                                             importance_headers,
                                             tolerance=1e-5,
                                             check_all=False)

    # compare singular vectors
    print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gram_output["eigenvectors"],
                                             power_output["eigenvectors"],
                                             power_output["names"],
                                             tolerance=1e-1,
                                             check_sign=True)
def pca_prostate():
    """Check that Power PCA with k=1 predicts the same as Power PCA with k unset.

    Four prostate columns are converted to factors, two models are trained on
    columns 2..8 with the same seed, and their predictions on the training
    frame are compared with ``pyunit_utils.compare_frames_local``.
    """
    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column] = prostate[column].asfactor()
    prostate.describe()

    predictors = list(range(2, 9))

    explicit_k = H2OPCA(k=1, transform="NONE", pca_method="Power", seed=1234)
    explicit_k.train(x=predictors, training_frame=prostate)

    default_k = H2OPCA(transform="NONE", pca_method="Power", seed=1234)
    default_k.train(x=list(range(2, 9)), training_frame=prostate)

    explicit_pred = explicit_k.predict(prostate)
    default_pred = default_k.predict(prostate)
    pyunit_utils.compare_frames_local(explicit_pred, default_pred, prob=1, tol=1e-10)
def pca_max_k():
    """Check PCA with ``k=-1`` for the GramSVD, Power, Randomized and GLRM methods.

    For GramSVD, Power and GLRM the number of value columns in the first row
    of the importance table must equal the actual ``k`` reported in the
    model's ``full_parameters``.  Importance tables are also compared between
    GramSVD/Power and Randomized/Power.

    The failure messages are built with ``str.format``: the compared counts
    are numbers, and concatenating them to a string with ``+`` would raise
    ``TypeError`` instead of reporting the assertion failure.
    """
    data = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"))
    x = list(set(data.names))
    importance_headers = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]
    mismatch_msg = "PCA {0} FAIL: expected number of eigenvalues: {1}, actual: {2}."

    pcaGramSVD = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GramSVD", impute_missing=True,
                        max_iterations=100)
    pcaGramSVD.train(x, training_frame=data)

    pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True,
                      max_iterations=100, seed=12345)
    pcaPower.train(x, training_frame=data)

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaGramSVD._model_json["output"]["importance"],
                                             pcaPower._model_json["output"]["importance"],
                                             importance_headers, tolerance=1)

    correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
    # Row 0 holds a label followed by one value per component, hence the -1.
    gramSVDNum = len(pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
    powerNum = len(pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == gramSVDNum, mismatch_msg.format("GramSVD", correctEigNum, gramSVDNum)
    assert correctEigNum == powerNum, mismatch_msg.format("Power", correctEigNum, powerNum)

    pcaRandomized = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized", impute_missing=True,
                           max_iterations=100, seed=12345)
    pcaRandomized.train(x, training_frame=data)

    # eigenvalues between the PCA and Randomize should be close, I hope...
    print("@@@@@@ Comparing eigenvalues between Randomized and Power PCA...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomized._model_json["output"]["importance"],
                                             pcaPower._model_json["output"]["importance"],
                                             importance_headers)

    pcaGLRM = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True,
                     max_iterations=100, seed=12345)
    pcaGLRM.train(x, training_frame=data)

    correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
    glrmNum = len(pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
    assert correctEigNum == glrmNum, mismatch_msg.format("GLRM", correctEigNum, glrmNum)
def pca_wideDataset_rotterdam_pcapower():
    """Compare GramSVD PCA with Power PCA on the Rotterdam dataset.

    Both models use k=8, imputation of missing values, the STANDARDIZE
    transform and the same seed; every column except "relapse" is a
    predictor.  Importance tables are compared at 1e-6 and eigenvector tables
    at 2e-5.
    """
    eigenvector_tol = 2e-5
    h2o.remove_all()

    print("Importing Rotterdam.csv data...")
    rotterdam = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    excluded = set(["relapse"])
    predictors = list(set(rotterdam.names) - excluded)

    print("------ Testing Power PCA --------")
    gram_model = H2OPCA(k=8,
                        impute_missing=True,
                        transform="STANDARDIZE",
                        seed=12345)
    gram_model.train(x=predictors, training_frame=rotterdam)

    power_model = H2OPCA(k=8,
                         impute_missing=True,
                         transform="STANDARDIZE",
                         pca_method="Power",
                         seed=12345)
    power_model.train(x=predictors, training_frame=rotterdam)

    gram_output = gram_model._model_json["output"]
    power_output = power_model._model_json["output"]

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_output["importance"],
        power_output["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-6,
        check_all=False)

    # compare singular vectors
    print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_output["eigenvectors"],
        power_output["eigenvectors"],
        power_output["names"],
        tolerance=eigenvector_tol,
        check_sign=True,
        check_all=False)
def pca_arrests():
    """Train PCA on the USArrests data for every component count from 1 to 4.

    The frame is uploaded and described, then one ``H2OPCA`` model per ``k``
    is trained on the first four columns.  Nothing is asserted; the test
    passes when every model trains without raising.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    for k in range(1, 5):
        # Log the same k that is handed to H2OPCA below.
        print("H2O PCA with " + str(k) + " dimensions:\n")
        print("Using these columns: {0}".format(arrestsH2O.names))
        pca_h2o = H2OPCA(k=k)
        pca_h2o.train(x=list(range(4)), training_frame=arrestsH2O)
def pca_pubdev_4314():
    """Check the return types of ``varimp`` for a GramSVD PCA model.

    A k=3 PCA is trained on the first eight columns of prostate_cat.csv.
    ``varimp(use_pandas=True)`` must be a ``DataFrame`` and ``varimp()`` must
    be a ``list``.
    """
    print("Importing prostate_cat.csv data...\n")
    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    prostate.describe()

    print("PCA with k = 3, retx = FALSE, transform = 'STANDARDIZE'")
    pca_model = H2OPCA(k=3, transform="StANDARDIZE", pca_method="GramSVD")
    pca_model.train(x=list(range(8)), training_frame=prostate)
    print(pca_model.summary())

    varimp_frame = pca_model.varimp(use_pandas=True)
    assert_is_type(varimp_frame, DataFrame)

    varimp_rows = pca_model.varimp()
    print(varimp_rows)
    assert_is_type(varimp_rows, list)

    sys.stdout.flush()
def pca_wideDataset_rotterdam_glrm():
    """Compare GramSVD PCA with a GLRM model on the Rotterdam dataset.

    Both models use k=8, the DEMEAN transform and the same seed; every column
    except "relapse" is a predictor.  The GLRM is randomly initialized, has no
    regularization, recovers the SVD and runs at most 11 iterations.
    Importance tables are compared with tolerance 1 and eigenvector tables
    with tolerance 2e-5.
    """
    eigenvector_tol = 2e-5
    h2o.remove_all()

    print("Importing Rotterdam.csv data...")
    rotterdam = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    excluded = set(["relapse"])
    predictors = list(set(rotterdam.names) - excluded)

    # special test with GLRM. Need use_all_levels to be true
    print("------ Testing GLRM PCA --------")
    gram_model = H2OPCA(k=8,
                        impute_missing=True,
                        transform="DEMEAN",
                        seed=12345,
                        use_all_factor_levels=True)
    gram_model.train(x=predictors, training_frame=rotterdam)

    glrm_model = H2OGeneralizedLowRankEstimator(k=8,
                                                transform="DEMEAN",
                                                seed=12345,
                                                init="Random",
                                                recover_svd=True,
                                                regularization_x="None",
                                                regularization_y="None",
                                                max_iterations=11)
    glrm_model.train(x=predictors, training_frame=rotterdam)

    gram_output = gram_model._model_json["output"]
    glrm_output = glrm_model._model_json["output"]

    # compare singular values and stuff with GramSVD
    print(
        "@@@@@@ Comparing eigenvectors and eigenvalues between GramSVD and GLRM...\n"
    )
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_output["importance"],
        glrm_output["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1,
        check_all=False)

    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_output["eigenvectors"],
        glrm_output["eigenvectors"],
        glrm_output["names"],
        tolerance=eigenvector_tol,
        check_sign=True,
        check_all=False)
def pca_scoring():
    """Train a DEMEAN k=4 PCA on USArrests and score the training frame with it.

    Nothing is asserted; the test passes when training, ``predict`` and
    ``head`` all complete without raising.
    """
    print("Importing arrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Run PCA with transform = 'DEMEAN'")
    pca_model = H2OPCA(k=4, transform="DEMEAN")
    pca_model.train(x=list(range(4)), training_frame=arrests)
    # TODO: fitH2O.show()

    print("Project training data into eigenvector subspace")
    projection = pca_model.predict(arrests)
    print("H2O Projection:")
    projection.head()
def glrm_iris():
    """Compare PCA run with ``pca_method="GLRM"`` against a GLRM estimator on iris.

    Both models use k=5, the STANDARDIZE transform and seed 21.  Their
    importance and eigenvector tables must agree within 1e-6, and the value at
    ``cell_values[1][1]`` of the PCA importance table must not exceed 1.
    """
    print("Importing iris.csv data...")
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris.describe()

    print("@@@@@@ Building PCA with GramSVD...\n")
    pca_model = H2OPCA(k=5,
                       transform="STANDARDIZE",
                       pca_method="GLRM",
                       use_all_factor_levels=True,
                       seed=21)
    pca_model.train(x=iris.names, training_frame=iris)

    glrm_model = H2OGeneralizedLowRankEstimator(k=5,
                                                loss="Quadratic",
                                                transform="STANDARDIZE",
                                                recover_svd=True,
                                                seed=21)
    glrm_model.train(x=iris.names, training_frame=iris)

    pca_output = pca_model._model_json["output"]
    glrm_output = glrm_model._model_json["output"]

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_output["importance"],
        glrm_output["importance"],
        ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
        tolerance=1e-6)

    # compare singular vectors
    print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(
        pca_output["eigenvectors"],
        glrm_output["eigenvectors"],
        glrm_output["names"],
        tolerance=1e-6,
        check_sign=True)

    # check to make sure maximum proportional variance <= 1
    top_proportion = pca_model._model_json["output"]["importance"].cell_values[1][1]
    assert top_proportion <= 1, \
        "Expected value <= 1.0 but received {0}".format(top_proportion)
def pca_prostate():
    """Train a k=3 Power PCA on the prostate data and project the frame with it.

    Four columns are converted to factors first.  Nothing is asserted; the
    test passes when training, ``predict`` and ``head`` complete without
    raising.
    """
    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column] = prostate[column].asfactor()
    prostate.describe()

    # NOTE(review): the message below mentions STANDARDIZE while the model is
    # built with transform="NONE" - confirm which one is intended.
    print(
        "PCA on columns 3 to 9 with k = 3, retx = FALSE, transform = 'STANDARDIZE'"
    )
    pca_model = H2OPCA(k=3, transform="NONE", pca_method="Power")
    pca_model.train(x=list(range(2, 9)), training_frame=prostate)

    projection = pca_model.predict(prostate)
    print("Projection matrix:\n")
    projection.head()
def algo_max_runtime_secs():
    '''
    Run ``grabRuntimeInfo`` on one model per algorithm and exit with status 1
    when the module-level ``model_within_max_runtime`` sums to more than zero.

    The original intent, per PUBDEV-4802, is to ensure that the various models
    do not crash when max_runtime_secs is set too short.
    '''
    global model_within_max_runtime
    seed = 12345

    # word2vec
    text_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                                 header=1,
                                 col_types=["string"])
    text_subset = text_frame[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(w2v_model, text_subset, [], 0)
    cleanUp([text_frame, text_subset, w2v_model])

    # kmeans
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    predictors = list(range(frame.ncol))
    estimator = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([frame, estimator])

    # PCA with pca_method = Power, Randomized and GLRM on the same frame
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    predictors = list(range(frame.ncol))
    pca_variants = (
        ("Power", {}),
        ("Randomized", {}),
        ("GLRM", {"use_all_factor_levels": True}),
    )
    for method, extra_args in pca_variants:
        estimator = H2OPCA(k=10,
                           transform="STANDARDIZE",
                           pca_method=method,
                           compute_metrics=True,
                           **extra_args)
        grabRuntimeInfo(estimator, frame, predictors)
        cleanUp([estimator])

    # deeplearning
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    response = frame.ncol - 1
    predictors = list(range(response))
    estimator = H2ODeepLearningEstimator(distribution='gaussian',
                                         seed=seed,
                                         hidden=[10, 10, 10])
    grabRuntimeInfo(estimator, frame, predictors, response)
    cleanUp([frame, estimator])

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble. Not an iterative algo."
    )

    # GBM run
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    response = frame.ncol - 1
    predictors = list(range(response))
    frame[response] = frame[response].round().asfactor()
    estimator = H2OGradientBoostingEstimator(distribution="multinomial",
                                             seed=seed)
    grabRuntimeInfo(estimator, frame, predictors, response)
    cleanUp([estimator])

    # GLM run
    estimator = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(estimator, frame, predictors, response)
    cleanUp([estimator])

    # naivebayes, not iterative
    print(
        "******************** Skip testing Naives Bayes. Not an iterative algo."
    )

    # random forest
    # NOTE(review): unlike GBM and GLM above, no response index is passed
    # here - confirm this is what grabRuntimeInfo expects.
    estimator = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([estimator, frame])

    # GLRM, do not make sense to stop in the middle of an iteration
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    predictors = list(range(frame.ncol))
    estimator = H2OGeneralizedLowRankEstimator(k=10,
                                               loss="Quadratic",
                                               gamma_x=0.3,
                                               gamma_y=0.3,
                                               transform="STANDARDIZE",
                                               recover_svd=True)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([frame, estimator])

    failures = sum(model_within_max_runtime)
    if failures > 0:
        sys.exit(1)
def test_PCA_grid_search_over_params(self):
    """
    Grid-search PCA over ``self.final_hyper_params`` and cross-check each model.

    Steps performed:
    1. Build PCA models with H2OGridSearch and record the number of models in
       ``self.correct_model_number``.  The test is marked failed when more
       models are built than ``self.possible_number_models``.
    2. For each grid model, fetch its hyper-parameters, rebuild the model
       manually and compare a metric summed from the first row of the two
       ``model_summary`` tables.  A relative difference above
       ``self.allowed_diff`` only prints a warning.
    3. Sum the manual build times and compare them with a limit derived from
       the ``max_runtime_secs`` values, padded by
       ``self.extra_time_fraction``.  Exceeding the limit marks the test
       failed.

    Failures are counted in ``self.test_failed``; nothing is raised.  An
    exception is counted as a failure only when
    ``self.possible_number_models`` is positive.
    """
    print("*******************************************************************************************")
    print("test_PCA_grid_search_over_params for PCA ")
    h2o.cluster_info()

    try:
        print("Hyper-parameters used here is {0}".format(self.final_hyper_params))

        # start grid search
        grid_model = H2OGridSearch(H2OPCA(pca_method=self.pca_method), hyper_params=self.final_hyper_params)
        grid_model.train(x=self.x_indices, training_frame=self.training1_data)

        self.correct_model_number = len(grid_model)  # store number of models built

        # make sure the correct number of models are built by gridsearch
        if (self.correct_model_number - self.possible_number_models) > 0.9:  # wrong grid model number
            self.test_failed += 1
            print("test_PCA_grid_search_over_params for PCA failed: number of models built by gridsearch: {0} "
                  "does not equal to all possible combinations of hyper-parameters: "
                  "{1}".format(self.correct_model_number, self.possible_number_models))
        else:
            # add parameters into params_dict. Use this to manually build model
            params_dict = {"pca_method": self.pca_method}
            total_run_time_limits = 0.0  # calculate upper bound of max_runtime_secs
            true_run_time_limits = 0.0
            manual_run_runtime = 0.0

            # compare performance metric of model built by gridsearch with manually built model
            for each_model in grid_model:
                params_list = grid_model.get_hyperparams_dict(each_model._id)
                params_list.update(params_dict)

                model_params = dict()

                # need to taken out max_runtime_secs from model parameters, it is now set in .train()
                if "max_runtime_secs" in params_list:
                    model_params["max_runtime_secs"] = params_list["max_runtime_secs"]
                    max_runtime = params_list["max_runtime_secs"]
                    del params_list["max_runtime_secs"]
                else:
                    max_runtime = 0

                # make sure manual model was provided the same max_runtime_secs as the grid model
                # NOTE(review): this value is not read afterwards; the call is kept as it was.
                each_model_runtime = pyunit_utils.find_grid_runtime([each_model])

                manual_model = H2OPCA(**params_list)
                manual_model.train(x=self.x_indices, training_frame=self.training1_data, **model_params)

                # collect the time taken to manually built all models
                model_runtime = pyunit_utils.find_grid_runtime([manual_model])  # time taken to build this model
                manual_run_runtime += model_runtime

                if max_runtime > 0:
                    # shortest possible time it takes to build this model
                    if max_runtime < self.model_run_time:
                        total_run_time_limits += model_runtime
                    else:
                        total_run_time_limits += max_runtime

                true_run_time_limits += max_runtime

                # compute and compare test metrics between the two models
                # NOTE(review): the slice [1:k] covers k - 1 entries - confirm the last one is skipped on purpose.
                grid_model_metrics = \
                    sum(each_model._model_json["output"]["model_summary"].cell_values[0][1:params_list["k"]])
                manual_model_metrics = \
                    sum(manual_model._model_json["output"]["model_summary"].cell_values[0][1:params_list["k"]])

                # just compare the mse in this case within tolerance:
                metrics_are_numeric = not (isinstance(grid_model_metrics, str) or
                                           isinstance(manual_model_metrics, str))
                if metrics_are_numeric and abs(grid_model_metrics) > 0:
                    # Divide by the magnitude so the sign of the metric cannot flip the relative difference.
                    relative_diff = abs(grid_model_metrics - manual_model_metrics) / abs(grid_model_metrics)
                    if relative_diff > self.allowed_diff:
                        print("test_PCA_grid_search_over_params for PCA warning: grid search model mdetric ({0}) "
                              "and manually built H2O model metric ({1}) differ too much"
                              "!".format(grid_model_metrics, manual_model_metrics))

            total_run_time_limits = max(total_run_time_limits, true_run_time_limits) * (1+self.extra_time_fraction)

            # make sure the max_runtime_secs is working to restrict model built time
            if not (manual_run_runtime <= total_run_time_limits):
                self.test_failed += 1
                print("test_PCA_grid_search_over_params for PCA failed: time taken to manually build models is {0}."
                      " Maximum allowed time is {1}".format(manual_run_runtime, total_run_time_limits))
            else:
                print("time taken to manually build all models is {0}. Maximum allowed time is "
                      "{1}".format(manual_run_runtime, total_run_time_limits))

            if self.test_failed == 0:
                print("test_PCA_grid_search_over_params for PCA has passed!")
    except Exception as e:
        # An exception only counts as a failure when at least one model was expected to build.
        if self.possible_number_models > 0:
            print("test_PCA_grid_search_over_params for PCA failed: exception ({0}) was thrown for no reason.".format(e))
            self.test_failed += 1
def setup_model(self):
    """
    Prepare the grid-search hyper-parameters used by the grid-search test.

    1. Train a bare-bone PCA model, record its run time in
       ``self.model_run_time`` and extract the griddable parameters, their
       types and defaults from the model's parameter list.
    2. Let ``pyunit_utils.gen_grid_search`` generate hyper-parameter values,
       honouring ``self.exclude_parameter_lists``.
    3. Scale ``max_runtime_secs`` by ``self.time_scale * self.model_run_time``
       and ``max_iterations`` by ``self.max_iter_scale``.
    4. Let ``pyunit_utils.check_and_count_models`` produce
       ``self.final_hyper_params`` and ``self.possible_number_models``, then
       force ``max_runtime_secs`` and ``k`` into the final set when they were
       generated but dropped, adjusting the expected model count.
    5. Write the final hyper-parameters out as JSON.

    :return: None
    """
    # build bare bone model to get all parameters
    base_model = H2OPCA(k=10, transform="NONE", pca_method=self.pca_method)
    base_model.train(x=self.x_indices, training_frame=self.training1_data)

    # find model train time
    self.model_run_time = pyunit_utils.find_grid_runtime([base_model])
    print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

    # grab all gridable parameters and its type
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = gridables

    # randomly generate griddable parameters including values outside legal range, like setting alpha values to
    # be outside legal range of 0 and 1 and etc
    parameter_names = base_model.full_parameters.keys()
    int_value_count = random.randint(1, self.max_int_number)
    real_value_count = random.randint(1, self.max_real_number)
    generated = pyunit_utils.gen_grid_search(parameter_names, self.hyper_params,
                                             self.exclude_parameter_lists,
                                             self.gridable_parameters, self.gridable_types,
                                             self.gridable_defaults,
                                             int_value_count, self.max_int_val, self.min_int_val,
                                             real_value_count, self.max_real_val, self.min_real_val)
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = generated

    # scale the max_runtime_secs parameters
    runtime_factor = self.time_scale * self.model_run_time
    if "max_runtime_secs" in self.hyper_params:
        self.hyper_params["max_runtime_secs"] = [
            runtime_factor * secs for secs in self.hyper_params["max_runtime_secs"]
        ]

    if 'max_iterations' in self.hyper_params:
        self.hyper_params['max_iterations'] = [
            self.max_iter_scale * iters for iters in self.hyper_params['max_iterations']
        ]

    # generate a new final_hyper_params which only takes a subset of all griddable parameters while
    # hyper_params take all griddable parameters and generate the grid search hyper-parameters
    [self.possible_number_models, self.final_hyper_params] = \
        pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                            self.params_more_than_one, self.params_zero_positive,
                                            self.max_grid_model)

    # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
    runtime_dropped = "max_runtime_secs" not in self.final_hyper_params
    if runtime_dropped and "max_runtime_secs" in self.hyper_params:
        self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
        valid_runtimes = [secs for secs in self.hyper_params["max_runtime_secs"] if secs >= 0]
        self.possible_number_models = self.possible_number_models * len(valid_runtimes)

    # must include k in hyper-parameters
    k_dropped = 'k' not in self.final_hyper_params
    if k_dropped and 'k' in self.hyper_params:
        self.final_hyper_params["k"] = self.hyper_params["k"]
        valid_ks = [k for k in self.hyper_params["k"] if k > 0]
        self.possible_number_models = self.possible_number_models * len(valid_ks)

    # write out the hyper-parameters used into json files.
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                             self.final_hyper_params)
def pca_scoring_history_importance():
    """
    Check that PCA returns importance and a non-empty scoring history for every pca_method.

    A transform is picked at random and a k=3 GramSVD model is trained on all
    columns of the australia data as the reference.  Randomized, Power and
    GLRM models are then trained with the same transform, and their importance
    and eigenvector tables are compared against the reference with
    method-specific tolerances.  Finally the scoring history table of each of
    the four models must contain at least one row.
    """
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    print("Importing australia.csv data...\n")
    australia = h2o.upload_file(pyunit_utils.locate("smalldata/extdata/australia.csv"))
    col_indices = list(range(0, australia.ncol))
    importance_headers = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]

    print("transform is {0}.\n".format(transformN))

    # checking out PCA with GramSVD
    print("@@@@@@ Building PCA with GramSVD...\n")
    gramSVD = H2OPCA(k=3, transform=transformN)
    gramSVD.train(x=col_indices, training_frame=australia)

    # check PCA with PCA set to Randomized
    print("@@@@@@ Building PCA with Randomized...\n")
    randomizedPCA = H2OPCA(k=3, transform=transformN, pca_method="Randomized", compute_metrics=True,
                           use_all_factor_levels=True)
    randomizedPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and Randomized...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             randomizedPCA._model_json["output"]["importance"],
                                             importance_headers, tolerance=1e-3)

    print("@@@@@@ Comparing eigenvectors between GramSVD and Randomized...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             randomizedPCA._model_json["output"]["eigenvectors"],
                                             randomizedPCA._model_json["output"]["names"], tolerance=5e-2,
                                             check_sign=True)

    # check PCA with PCA set to Power
    print("@@@@@@ Building PCA with Power...\n")
    powerPCA = H2OPCA(k=3, transform=transformN, pca_method="Power", compute_metrics=True,
                      use_all_factor_levels=True)
    powerPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             powerPCA._model_json["output"]["importance"],
                                             importance_headers)

    print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["names"], tolerance=1e-5,
                                             check_sign=True)

    # check PCA with PCA set to GLRM
    print("@@@@@@ Building PCA with GLRM...\n")
    glrmPCA = H2OPCA(k=3, transform=transformN, pca_method="GLRM", compute_metrics=True,
                     use_all_factor_levels=True)
    glrmPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             glrmPCA._model_json["output"]["importance"],
                                             importance_headers, tolerance=2e-2)

    print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             glrmPCA._model_json["output"]["eigenvectors"],
                                             glrmPCA._model_json["output"]["names"], tolerance=2e-1,
                                             check_sign=True)

    # make sure we find the scoring history and it is not empty for all the PCA modes
    # just check and make sure the cell_values exceed 0
    assert len(gramSVD._model_json["output"]["scoring_history"].cell_values) > 0, \
        "PCA Scoring history setting pca_method to GramSVD is empty."
    # The failure message names the method that was actually checked.
    assert len(powerPCA._model_json["output"]["scoring_history"].cell_values) > 0, \
        "PCA Scoring history setting pca_method to Power is empty."
    assert len(randomizedPCA._model_json["output"]["scoring_history"].cell_values) > 0, \
        "PCA Scoring history setting pca_method to Randomized is empty."
    assert len(glrmPCA._model_json["output"]["scoring_history"].cell_values) > 0, \
        "PCA Scoring history setting pca_method to GLRM is empty."
def algo_max_runtime_secs():
    '''
    Run ``checkColumnNamesTypesReturned`` on one model per H2O algorithm.

    The original intent, per PUBDEV-5801, is to ensure that column names and
    column types are returned in the model output for every supported
    algorithm.
    '''
    seed = 12345

    def split_last_column(frame):
        # The last column name becomes the response; the rest are predictors.
        predictors = frame.names
        response = predictors[-1]
        predictors.remove(response)
        return predictors, response

    print("Checking GLM.....")
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    estimator = H2OGeneralizedLinearEstimator(family="binomial",
                                              alpha=1.0,
                                              lambda_search=False,
                                              max_iterations=2,
                                              seed=seed)
    checkColumnNamesTypesReturned(
        cars,
        estimator,
        ["displacement", "power", "weight", "acceleration", "year"],
        y_index="economy_20mpg")

    print("Checking GLRM.....")
    iris = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    glrm_estimator = H2OGeneralizedLowRankEstimator(k=3,
                                                    loss="Quadratic",
                                                    gamma_x=0.5,
                                                    gamma_y=0.5,
                                                    transform="STANDARDIZE")
    checkColumnNamesTypesReturned(iris, glrm_estimator, iris.names)

    print("Checking NaiveBayes......")
    estimator = H2ONaiveBayesEstimator(laplace=0.25)
    predictors, response = split_last_column(iris)
    checkColumnNamesTypesReturned(iris, estimator, predictors, y_index=response)

    # deeplearning
    print("Checking deeplearning.....")
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    predictors, response = split_last_column(frame)
    estimator = H2ODeepLearningEstimator(distribution='gaussian',
                                         seed=seed,
                                         hidden=[10, 10, 10])
    checkColumnNamesTypesReturned(frame, estimator, predictors, y_index=response)

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble. Test done in pyunit_stackedensemble_regression.py."
    )

    # GBM run
    print("Checking GBM.....")
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    predictors, response = split_last_column(frame)
    frame[response] = frame[response].round().asfactor()
    estimator = H2OGradientBoostingEstimator(distribution="multinomial",
                                             seed=seed)
    checkColumnNamesTypesReturned(frame, estimator, predictors, y_index=response)

    # random forest, on the same frame and columns as GBM
    print("Checking Random Forest.....")
    estimator = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    checkColumnNamesTypesReturned(frame, estimator, predictors, y_index=response)

    # PCA
    print("Checking PCA.....")
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    predictors = frame.names
    estimator = H2OPCA(k=10,
                       transform="STANDARDIZE",
                       pca_method="Power",
                       compute_metrics=True)
    checkColumnNamesTypesReturned(frame, estimator, predictors)

    # kmeans
    print("Checking kmeans....")
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    predictors = frame.names
    estimator = H2OKMeansEstimator(k=10)
    checkColumnNamesTypesReturned(frame, estimator, predictors)

    # word2vec
    print("Checking word2vec....")
    text_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                                 header=1,
                                 col_types=["string"])
    # NOTE(review): this subset is built but the full frame is what gets
    # passed below - confirm which one is intended.
    used = text_frame[0:170000, 0]
    w2v_estimator = H2OWord2vecEstimator()
    checkColumnNamesTypesReturned(text_frame, w2v_estimator, [], 0)
def pca_wideDataset_rotterdam():
    """
    Compare PCA results on the Rotterdam dataset across PCA implementations.

    A GramSVD PCA model (the default pca_method is assumed to be GramSVD, going
    by the variable names and log messages) serves as the reference.  One
    transform and one of three experiments (GLRM, Power, Randomized) are picked
    at random on every run; the chosen model's importance table and
    eigenvectors are then compared against the reference model.
    """
    h2o.remove_all()

    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(rotterdamH2O.names) - y)  # every column except the response

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(transformN))

    # exactly one of the three experiments below is enabled per run
    buildModel = [False, False, False]
    buildModel[randint(0, len(buildModel) - 1)] = True

    importance_columns = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]

    expNum = 0
    if buildModel[expNum]:
        # special test with GLRM.  Need use_all_levels to be true
        print("------ Testing GLRM PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345,
                         use_all_factor_levels=True)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        glrmPCA = H2OGeneralizedLowRankEstimator(k=8,
                                                 transform=transformN,
                                                 seed=12345,
                                                 init="Random",
                                                 max_iterations=10,
                                                 recover_svd=True,
                                                 regularization_x="None",
                                                 regularization_y="None")
        glrmPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            glrmPCA._model_json["output"]["importance"],
            importance_columns,
            tolerance=1,
            check_all=False)

        # compare singular vectors
        print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            glrmPCA._model_json["output"]["eigenvectors"],
            glrmPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
        h2o.remove(gramSVD)
        h2o.remove(glrmPCA)

    expNum = expNum + 1
    if buildModel[expNum]:
        print("------ Testing Power PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        powerPCA = H2OPCA(k=8,
                          impute_missing=True,
                          transform=transformN,
                          pca_method="Power",
                          seed=12345)
        powerPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            powerPCA._model_json["output"]["importance"],
            importance_columns,
            tolerance=1e-6,
            check_all=False)

        # compare singular vectors
        print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            powerPCA._model_json["output"]["eigenvectors"],
            powerPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)

    expNum = expNum + 1
    if buildModel[expNum]:
        print("------ Testing Randomized PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        randomizedPCA = H2OPCA(k=8,
                               impute_missing=True,
                               transform=transformN,
                               pca_method="Randomized",
                               seed=12345,
                               max_iterations=5)
        randomizedPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print(
            "@@@@@@ Comparing eigenvalues between GramSVD and Randomized...\n"
        )
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            randomizedPCA._model_json["output"]["importance"],
            importance_columns,
            tolerance=1e-1,
            check_all=False)

        # compare singular vectors (the banner used to say "Power" here)
        print(
            "@@@@@@ Comparing eigenvectors between GramSVD and Randomized...\n"
        )
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            randomizedPCA._model_json["output"]["eigenvectors"],
            randomizedPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)

    h2o.remove_all()
def algo_max_runtime_secs():
    """
    Ensure that max_runtime_secs can restrict the model training time for the
    H2O algorithms exercised below.  See PUBDEV-4702.

    Every section loads a dataset, builds an estimator and passes it to
    grabRuntimeInfo together with an error bound and a second numeric factor,
    then releases the objects with cleanUp.  The module-level names err_bound
    and model_within_max_runtime are only read here.  The process exits with
    status 1 when sum(model_within_max_runtime) is greater than zero.
    """
    seed = 12345

    # GLRM, do not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10,
                                           loss="Quadratic",
                                           gamma_x=0.3,
                                           gamma_y=0.3,
                                           transform="STANDARDIZE")
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # deeplearning: the last column is the response
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble. Not an iterative algo."
    )

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    # the multinomial response has to be a factor column
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])  # the frame is reused by the GLM and random forest runs

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # naivebayes, not iterative
    print(
        "******************** Skip testing Naives Bayes. Not an iterative algo."
    )

    # random forest
    model = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    # Pass the response explicitly, as the GBM and GLM runs on this frame do;
    # the call used to rely on grabRuntimeInfo's default response argument.
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model, training1_data])

    # deepwater, only when the backend is available
    if H2ODeepWaterEstimator.available():
        training1_data = h2o.import_file(
            path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        training1_data = training1_data.drop('Site')
        training1_data['Angaus'] = training1_data['Angaus'].asfactor()
        y_index = "Angaus"
        x_indices = list(range(1, training1_data.ncol))
        model = H2ODeepWaterEstimator(epochs=50,
                                      hidden=[4096, 4096, 4096],
                                      hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices,
                        y_index)
        cleanUp([training1_data, model])

    # PCA
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True)
    grabRuntimeInfo(err_bound * 5, 1.2, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(err_bound * 2, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # word2vec, run on the first 170000 rows of the single text column
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
import h2o
import pandas
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA

if __name__ == '__main__':
    # Connect to (or start) an H2O cluster first: the h2o calls below need a
    # live connection and fail without one.
    h2o.init()

    df = h2o.import_file("complete_df.csv")
    cols = df.columns

    # Train with the Power pca_method on every column except the first one
    pca = H2OPCA(k=1,
                 transform="STANDARDIZE",
                 pca_method="Power",
                 use_all_factor_levels=True,
                 impute_missing=True)
    pca.train(x=cols[1:], training_frame=df)

    # View the importance of components
    print(pca.varimp(use_pandas=False))

    # View the eigenvectors, sorted by their loading on the first component
    eg = pca.rotation().as_data_frame()
    print(eg.sort_values('pc1', ascending=False))

    # To compare against the GLRM pca_method, train a second H2OPCA model with
    # pca_method="GLRM" (e.g. k=3, transform="STANDARDIZE",
    # use_all_factor_levels=True, impute_missing=True) on
    # https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/birds.csv
    # using x=list(range(4)).
def pca_max_k():
    """
    Exercise H2OPCA with k=-1 for the different pca_method values.

    One of four sub-tests is picked at random on every run:
      test 1: GramSVD vs Power on the Rotterdam dataset; compares the importance
              tables and checks the number of eigenvalues against the actual
              value of k reported by the model.
      test 2: Randomized vs Power on prostate_cat.csv.
      test 3: as test 2, but with use_all_factor_levels=True (rank deficient
              dataset according to the original comments).
      test 4: GLRM on prostate_cat.csv; checks the number of eigenvalues.

    Both datasets are uploaded regardless of which sub-test is selected.
    """
    importance_columns = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]
    # str.format is used for the failure messages because the counts are ints;
    # concatenating them to a str would raise TypeError instead of reporting
    # the failed assertion.
    fail_message = "PCA {0} FAIL: expected number of eigenvalues: {1}, actual: {2}."

    data = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(data.names) - y)  # every column except the response

    # exactly one of the four sub-tests below is enabled per run
    buildModel = [False, False, False, False]
    buildModel[randint(0, len(buildModel) - 1)] = True

    # test 1
    if buildModel[0]:
        pcaGramSVD = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GramSVD", impute_missing=True,
                            max_iterations=100)
        pcaGramSVD.train(x, training_frame=data)
        pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True,
                          max_iterations=100, seed=12345)
        pcaPower.train(x, training_frame=data)

        # compare singular values and stuff with GramSVD
        print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaGramSVD._model_json["output"]["importance"],
                                                 pcaPower._model_json["output"]["importance"],
                                                 importance_columns,
                                                 tolerance=1)

        correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
        # the first cell of each importance row is not an eigenvalue, hence -1
        gramSVDNum = len(pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
        powerNum = len(pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
        assert correctEigNum == gramSVDNum, fail_message.format("GramSVD", correctEigNum, gramSVDNum)
        assert correctEigNum == powerNum, fail_message.format("Power", correctEigNum, powerNum)

    # Randomized and GLRM does not have wide dataset implementation.  Check with smaller datasets
    # test 2
    data = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    x = list(set(data.names))
    if buildModel[1]:
        pcaRandomized = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized", impute_missing=True,
                               max_iterations=100, seed=12345)
        pcaRandomized.train(x, training_frame=data)

        pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True,
                          max_iterations=100, seed=12345)
        pcaPower.train(x, training_frame=data)

        # eigenvalues between the PCA and Randomize should be close, I hope...
        print("@@@@@@ Comparing eigenvalues between Randomized and Power PCA...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomized._model_json["output"]["importance"],
                                                 pcaPower._model_json["output"]["importance"],
                                                 importance_columns)

    # test 3
    if buildModel[2]:
        # should still work with rank deficient dataset
        pcaRandomizedF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized", use_all_factor_levels=True,
                                impute_missing=True, max_iterations=100, seed=12345)
        pcaRandomizedF.train(x, training_frame=data)

        # should still work with rank deficient dataset
        pcaPowerF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", use_all_factor_levels=True,
                           impute_missing=True, max_iterations=100, seed=12345)
        pcaPowerF.train(x, training_frame=data)

        # eigenvalues between the PCA and Randomize should be close with rank deficient dataset, I hope...
        print("@@@@@@ Comparing eigenvalues between Randomized and Power PCA with rank deficient dataset...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomizedF._model_json["output"]["importance"],
                                                 pcaPowerF._model_json["output"]["importance"],
                                                 importance_columns)

    # test 4
    if buildModel[3]:
        pcaGLRM = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True,
                         max_iterations=100, seed=12345)
        pcaGLRM.train(x, training_frame=data)
        correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
        glrmNum = len(pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
        assert correctEigNum == glrmNum, fail_message.format("GLRM", correctEigNum, glrmNum)