Esempio n. 1
0
def pca_arrests():
    """Check that every PCA implementation yields matching eigenvectors on USArrests.

    The eigenvectors of the first implementation trained act as the reference;
    each later implementation is compared against that reference with a
    tolerance of 1e-6 and the comparison helper's ``check_sign`` flag enabled.
    """
    print("Importing USArrests.csv data...")
    data_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests_frame = h2o.upload_file(data_path)

    print(
        "Testing to see whether the trained PCA are essentially the same using different implementation..."
    )

    implementations = ("MTJ_EVD_DENSEMATRIX", "MTJ_EVD_SYMMMATRIX",
                       "MTJ_SVD_DENSEMATRIX", "JAMA")
    reference_vectors = None
    for impl_name in implementations:
        print("Run PCA with implementation: " + impl_name)
        pca_model = H2OPCA(k=4, pca_impl=impl_name, seed=1234)
        pca_model.train(x=list(range(4)), training_frame=arrests_frame)
        model_output = pca_model._model_json["output"]
        current_vectors = model_output["eigenvectors"]

        # Nothing to compare against yet: remember this result and move on.
        if reference_vectors is None:
            reference_vectors = current_vectors
            continue

        # Compare against the reference to see if they are fundamentally the same.
        pyunit_utils.assert_H2OTwoDimTable_equal(reference_vectors,
                                                 current_vectors,
                                                 model_output["names"],
                                                 tolerance=1e-6,
                                                 check_sign=True,
                                                 check_all=False)
def glrm_arrests():
    """Verify that PCA and GLRM report matching importance tables on USArrests.

    Both models use k=4 with the STANDARDIZE transform; GLRM is configured with
    quadratic loss, zero regularization weights, SVD initialization and
    ``recover_svd=True``.  The cell values of the two importance tables must
    agree within 1e-4.
    """
    print("Importing USArrests.csv data...")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests_frame = h2o.upload_file(arrests_path)

    pca_model = H2OPCA(k=4, transform="STANDARDIZE")
    pca_model.train(x=list(range(4)), training_frame=arrests_frame)
    pca_model.summary()
    pca_model.show()

    print("H2O GLRM on standardized data with quadratic loss:\n")
    glrm_model = H2OGeneralizedLowRankEstimator(
        k=4,
        transform="STANDARDIZE",
        loss="Quadratic",
        gamma_x=0,
        gamma_y=0,
        init="SVD",
        recover_svd=True)
    glrm_model.train(x=arrests_frame.names, training_frame=arrests_frame)
    glrm_model.show()

    # Table values must be the same between PCA and GLRM.
    pca_cells = pca_model._model_json["output"]["importance"]._cell_values
    glrm_cells = glrm_model._model_json["output"]["importance"]._cell_values
    tables_match = pyunit_utils.equal_2d_tables(pca_cells, glrm_cells, tolerance=1e-4)
    assert tables_match, "PCA and GLRM variance metrics do not agree.  Fix it please."

    sys.stdout.flush()
Esempio n. 3
0
def screeplot_test():
    """Train a standardized 4-component PCA on AustraliaCoast and draw both scree plot types."""
    # Forwarded to every screeplot call; presumably selects a non-interactive
    # plotting backend -- confirm against the screeplot implementation.
    plot_options = {'server': True}

    coast_path = pyunit_utils.locate("smalldata/pca_test/AustraliaCoast.csv")
    coast_frame = h2o.upload_file(coast_path)

    pca_model = H2OPCA(k=4, transform="STANDARDIZE")
    pca_model.train(x=list(range(8)), training_frame=coast_frame)

    for plot_type in ("barplot", "lines"):
        pca_model.screeplot(type=plot_type, **plot_options)
Esempio n. 4
0
def pca_pubdev_4167_OOM():
    """
    Compare GramSVD (default method) and Power PCA on a customer dataset.

    The data file is read from a hard-coded local path, so this is a manual
    verification aid rather than a regular test.  One of the five transforms is
    picked at random on every run, and both models use k equal to the number of
    columns in the frame.
    """
    h2o.remove_all()

    # All transforms are candidates; one is drawn at random per run.
    transform_choices = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transform_choices[randint(0, len(transform_choices) - 1)]
    print("transform used on dataset is {0}.\n".format(chosen_transform))

    data_path = pyunit_utils.locate(
        "/Users/wendycwong/gitBackup/SDatasets/pubdev_4167_Avkash/m120K.tar")
    customer_frame = h2o.import_file(path=data_path)  # Nidhi: import may not work

    gram_model = H2OPCA(k=customer_frame.ncols, transform=chosen_transform)
    gram_model.train(x=list(range(0, customer_frame.ncols)),
                     training_frame=customer_frame)

    power_model = H2OPCA(k=customer_frame.ncols,
                         transform=chosen_transform,
                         pca_method="Power")
    power_model.train(x=list(range(0, customer_frame.ncols)),
                      training_frame=customer_frame)

    # Importance tables (singular values etc.) of the two methods.
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    importance_headers = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_model._model_json["output"]["importance"],
        power_model._model_json["output"]["importance"],
        importance_headers,
        tolerance=1e-5,
        check_all=False)

    # Singular vectors of the two methods.
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    power_output = power_model._model_json["output"]
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_model._model_json["output"]["eigenvectors"],
        power_output["eigenvectors"],
        power_output["names"],
        tolerance=1e-1,
        check_sign=True)
Esempio n. 5
0
def pca_prostate():
    """Check that Power PCA with k=1 and with k left unset give the same projection.

    Both models are trained on columns 2..8 of prostate.csv (after four columns
    are converted to factors) with the same seed; their predictions on the
    training frame are compared with a tolerance of 1e-10.
    """
    print("Importing prostate.csv data...\n")
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate_frame = h2o.upload_file(prostate_path)

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate_frame[column] = prostate_frame[column].asfactor()
    prostate_frame.describe()

    explicit_k_model = H2OPCA(k=1, transform="NONE", pca_method="Power", seed=1234)
    explicit_k_model.train(x=list(range(2, 9)), training_frame=prostate_frame)

    default_k_model = H2OPCA(transform="NONE", pca_method="Power", seed=1234)
    default_k_model.train(x=list(range(2, 9)), training_frame=prostate_frame)

    explicit_k_pred = explicit_k_model.predict(prostate_frame)
    default_k_pred = default_k_model.predict(prostate_frame)
    pyunit_utils.compare_frames_local(explicit_k_pred, default_k_pred, prob=1, tol=1e-10)
Esempio n. 6
0
def pca_max_k():
    """
    Check PCA models built with k=-1 across the GramSVD, Power, Randomized and GLRM methods.

    For GramSVD, Power and GLRM the number of eigenvalues in the importance
    table must equal the actual value of ``k`` reported in the model's
    ``full_parameters``.  The importance tables of GramSVD vs. Power and of
    Randomized vs. Power are also compared.

    Raises:
        AssertionError: when an eigenvalue count differs from the expected k
            (or when a table comparison helper rejects the tables).
    """
    data = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/SDSS_quasar.txt.zip"))
    x = list(set(data.names))
    importance_headers = ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"]

    def eigenvalue_count(pca_model):
        # The first cell of the importance row is skipped, hence the -1.
        return len(pca_model._model_json["output"]["importance"].cell_values[0]) - 1

    # The counts are integers, so the failure messages are built with format();
    # concatenating them onto a string would raise TypeError and hide the
    # actual assertion failure.
    fail_message = "PCA {0} FAIL: expected number of eigenvalues: {1}, actual: {2}."

    pcaGramSVD = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GramSVD", impute_missing=True, max_iterations=100)
    pcaGramSVD.train(x, training_frame=data)
    pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True, max_iterations=100,
                      seed=12345)
    pcaPower.train(x, training_frame=data)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaGramSVD._model_json["output"]["importance"],
                                             pcaPower._model_json["output"]["importance"],
                                             importance_headers,
                                             tolerance=1)

    correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
    gramSVDNum = eigenvalue_count(pcaGramSVD)
    powerNum = eigenvalue_count(pcaPower)
    assert correctEigNum == gramSVDNum, fail_message.format("GramSVD", correctEigNum, gramSVDNum)
    assert correctEigNum == powerNum, fail_message.format("Power", correctEigNum, powerNum)

    pcaRandomized = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized",
                           impute_missing=True, max_iterations=100, seed=12345)
    pcaRandomized.train(x, training_frame=data)

    # eigenvalues between the Power and Randomized methods should be close
    print("@@@@@@  Comparing eigenvalues between Randomized and Power PCA...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomized._model_json["output"]["importance"],
                                             pcaPower._model_json["output"]["importance"],
                                             importance_headers)

    pcaGLRM = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True,
                     max_iterations=100, seed=12345)
    pcaGLRM.train(x, training_frame=data)
    correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
    glrmNum = eigenvalue_count(pcaGLRM)
    assert correctEigNum == glrmNum, fail_message.format("GLRM", correctEigNum, glrmNum)
Esempio n. 7
0
def pca_wideDataset_rotterdam_pcapower():
    """
    Compare default-method PCA with Power PCA on the Rotterdam dataset.

    Both models use k=8, missing-value imputation, the STANDARDIZE transform and
    the same seed; every column except "relapse" is a predictor.  Importance
    tables are compared with tolerance 1e-6 and eigenvectors with tolerance 2e-5.
    """
    eigenvector_tol = 2e-5
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdam_path = pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip")
    rotterdam_frame = h2o.upload_file(rotterdam_path)

    # Every column but the response is used as a predictor.
    response_columns = set(["relapse"])
    predictors = list(set(rotterdam_frame.names) - response_columns)

    print("------  Testing Power PCA --------")
    gram_model = H2OPCA(k=8, impute_missing=True, transform="STANDARDIZE", seed=12345)
    gram_model.train(x=predictors, training_frame=rotterdam_frame)

    power_model = H2OPCA(k=8,
                         impute_missing=True,
                         transform="STANDARDIZE",
                         pca_method="Power",
                         seed=12345)
    power_model.train(x=predictors, training_frame=rotterdam_frame)

    # Importance tables (singular values etc.) against the first model.
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    importance_headers = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_model._model_json["output"]["importance"],
        power_model._model_json["output"]["importance"],
        importance_headers,
        tolerance=1e-6,
        check_all=False)

    # Singular vectors against the first model.
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    power_output = power_model._model_json["output"]
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_model._model_json["output"]["eigenvectors"],
        power_output["eigenvectors"],
        power_output["names"],
        tolerance=eigenvector_tol,
        check_sign=True,
        check_all=False)
Esempio n. 8
0
def pca_arrests():
    """Train PCA on the four USArrests columns once for each k in 1..4.

    No result is asserted here; the function only needs each training run to
    complete.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    for k in range(1, 5):
        # Report the k actually passed to the model; the message previously
        # printed k - 1 (0..3) while the models were built with k = 1..4.
        print("H2O PCA with " + str(k) + " dimensions:\n")
        print("Using these columns: {0}".format(arrestsH2O.names))
        pca_h2o = H2OPCA(k=k)
        pca_h2o.train(x=list(range(4)), training_frame=arrestsH2O)
Esempio n. 9
0
def pca_pubdev_4314():
    """Check the return types of ``varimp`` on a GramSVD PCA model.

    ``varimp(use_pandas=True)`` must be a ``DataFrame`` and ``varimp()`` must be
    a ``list``.
    """
    print("Importing prostate_cat.csv data...\n")
    data_path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")
    prostate_frame = h2o.upload_file(data_path)
    prostate_frame.describe()

    print("PCA with k = 3, retx = FALSE, transform = 'STANDARDIZE'")
    pca_model = H2OPCA(k=3, transform="StANDARDIZE", pca_method="GramSVD")
    pca_model.train(x=list(range(0, 8)), training_frame=prostate_frame)
    print(pca_model.summary())

    # pandas flavour first, then the plain-list flavour
    varimp_frame = pca_model.varimp(use_pandas=True)
    assert_is_type(varimp_frame, DataFrame)

    varimp_rows = pca_model.varimp()
    print(varimp_rows)
    assert_is_type(varimp_rows, list)

    sys.stdout.flush()
Esempio n. 10
0
def pca_wideDataset_rotterdam_glrm():
    """
    Compare default-method PCA with a GLRM model on the Rotterdam dataset.

    Both models use k=8, the DEMEAN transform and the same seed; every column
    except "relapse" is a predictor.  The PCA model is built with
    ``use_all_factor_levels=True``.  Importance tables are compared with
    tolerance 1 and eigenvectors with tolerance 2e-5.
    """
    eigenvector_tol = 2e-5
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdam_path = pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip")
    rotterdam_frame = h2o.upload_file(rotterdam_path)

    # Every column but the response is used as a predictor.
    response_columns = set(["relapse"])
    predictors = list(set(rotterdam_frame.names) - response_columns)

    # special test with GLRM.  Need use_all_levels to be true
    print("------  Testing GLRM PCA --------")
    gram_model = H2OPCA(k=8,
                        impute_missing=True,
                        transform="DEMEAN",
                        seed=12345,
                        use_all_factor_levels=True)
    gram_model.train(x=predictors, training_frame=rotterdam_frame)

    glrm_model = H2OGeneralizedLowRankEstimator(
        k=8,
        transform="DEMEAN",
        seed=12345,
        init="Random",
        recover_svd=True,
        regularization_x="None",
        regularization_y="None",
        max_iterations=11)
    glrm_model.train(x=predictors, training_frame=rotterdam_frame)

    # Importance tables (singular values etc.) against the PCA model.
    print(
        "@@@@@@  Comparing eigenvectors and eigenvalues between GramSVD and GLRM...\n"
    )
    importance_headers = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_model._model_json["output"]["importance"],
        glrm_model._model_json["output"]["importance"],
        importance_headers,
        tolerance=1,
        check_all=False)

    # Singular vectors against the PCA model.
    glrm_output = glrm_model._model_json["output"]
    pyunit_utils.assert_H2OTwoDimTable_equal(
        gram_model._model_json["output"]["eigenvectors"],
        glrm_output["eigenvectors"],
        glrm_output["names"],
        tolerance=eigenvector_tol,
        check_sign=True,
        check_all=False)
Esempio n. 11
0
def pca_scoring():
    """Train a DEMEAN PCA (k=4) on USArrests and score the training frame with it."""
    print("Importing arrests.csv data...")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests_frame = h2o.upload_file(arrests_path)

    print("Run PCA with transform = 'DEMEAN'")
    pca_model = H2OPCA(k=4, transform="DEMEAN")
    pca_model.train(x=list(range(4)), training_frame=arrests_frame)
    # TODO: pca_model.show()

    print("Project training data into eigenvector subspace")
    projection = pca_model.predict(arrests_frame)

    print("H2O Projection:")
    projection.head()
def glrm_iris():
    """
    Compare PCA run with ``pca_method="GLRM"`` against a standalone GLRM model on iris.

    Both models use k=5, the STANDARDIZE transform and seed 21.  Importance
    tables and eigenvectors must agree within 1e-6, and the value at row 1,
    column 1 of the PCA importance table must not exceed 1.
    """
    print("Importing iris.csv data...")
    iris_frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    iris_frame.describe()

    print("@@@@@@  Building PCA with GramSVD...\n")
    pca_via_glrm = H2OPCA(k=5,
                          transform="STANDARDIZE",
                          pca_method="GLRM",
                          use_all_factor_levels=True,
                          seed=21)
    pca_via_glrm.train(x=iris_frame.names, training_frame=iris_frame)

    standalone_glrm = H2OGeneralizedLowRankEstimator(
        k=5,
        loss="Quadratic",
        transform="STANDARDIZE",
        recover_svd=True,
        seed=21)
    standalone_glrm.train(x=iris_frame.names, training_frame=iris_frame)

    pca_output = pca_via_glrm._model_json["output"]
    glrm_output = standalone_glrm._model_json["output"]

    # Importance tables (singular values etc.).
    print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
    importance_headers = [
        "Standard deviation", "Cumulative Proportion", "Cumulative Proportion"
    ]
    pyunit_utils.assert_H2OTwoDimTable_equal(pca_output["importance"],
                                             glrm_output["importance"],
                                             importance_headers,
                                             tolerance=1e-6)

    # Singular vectors.
    print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(pca_output["eigenvectors"],
                                             glrm_output["eigenvectors"],
                                             glrm_output["names"],
                                             tolerance=1e-6,
                                             check_sign=True)

    # check to make sure maximum proportional variance <= 1
    max_proportion = pca_via_glrm._model_json["output"]["importance"].cell_values[1][1]
    assert max_proportion <= 1, \
        "Expected value <= 1.0 but received {0}".format(max_proportion)
Esempio n. 13
0
def pca_prostate():
    """Train a Power PCA (k=3, transform NONE) on prostate columns 2..8 and project the data.

    Four columns are converted to factors before training.  Nothing is
    asserted; the projection's head is requested at the end.
    """
    print("Importing prostate.csv data...\n")
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate_frame = h2o.upload_file(prostate_path)

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate_frame[column] = prostate_frame[column].asfactor()
    prostate_frame.describe()

    print(
        "PCA on columns 3 to 9 with k = 3, retx = FALSE, transform = 'STANDARDIZE'"
    )

    pca_model = H2OPCA(k=3, transform="NONE", pca_method="Power")
    pca_model.train(x=list(range(2, 9)), training_frame=prostate_frame)
    projection = pca_model.predict(prostate_frame)

    print("Projection matrix:\n")
    projection.head()
Esempio n. 14
0
def algo_max_runtime_secs():
    """
    Make sure the various models do not crash when max_runtime_secs is set too
    short (see PUBDEV-4802).

    Each algorithm is handed to ``grabRuntimeInfo`` together with its training
    frame and column indices, then its objects are passed to ``cleanUp``.  The
    process exits with status 1 when the module-level
    ``model_within_max_runtime`` collection sums to more than zero.
    """
    global model_within_max_runtime
    rng_seed = 12345

    # --- word2vec ---
    text_frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                                 header=1,
                                 col_types=["string"])
    text_subset = text_frame[0:170000, 0]
    word2vec = H2OWord2vecEstimator()
    grabRuntimeInfo(word2vec, text_subset, [], 0)
    cleanUp([text_frame, text_subset, word2vec])

    # --- kmeans ---
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    predictors = list(range(frame.ncol))
    estimator = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([frame, estimator])

    # --- PCA, pca_method=Power (this frame is shared by the three PCA runs) ---
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    predictors = list(range(frame.ncol))
    estimator = H2OPCA(k=10,
                       transform="STANDARDIZE",
                       pca_method="Power",
                       compute_metrics=True)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([estimator])

    # --- PCA, pca_method=Randomized ---
    estimator = H2OPCA(k=10,
                       transform="STANDARDIZE",
                       pca_method="Randomized",
                       compute_metrics=True)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([estimator])

    # --- PCA, pca_method=GLRM ---
    estimator = H2OPCA(k=10,
                       transform="STANDARDIZE",
                       pca_method="GLRM",
                       compute_metrics=True,
                       use_all_factor_levels=True)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([estimator])

    # --- deeplearning (last column is the response) ---
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    response_index = frame.ncol - 1
    predictors = list(range(response_index))
    estimator = H2ODeepLearningEstimator(distribution='gaussian',
                                         seed=rng_seed,
                                         hidden=[10, 10, 10])
    grabRuntimeInfo(estimator, frame, predictors, response_index)
    cleanUp([frame, estimator])

    # stack ensemble, stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # --- GBM (this frame is shared by the GBM, GLM and random forest runs) ---
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    response_index = frame.ncol - 1
    predictors = list(range(response_index))
    frame[response_index] = frame[response_index].round().asfactor()
    estimator = H2OGradientBoostingEstimator(distribution="multinomial",
                                             seed=rng_seed)
    grabRuntimeInfo(estimator, frame, predictors, response_index)
    cleanUp([estimator])

    # --- GLM ---
    estimator = H2OGeneralizedLinearEstimator(family='multinomial',
                                              seed=rng_seed)
    grabRuntimeInfo(estimator, frame, predictors, response_index)
    cleanUp([estimator])

    # naivebayes, not iterative
    print(
        "******************** Skip testing Naives Bayes.  Not an iterative algo."
    )

    # --- random forest ---
    estimator = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([estimator, frame])

    # --- GLRM, does not make sense to stop in the middle of an iteration ---
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    predictors = list(range(frame.ncol))
    estimator = H2OGeneralizedLowRankEstimator(k=10,
                                               loss="Quadratic",
                                               gamma_x=0.3,
                                               gamma_y=0.3,
                                               transform="STANDARDIZE",
                                               recover_svd=True)
    grabRuntimeInfo(estimator, frame, predictors)
    cleanUp([frame, estimator])

    # Any positive entry recorded in the global collection fails the run.
    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
    def test_PCA_grid_search_over_params(self):
        """
        Run a PCA grid search over self.final_hyper_params and cross-check it against manually built models.

        Steps performed:
        a. Build H2O PCA models using grid search and record how many models were built in
           self.correct_model_number.  That count is checked against self.possible_number_models.
        b. For each model built by grid search, extract the hyper-parameters used for it and manually build
           an H2O PCA model with them.  A metric is summed from the first row of each model's
           "model_summary" table; if the two metrics differ by more than self.allowed_diff (relative),
           a warning is printed but the test is not failed.
        c. The total time taken by the manually built models is compared against a limit derived from
           max_runtime_secs (padded by self.extra_time_fraction).  Exceeding it increments self.test_failed.

        Any exception raised along the way is caught; it increments self.test_failed only when
        self.possible_number_models > 0.

        :return: None
        """
        print("*******************************************************************************************")
        print("test_PCA_grid_search_over_params for PCA ")
        h2o.cluster_info()

        try:
            print("Hyper-parameters used here is {0}".format(self.final_hyper_params))

            # start grid search
            grid_model = H2OGridSearch(H2OPCA(pca_method=self.pca_method),
                                       hyper_params=self.final_hyper_params)
            grid_model.train(x=self.x_indices, training_frame=self.training1_data)

            self.correct_model_number = len(grid_model)     # store number of models built

            # make sure the correct number of models are built by gridsearch
            # NOTE(review): this condition only triggers when MORE models were built than expected;
            # building fewer models than self.possible_number_models falls through to the else branch.
            # Confirm that this one-sided check is intended.
            if (self.correct_model_number - self.possible_number_models)>0.9:  # wrong grid model number
                self.test_failed += 1
                print("test_PCA_grid_search_over_params for PCA failed: number of models built by gridsearch: {0} "
                      "does not equal to all possible combinations of hyper-parameters: "
                      "{1}".format(self.correct_model_number, self.possible_number_models))
            else:
                # add parameters into params_dict.  Use this to manually build model
                params_dict = dict()
                params_dict["pca_method"] = self.pca_method
                total_run_time_limits = 0.0   # calculate upper bound of max_runtime_secs
                true_run_time_limits = 0.0    # plain sum of the max_runtime_secs values requested
                manual_run_runtime = 0.0      # accumulated build time of the manually built models

                # compare performance metric of model built by gridsearch with manually built model
                for each_model in grid_model:

                    # hyper-parameters of this grid model, plus the fixed pca_method
                    params_list = grid_model.get_hyperparams_dict(each_model._id)
                    params_list.update(params_dict)

                    model_params = dict()

                    # need to taken out max_runtime_secs from model parameters, it is now set in .train()
                    if "max_runtime_secs" in params_list:
                        model_params["max_runtime_secs"] = params_list["max_runtime_secs"]
                        max_runtime = params_list["max_runtime_secs"]
                        del params_list["max_runtime_secs"]
                    else:
                        max_runtime = 0

                    # make sure manual model was provided the same max_runtime_secs as the grid model
                    # NOTE(review): each_model_runtime is assigned here but never read afterwards.
                    each_model_runtime = pyunit_utils.find_grid_runtime([each_model])

                    manual_model = H2OPCA(**params_list)
                    manual_model.train(x=self.x_indices, training_frame=self.training1_data,
                                       **model_params)

                    # collect the time taken to manually built all models
                    model_runtime = pyunit_utils.find_grid_runtime([manual_model])  # time taken to build this model
                    manual_run_runtime += model_runtime

                    if max_runtime > 0:
                        # shortest possible time it takes to build this model
                        # (when the limit is below the baseline model time, the measured time is credited instead)
                        if max_runtime < self.model_run_time:
                            total_run_time_limits += model_runtime
                        else:
                            total_run_time_limits += max_runtime

                    true_run_time_limits += max_runtime

                    # compute and compare test metrics between the two models
                    # NOTE(review): the slice 1:k covers indices 1..k-1 of the first summary row, i.e. k-1
                    # cells after the leading one.  The same slice is used for both models, so the comparison
                    # is consistent, but confirm whether the k-th value was meant to be included.
                    grid_model_metrics = \
                        sum(each_model._model_json["output"]["model_summary"].cell_values[0][1:params_list["k"]])
                    manual_model_metrics = \
                        sum(manual_model._model_json["output"]["model_summary"].cell_values[0][1:params_list["k"]])

                    # just compare the summed metric in this case within tolerance:
                    if not((type(grid_model_metrics) == str) or (type(manual_model_metrics) == str)):
                        if (abs(grid_model_metrics) > 0) and \
                                (abs(grid_model_metrics - manual_model_metrics)/grid_model_metrics > self.allowed_diff):
                            print("test_PCA_grid_search_over_params for PCA warning: grid search model mdetric ({0}) "
                                  "and manually built H2O model metric ({1}) differ too much"
                                  "!".format(grid_model_metrics, manual_model_metrics))

                # allowed time is the larger of the two accumulated limits, padded by extra_time_fraction
                total_run_time_limits = max(total_run_time_limits, true_run_time_limits) * (1+self.extra_time_fraction)

                # make sure the max_runtime_secs is working to restrict model built time
                if not(manual_run_runtime <= total_run_time_limits):
                    self.test_failed += 1
                    print("test_PCA_grid_search_over_params for PCA failed: time taken to manually build models is {0}."
                          "  Maximum allowed time is {1}".format(manual_run_runtime, total_run_time_limits))
                else:
                    print("time taken to manually build all models is {0}. Maximum allowed time is "
                          "{1}".format(manual_run_runtime, total_run_time_limits))

                if self.test_failed == 0:
                    print("test_PCA_grid_search_over_params for PCA has passed!")
        except Exception as e:
            # An exception only counts as a failure when at least one model was expected to be built;
            # otherwise it is swallowed silently.
            if self.possible_number_models > 0:
                print("test_PCA_grid_search_over_params for PCA failed: exception ({0}) was thrown for no reason.".format(e))
                self.test_failed += 1
    def setup_model(self):
        """
        Prepare the grid-search hyper-parameters that the PCA tests use later on.

        1. Train a bare-bones PCA model to obtain its parameter list and a baseline training time.
        2. Collect the griddable parameters, their types and their defaults from that model.
        3. Generate hyper-parameter values with pyunit_utils.gen_grid_search, skipping the names listed in
           self.exclude_parameter_lists (parameters reported as griddable that had to be excluded manually).
        4. Scale max_runtime_secs by the baseline time and max_iterations by self.max_iter_scale.
        5. Reduce to self.final_hyper_params, count the possible models, force max_runtime_secs and k into
           the final set when they were generated, and write the result out as JSON.

        :return: None
        """
        # Bare-bones model: source of the parameter list and of the baseline run time.
        base_model = H2OPCA(k=10, transform="NONE", pca_method=self.pca_method)
        base_model.train(x=self.x_indices, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([base_model])
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        # Griddable parameters together with their types and defaults.
        gridable_info = pyunit_utils.get_gridables(base_model._model_json["parameters"])
        self.gridable_parameters, self.gridable_types, self.gridable_defaults = gridable_info

        # Randomly generate griddable values, including ones outside the legal range.
        # The integer count is drawn before the real count, as in the generator call's argument order.
        all_param_names = base_model.full_parameters.keys()
        int_value_count = random.randint(1, self.max_int_number)
        real_value_count = random.randint(1, self.max_real_number)
        generated = pyunit_utils.gen_grid_search(all_param_names, self.hyper_params,
                                                 self.exclude_parameter_lists,
                                                 self.gridable_parameters, self.gridable_types,
                                                 self.gridable_defaults,
                                                 int_value_count,
                                                 self.max_int_val, self.min_int_val,
                                                 real_value_count,
                                                 self.max_real_val, self.min_real_val)
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = generated

        # Express max_runtime_secs relative to the baseline training time.
        runtime_factor = self.time_scale * self.model_run_time
        if "max_runtime_secs" in self.hyper_params:
            self.hyper_params["max_runtime_secs"] = [
                runtime_factor * secs for secs in self.hyper_params["max_runtime_secs"]
            ]

        if 'max_iterations' in self.hyper_params:
            self.hyper_params['max_iterations'] = [
                self.max_iter_scale * iters for iters in self.hyper_params['max_iterations']
            ]

        # final_hyper_params holds only a subset of the griddable parameters, while hyper_params
        # keeps all of them; the model count refers to the final subset.
        counted = pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one,
                                                      self.params_more_than_zero,
                                                      self.params_more_than_one,
                                                      self.params_zero_positive,
                                                      self.max_grid_model)
        self.possible_number_models, self.final_hyper_params = counted

        # max_runtime_secs must be part of the final set so the unit test run time stays restricted.
        runtime_generated = "max_runtime_secs" in self.hyper_params
        if runtime_generated and "max_runtime_secs" not in self.final_hyper_params:
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            valid_runtimes = sum(1 for secs in self.hyper_params["max_runtime_secs"] if secs >= 0)
            self.possible_number_models = self.possible_number_models * valid_runtimes

        # k must be part of the final set as well.
        k_generated = 'k' in self.hyper_params
        if k_generated and 'k' not in self.final_hyper_params:
            self.final_hyper_params["k"] = self.hyper_params["k"]
            valid_ks = sum(1 for k_value in self.hyper_params["k"] if k_value > 0)
            self.possible_number_models = self.possible_number_models * valid_ks

        # Persist the hyper-parameters that will be used as a JSON file.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
def pca_scoring_history_importance():
    """
    Check that PCA returns the scoring history and the importance table for every PCA mode.

    A reference model is built without an explicit pca_method (called GramSVD throughout this test).  Models
    built with pca_method set to Randomized, Power and GLRM are then compared against it: the importance
    table and the eigenvectors must agree within the tolerance chosen for each method.  Finally the
    scoring history of all four models must have non-empty cell_values.  The transform type is drawn at
    random on every run so that the different transforms get exercised as well.
    """
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    print("Importing australia.csv data...\n")
    australia = h2o.upload_file(pyunit_utils.locate("smalldata/extdata/australia.csv"))
    col_indices = list(range(0, australia.ncol))

    print("transform is {0}.\n".format(transformN))
    # checking out PCA with GramSVD
    print("@@@@@@  Building PCA with GramSVD...\n")
    gramSVD = H2OPCA(k=3, transform=transformN)
    gramSVD.train(x=col_indices, training_frame=australia)

    # check PCA with PCA set to Randomized
    print("@@@@@@  Building PCA with Randomized...\n")
    randomizedPCA = H2OPCA(k=3, transform=transformN, pca_method="Randomized", compute_metrics=True,
                           use_all_factor_levels=True)
    randomizedPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             randomizedPCA._model_json["output"]["importance"],
                                             ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                             tolerance=1e-3)
    print("@@@@@@  Comparing eigenvectors between GramSVD and Randomized...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             randomizedPCA._model_json["output"]["eigenvectors"],
                                             randomizedPCA._model_json["output"]["names"], tolerance=5e-2,
                                             check_sign=True)

    # check PCA with PCA set to Power
    print("@@@@@@  Building PCA with Power...\n")
    powerPCA = H2OPCA(k=3, transform=transformN, pca_method="Power", compute_metrics=True, use_all_factor_levels=True)
    powerPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD (helper's default tolerance is used here)
    print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             powerPCA._model_json["output"]["importance"],
                                             ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"])
    print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["names"], tolerance=1e-5, check_sign=True)

    # check PCA with PCA set to GLRM
    print("@@@@@@  Building PCA with GLRM...\n")
    glrmPCA = H2OPCA(k=3, transform=transformN, pca_method="GLRM", compute_metrics=True, use_all_factor_levels=True)
    glrmPCA.train(x=col_indices, training_frame=australia)

    # compare singular values and stuff with GramSVD
    print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             glrmPCA._model_json["output"]["importance"],
                                             ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                             tolerance=2e-2)
    print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
    # compare singular vectors
    pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             glrmPCA._model_json["output"]["eigenvectors"],
                                             glrmPCA._model_json["output"]["names"], tolerance=2e-1, check_sign=True)

    # make sure we find the scoring history and it is not empty for all the PCA modes
    # just check and make sure the cell_values exceed 0; the message names the mode that failed
    models_by_method = [("GramSVD", gramSVD), ("Power", powerPCA), ("Randomized", randomizedPCA), ("GLRM", glrmPCA)]
    for method_name, pca_model in models_by_method:
        assert len(pca_model._model_json["output"]["scoring_history"].cell_values) > 0, \
            "PCA Scoring history setting pca_method to {0} is empty.".format(method_name)
def algo_max_runtime_secs():
    """
    Verify that column names and column types are returned in the model output.

    One estimator per algorithm (GLM, GLRM, NaiveBayes, deep learning, GBM, random forest, PCA, k-means
    and word2vec) is handed to checkColumnNamesTypesReturned together with its training frame, its
    predictors and, for the supervised algorithms, its response column.  See PUBDEV-5801.
    """
    seed = 12345

    # GLM
    print("Checking GLM.....")
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    estimator = H2OGeneralizedLinearEstimator(
        family="binomial", alpha=1.0, lambda_search=False, max_iterations=2, seed=seed)
    checkColumnNamesTypesReturned(frame,
                                  estimator,
                                  ["displacement", "power", "weight", "acceleration", "year"],
                                  y_index="economy_20mpg")

    # GLRM, all columns are used
    print("Checking GLRM.....")
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    glrm = H2OGeneralizedLowRankEstimator(
        k=3, loss="Quadratic", gamma_x=0.5, gamma_y=0.5, transform="STANDARDIZE")
    checkColumnNamesTypesReturned(iris, glrm, iris.names)

    # naive bayes on the same iris frame, the last column is the response
    print("Checking NaiveBayes......")
    estimator = H2ONaiveBayesEstimator(laplace=0.25)
    predictors = iris.names
    response = predictors.pop()  # takes the last name out of the predictor list
    checkColumnNamesTypesReturned(iris, estimator, predictors, y_index=response)

    # deeplearning
    print("Checking deeplearning.....")
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/gaussian_training1_set.csv"))
    predictors = frame.names
    response = predictors.pop()
    estimator = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    checkColumnNamesTypesReturned(frame, estimator, predictors, y_index=response)

    # stack ensemble, stacking part is not iterative
    print("******************** Skip testing stack ensemble.  Test done in pyunit_stackedensemble_regression.py.")

    # GBM run, the response is rounded and turned into a factor for the multinomial distribution
    print("Checking GBM.....")
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/multinomial_training1_set.csv"))
    predictors = frame.names
    response = predictors.pop()
    frame[response] = frame[response].round().asfactor()
    estimator = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    checkColumnNamesTypesReturned(frame, estimator, predictors, y_index=response)

    # random forest, reuses the frame, predictors and response prepared for GBM
    print("Checking Random Forest.....")
    estimator = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    checkColumnNamesTypesReturned(frame, estimator, predictors, y_index=response)

    # PCA
    print("Checking PCA.....")
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    predictors = frame.names
    estimator = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True)
    checkColumnNamesTypesReturned(frame, estimator, predictors)

    # kmeans
    print("Checking kmeans....")
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    predictors = frame.names
    estimator = H2OKMeansEstimator(k=10)
    checkColumnNamesTypesReturned(frame, estimator, predictors)

    # word2vec
    print("Checking word2vec....")
    text8 = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])
    # NOTE(review): this slice is never passed on, the check below receives the whole frame - confirm intended
    text8_head = text8[0:170000, 0]
    word2vec = H2OWord2vecEstimator()
    checkColumnNamesTypesReturned(text8, word2vec, [], 0)
Esempio n. 19
0
def pca_wideDataset_rotterdam():
    """
    Compare PCA results on the Rotterdam dataset between a reference model and one alternative.

    The reference model is an H2OPCA built without an explicit pca_method (called GramSVD throughout this
    test).  Exactly one of three experiments is picked at random on each run and compared against it:
    GLRM (H2OGeneralizedLowRankEstimator with recover_svd=True), PCA with pca_method="Power", or PCA with
    pca_method="Randomized".  For the chosen experiment the importance table and the eigenvectors are
    compared with check_all=False.  The transform type is drawn at random as well.  All H2O objects are
    removed before and after the test.
    """
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = {"relapse"}
    # every column except the response is used as a predictor
    x = list(set(rotterdamH2O.names) - y)

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("transform used on dataset is {0}.\n".format(transformN))
    # exactly one entry is switched on, so only one experiment runs
    buildModel = [False, False, False]
    buildModel[randint(0, len(buildModel) - 1)] = True

    expNum = 0
    if (buildModel[expNum]):
        # special test with GLRM.  Need use_all_levels to be true
        print("------  Testing GLRM PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345,
                         use_all_factor_levels=True)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        glrmPCA = H2OGeneralizedLowRankEstimator(k=8,
                                                 transform=transformN,
                                                 seed=12345,
                                                 init="Random",
                                                 max_iterations=10,
                                                 recover_svd=True,
                                                 regularization_x="None",
                                                 regularization_y="None")
        glrmPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            glrmPCA._model_json["output"]["importance"], [
                "Standard deviation", "Cumulative Proportion",
                "Cumulative Proportion"
            ],
            tolerance=1,
            check_all=False)

        # compare singular vectors
        print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            glrmPCA._model_json["output"]["eigenvectors"],
            glrmPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
        h2o.remove(gramSVD)
        h2o.remove(glrmPCA)

    expNum = expNum + 1
    if (buildModel[expNum]):
        print("------  Testing Power PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        powerPCA = H2OPCA(k=8,
                          impute_missing=True,
                          transform=transformN,
                          pca_method="Power",
                          seed=12345)  # power
        powerPCA.train(x=x, training_frame=rotterdamH2O)
        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            powerPCA._model_json["output"]["importance"], [
                "Standard deviation", "Cumulative Proportion",
                "Cumulative Proportion"
            ],
            tolerance=1e-6,
            check_all=False)
        print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
        # compare singular vectors
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            powerPCA._model_json["output"]["eigenvectors"],
            powerPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)

    expNum = expNum + 1
    if (buildModel[expNum]):
        print("------  Testing Randomized PCA --------")
        gramSVD = H2OPCA(k=8,
                         impute_missing=True,
                         transform=transformN,
                         seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        randomizedPCA = H2OPCA(k=8,
                               impute_missing=True,
                               transform=transformN,
                               pca_method="Randomized",
                               seed=12345,
                               max_iterations=5)  # randomized
        randomizedPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print(
            "@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n"
        )
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["importance"],
            randomizedPCA._model_json["output"]["importance"], [
                "Standard deviation", "Cumulative Proportion",
                "Cumulative Proportion"
            ],
            tolerance=1e-1,
            check_all=False)

        print(
            "@@@@@@  Comparing eigenvectors between GramSVD and Randomized...\n"
        )
        # compare singular vectors
        pyunit_utils.assert_H2OTwoDimTable_equal(
            gramSVD._model_json["output"]["eigenvectors"],
            randomizedPCA._model_json["output"]["eigenvectors"],
            randomizedPCA._model_json["output"]["names"],
            tolerance=1e-6,
            check_sign=True,
            check_all=False)
    h2o.remove_all()
Esempio n. 20
0
def algo_max_runtime_secs():
    """
    Exercise max_runtime_secs for the iterative H2O algorithms.  See PUBDEV-4702.

    Each estimator (GLRM, deep learning, GBM, GLM, random forest, deep water when available, PCA,
    k-means and word2vec) is passed to grabRuntimeInfo with an error bound derived from the module-level
    err_bound and a second numeric argument (2.0 everywhere except 1.2 for PCA).  Frames and models are
    released through cleanUp once they are no longer needed.  The process exits with status 1 when
    sum(model_within_max_runtime) is positive once all algorithms have been run.
    """
    global model_within_max_runtime, err_bound
    seed = 12345

    # GLRM, do not make sense to stop in the middle of an iteration
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    predictors = list(range(frame.ncol))
    estimator = H2OGeneralizedLowRankEstimator(
        k=10, loss="Quadratic", gamma_x=0.3, gamma_y=0.3, transform="STANDARDIZE")
    grabRuntimeInfo(err_bound, 2.0, estimator, frame, predictors)
    cleanUp([frame, estimator])

    # deeplearning, the last column is the response
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/gaussian_training1_set.csv"))
    response = frame.ncol - 1
    predictors = list(range(response))
    estimator = H2ODeepLearningEstimator(distribution='gaussian', seed=seed, hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, estimator, frame, predictors, response)
    cleanUp([frame, estimator])

    # stack ensemble, stacking part is not iterative
    print("******************** Skip testing stack ensemble.  Not an iterative algo.")

    # GBM run, the frame is kept alive because GLM and random forest reuse it
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/multinomial_training1_set.csv"))
    response = frame.ncol - 1
    predictors = list(range(response))
    frame[response] = frame[response].round().asfactor()
    estimator = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, estimator, frame, predictors, response)
    cleanUp([estimator])

    # GLM run
    estimator = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, estimator, frame, predictors, response)
    cleanUp([estimator])

    # naivebayes, not iterative
    print("******************** Skip testing Naives Bayes.  Not an iterative algo.")

    # random forest
    # NOTE(review): unlike GBM and GLM no response index is passed here - confirm against grabRuntimeInfo
    estimator = H2ORandomForestEstimator(ntrees=100, score_tree_interval=0)
    grabRuntimeInfo(err_bound, 2.0, estimator, frame, predictors)
    cleanUp([estimator, frame])

    # deepwater, only when the backend is available
    if H2ODeepWaterEstimator.available():
        frame = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
        frame = frame.drop('Site')
        frame['Angaus'] = frame['Angaus'].asfactor()
        response = "Angaus"
        predictors = list(range(1, frame.ncol))
        estimator = H2ODeepWaterEstimator(
            epochs=50, hidden=[4096, 4096, 4096], hidden_dropout_ratios=[0.2, 0.2, 0.2])
        grabRuntimeInfo(err_bound, 2.0, estimator, frame, predictors, response)
        cleanUp([frame, estimator])

    # PCA, uses a five times larger error bound and 1.2 as the second argument
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    predictors = list(range(frame.ncol))
    estimator = H2OPCA(k=10, transform="STANDARDIZE", pca_method="Power", compute_metrics=True)
    grabRuntimeInfo(err_bound * 5, 1.2, estimator, frame, predictors)
    cleanUp([frame, estimator])

    # kmeans, uses a doubled error bound
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    predictors = list(range(frame.ncol))
    estimator = H2OKMeansEstimator(k=10)
    grabRuntimeInfo(err_bound * 2, 2.0, estimator, frame, predictors)
    cleanUp([frame, estimator])

    # word2vec, trained on the first 170000 rows of the first column only
    text8 = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"), header=1, col_types=["string"])
    text8_head = text8[0:170000, 0]
    word2vec = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, word2vec, text8_head, [], 0)
    cleanUp([text8, text8_head, word2vec])

    # presumably one entry per algorithm that broke its limit - verify against grabRuntimeInfo
    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
Esempio n. 21
0
import h2o
import pandas
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA

if __name__ == '__main__':
    # h2o.import_file needs a live cluster connection; connect to a local H2O instance or start one.
    h2o.init()

    df = h2o.import_file("complete_df.csv")

    cols = df.columns

    # Train with the Power pca_method, keeping a single principal component
    pca = H2OPCA(k=1,
                 transform="STANDARDIZE",
                 pca_method="Power",
                 use_all_factor_levels=True,
                 impute_missing=True)
    # the first column is left out of the predictors
    pca.train(x=cols[1:], training_frame=df)

    # View the importance of components
    print(pca.varimp(use_pandas=False))

    # View the eigenvectors, sorted by the 'pc1' column in descending order
    eg = pca.rotation().as_data_frame()
    print(eg.sort_values('pc1', ascending=False))

# # Train again with the GLRM pca_method
# birds2 = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/pca_test/birds.csv")
# birds2.pca = H2OPrincipalComponentAnalysisEstimator(k = 3, transform = "STANDARDIZE",
#                     pca_method="GLRM", use_all_factor_levels=True,
#                     impute_missing=True)
# birds2.pca.train(x=list(range(4)), training_frame=birds2)
def pca_max_k():
    """
    Check PCA built with k=-1 for the different pca_method settings.

    k=-1 presumably asks H2O to pick the largest usable number of components - confirm against the H2O PCA
    documentation.  Exactly one of four scenarios is picked at random on each run:

    1. GramSVD versus Power on the Rotterdam dataset: the importance tables are compared and both must report
       as many eigenvalues as the actual value of k stored in the Power model.
    2. Randomized versus Power on prostate_cat.csv: the importance tables are compared.
    3. Same as 2 but with use_all_factor_levels=True (rank deficient dataset).
    4. GLRM on prostate_cat.csv: the number of reported eigenvalues must match the actual value of k.
    """
    data = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(data.names) - y)

    # exactly one entry is switched on, so only one scenario runs
    buildModel = [False, False, False, False]
    buildModel[randint(0, len(buildModel)-1)] = True

    # test 1
    if buildModel[0]:
        pcaGramSVD = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GramSVD", impute_missing=True,
                            max_iterations=100)
        pcaGramSVD.train(x, training_frame=data)
        pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", impute_missing=True, max_iterations=100,
                          seed=12345)
        pcaPower.train(x, training_frame=data)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaGramSVD._model_json["output"]["importance"],
                                                 pcaPower._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion",
                                                  "Cumulative Proportion"],
                                                 tolerance=1)

        correctEigNum = pcaPower.full_parameters["k"]["actual_value"]
        # the first cell of a row is not an eigenvalue column, hence the -1
        gramSVDNum = len(pcaGramSVD._model_json["output"]["importance"].cell_values[0]) - 1
        powerNum = len(pcaPower._model_json["output"]["importance"].cell_values[0]) - 1
        # the counts are numbers, so they are formatted instead of concatenated to the message
        assert correctEigNum == gramSVDNum, \
            "PCA GramSVD FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, gramSVDNum)
        assert correctEigNum == powerNum, \
            "PCA Power FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, powerNum)

    # Randomized and GLRM does not have wide dataset implementation.  Check with smaller datasets
    # test 2
    data = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    x = list(set(data.names))
    if buildModel[1]:
        pcaRandomized = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized",
                               impute_missing=True, max_iterations=100, seed=12345)
        pcaRandomized.train(x, training_frame=data)

        pcaPower = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power",
                          impute_missing=True, max_iterations=100, seed=12345)
        pcaPower.train(x, training_frame=data)
        # eigenvalues between the PCA and Randomize should be close, I hope...
        print("@@@@@@  Comparing eigenvalues between Randomized and Power PCA...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomized._model_json["output"]["importance"],
                                                 pcaPower._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion",
                                                  "Cumulative Proportion"])

    # test 3
    if buildModel[2]:
        # should still work with rank deficient dataset
        pcaRandomizedF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Randomized", use_all_factor_levels=True,
                                impute_missing=True, max_iterations=100, seed=12345)
        pcaRandomizedF.train(x, training_frame=data)
        # should still work with rank deficient dataset
        pcaPowerF = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="Power", use_all_factor_levels=True,
                           impute_missing=True, max_iterations=100, seed=12345)
        pcaPowerF.train(x, training_frame=data)

        # eigenvalues between the PCA and Randomize should be close with rank deficient dataset, I hope...
        print("@@@@@@  Comparing eigenvalues between Randomized and Power PCA with rank deficient dataset...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(pcaRandomizedF._model_json["output"]["importance"],
                                                 pcaPowerF._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion",
                                                  "Cumulative Proportion"])

    # test 4
    if buildModel[3]:
        pcaGLRM = H2OPCA(k=-1, transform="STANDARDIZE", pca_method="GLRM", use_all_factor_levels=True,
                         max_iterations=100, seed=12345)
        pcaGLRM.train(x, training_frame=data)
        correctEigNum = pcaGLRM.full_parameters["k"]["actual_value"]
        glrmNum = len(pcaGLRM._model_json["output"]["importance"].cell_values[0]) - 1
        assert correctEigNum == glrmNum, \
            "PCA GLRM FAIL: expected number of eigenvalues: {0}, actual: {1}.".format(correctEigNum, glrmNum)