def pca_prostate(ip, port):
    """Fit PCA on the prostate data and compare projection/loading frame shapes.

    Connects to H2O, uploads ``prostate.csv``, converts four columns to
    factors, fits a 3-component model with ``h2o.prcomp`` and asserts that the
    frame returned by ``predict`` has the same row and column counts as the
    frame referenced by the model's ``loading_key`` output entry.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.

    Raises:
        AssertionError: If the two frames differ in row or column count.
    """
    h2o.init(ip, port)

    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column] = prostate[column].asfactor()
    prostate.describe()

    # Log the arguments that are really passed to prcomp below (the previous
    # message claimed transform = 'STANDARDIZE').
    print("PCA on columns 3 to 9 with k = 3, transform = 'NONE', "
          "pca_method = 'Power'")
    fitPCA = h2o.prcomp(x=prostate[2:9], k=3, transform="NONE",
                        pca_method="Power")
    pred1 = fitPCA.predict(prostate)
    # Look up the frame whose key the model stores under output/loading_key.
    pred2 = h2o.get_frame(fitPCA._model_json['output']['loading_key']['name'])

    print("Compare dimensions of projection and loading matrix")
    print("Projection matrix:\n")
    print(pred1.head())
    print("Loading matrix:\n")
    print(pred2.head())

    assert pred1.nrow() == pred2.nrow(), \
        "Expected same number of rows, but got {0} and {1}".format(
            pred1.nrow(), pred2.nrow())
    # This check is about columns; the message used to say "rows".
    assert pred1.ncol() == pred2.ncol(), \
        "Expected same number of columns, but got {0} and {1}".format(
            pred1.ncol(), pred2.ncol())
def screeplot_test():
    """Fit a 4-component PCA on AustraliaCoast.csv and draw both scree plots.

    The first eight columns are used with ``transform="STANDARDIZE"``; the
    resulting model's ``screeplot`` is called once per plot type.
    """
    # NOTE(review): server=True is forwarded to screeplot as-is; presumably it
    # selects a non-interactive plotting mode -- confirm against the h2o API.
    plot_options = {'server': True}

    data_path = pyunit_utils.locate("smalldata/pca_test/AustraliaCoast.csv")
    coast = h2o.upload_file(data_path)
    model = h2o.prcomp(x=coast[0:8], k=4, transform="STANDARDIZE")

    for plot_type in ("barplot", "lines"):
        model.screeplot(type=plot_type, **plot_options)
def screeplot_test(ip, port):
    """Connect to H2O, fit a 4-component PCA and draw both scree plot types.

    Args:
        ip: Host passed to ``h2o.init``.
        port: Port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    # NOTE(review): server=True is forwarded to screeplot as-is; presumably it
    # selects a non-interactive plotting mode -- confirm against the h2o API.
    plot_options = {'server': True}

    data_path = h2o.locate("smalldata/pca_test/AustraliaCoast.csv")
    coast = h2o.upload_file(data_path)
    model = h2o.prcomp(x=coast[0:8], k=4, transform="STANDARDIZE")

    for plot_type in ("barplot", "lines"):
        model.screeplot(type=plot_type, **plot_options)
def screeplot_test(ip, port):
    """Fit a 4-component PCA on AustraliaCoast.csv and draw both scree plots.

    Args:
        ip: Not referenced in the body; kept for the caller's signature.
        port: Not referenced in the body; kept for the caller's signature.
    """
    # NOTE(review): server=True is forwarded to screeplot as-is; presumably it
    # selects a non-interactive plotting mode -- confirm against the h2o API.
    plot_options = {'server': True}

    data_path = h2o.locate("smalldata/pca_test/AustraliaCoast.csv")
    coast = h2o.upload_file(data_path)
    model = h2o.prcomp(x=coast[0:8], k=4, transform="STANDARDIZE")

    for plot_type in ("barplot", "lines"):
        model.screeplot(type=plot_type, **plot_options)
def pca_arrests(ip, port):
    """Fit PCA on USArrests.csv once for each component count from 1 to 4.

    Uploads the data set, prints its description, then calls ``h2o.prcomp``
    on the first four columns with ``k`` = 1, 2, 3 and 4 in turn.

    Args:
        ip: Not referenced in the body; kept for the caller's signature.
        port: Not referenced in the body; kept for the caller's signature.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    # Iterate over k directly so the logged dimension count matches the value
    # passed to prcomp (the old message printed k - 1).
    for k in range(1, 5):
        print("H2O PCA with {0} dimensions:\n".format(k))
        print("Using these columns: {0}".format(arrestsH2O.names))
        h2o.prcomp(x=arrestsH2O[0:4], k=k)
def pca_arrests(ip, port):
    """Fit PCA on USArrests.csv once for each component count from 1 to 4.

    Uploads the data set, prints its description, then calls ``h2o.prcomp``
    on the first four columns with ``k`` = 1, 2, 3 and 4 in turn.

    Args:
        ip: Not referenced in the body; kept for the caller's signature.
        port: Not referenced in the body; kept for the caller's signature.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    # Iterate over k directly so the logged dimension count matches the value
    # passed to prcomp (the old message printed k - 1).
    for k in range(1, 5):
        print("H2O PCA with {0} dimensions:\n".format(k))
        print("Using these columns: {0}".format(arrestsH2O.names()))
        h2o.prcomp(x=arrestsH2O[0:4], k=k)
def pca_scoring():
    """Fit a DEMEAN-transformed PCA on USArrests.csv and print its projection.

    A 4-component model is trained on the first four columns, then the same
    frame is passed to ``predict`` and the head of the result is printed.
    """
    print("Importing arrests.csv data...")
    data_path = h2o.locate("smalldata/pca_test/USArrests.csv")
    arrests = h2o.upload_file(data_path)

    print("Run PCA with transform = 'DEMEAN'")
    model = h2o.prcomp(x=arrests[0:4], k=4, transform="DEMEAN")
    # TODO: model.show()

    print("Project training data into eigenvector subspace")
    projection = model.predict(arrests)

    print("H2O Projection:")
    print(projection.head())
def pca_scoring():
    """Fit a DEMEAN-transformed PCA on USArrests.csv and score the same data.

    A 4-component model is trained on the first four columns, then the same
    frame is passed to ``predict`` and ``head()`` is called on the result.
    """
    print("Importing arrests.csv data...")
    data_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests = h2o.upload_file(data_path)

    print("Run PCA with transform = 'DEMEAN'")
    model = h2o.prcomp(x=arrests[0:4], k=4, transform="DEMEAN")
    # TODO: model.show()

    print("Project training data into eigenvector subspace")
    projection = model.predict(arrests)

    print("H2O Projection:")
    # NOTE(review): the return value is not printed here; presumably head()
    # displays the frame itself in this h2o version -- confirm.
    projection.head()
def pca_prostate():
    """Fit a 3-component PCA on the prostate data and print the projection.

    Uploads ``prostate.csv``, converts four columns to factors, prints the
    frame description, fits ``h2o.prcomp`` on columns ``[2:9]`` and prints the
    head of the frame returned by ``predict``.
    """
    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(tests.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column] = prostate[column].asfactor()
    prostate.describe()

    # Log the arguments that are really passed to prcomp below (the previous
    # message claimed transform = 'STANDARDIZE').
    print("PCA on columns 3 to 9 with k = 3, transform = 'NONE', "
          "pca_method = 'Power'")
    fitPCA = h2o.prcomp(x=prostate[2:9], k=3, transform="NONE",
                        pca_method="Power")
    pred = fitPCA.predict(prostate)

    print("Projection matrix:\n")
    print(pred.head())
def pca_prostate(ip, port):
    """Fit a 3-component PCA on the prostate data and print the projection.

    Uploads ``prostate.csv``, converts four columns to factors, prints the
    frame description, fits ``h2o.prcomp`` on columns ``[2:9]`` and prints the
    head of the frame returned by ``predict``.

    Args:
        ip: Not referenced in the body; kept for the caller's signature.
        port: Not referenced in the body; kept for the caller's signature.
    """
    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    for column in ("CAPSULE", "RACE", "DPROS", "DCAPS"):
        prostate[column] = prostate[column].asfactor()
    prostate.describe()

    # Log the arguments that are really passed to prcomp below (the previous
    # message claimed transform = 'STANDARDIZE').
    print("PCA on columns 3 to 9 with k = 3, transform = 'NONE', "
          "pca_method = 'Power'")
    fitPCA = h2o.prcomp(x=prostate[2:9], k=3, transform="NONE",
                        pca_method="Power")
    pred = fitPCA.predict(prostate)

    print("Projection matrix:\n")
    print(pred.head())