# Modules referenced throughout these tests: the h2o client, the shared
# `tests` helper module, numpy for the MLlib comparison, and os for the
# temporary CSV round-trip.
import os

import numpy as np

import h2o
import tests


def hdfs_kmeans_airlines():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_file = "/datasets/airlines_all.csv"

        print "Import airlines_all.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print "rows: {0}".format(n)

        print "Run k-means++ with k = 7 and max_iterations = 10"
        myX = range(8) + range(11, 16) + range(18, 21) + range(24, 29) + [9]
        airlines_km = h2o.kmeans(training_frame=airlines_h2o,
                                 x=airlines_h2o[myX],
                                 k=7,
                                 init="Furthest",
                                 max_iterations=10,
                                 standardize=True)
        print airlines_km
    else:
        print "Not running on H2O internal network. No access to HDFS."
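# Not part of the original tests: a minimal sketch of the URL pattern every
# test in this file relies on. hdfs_name_node is expected to look like
# "host:port" (or a bare "host" for the default namenode port) and the path
# must be absolute, so plain concatenation yields a valid HDFS URL. The
# helper name and the example host below are illustrative only.
def hdfs_url(name_node, path):
    # e.g. hdfs_url("namenode.example.com:8020", "/datasets/iris.csv")
    #   -> "hdfs://namenode.example.com:8020/datasets/iris.csv"
    return "hdfs://{0}{1}".format(name_node, path)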
def directory_import():
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        url1 = "hdfs://{0}{1}".format(hdfs_name_node,
                                      "/datasets/iris/identical_iris_files/iris1.csv")
        url2 = "hdfs://{0}{1}".format(hdfs_name_node,
                                      "/datasets/iris/identical_iris_files/")
        print "Importing HDFS file {0} and directory {1}".format(url1, url2)
        frm_one = h2o.import_file(url1)
        frm_all = h2o.import_file(url2)

        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim

        # The directory holds three identical copies of iris1.csv, so the
        # directory import should stack them into one frame.
        assert r1 * 3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1, ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1, ca)

        # Repeat the same check against the local smalldata copy.
        small1 = h2o.locate("smalldata/jira/identical_files/iris1.csv")
        small2 = small1.split("iris1.csv")[0]  # containing directory

        print "Importing smalldata file {0} and directory {1}".format(small1, small2)
        frm_one = h2o.import_file(small1)
        frm_all = h2o.import_file(small2)

        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim

        assert r1 * 3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1, ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1, ca)
def pubdev_1431(ip, port):
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
        airlines_billion_1 = h2o.import_file(url)
        airlines_billion_1[30] = airlines_billion_1[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion_1[0:30],
                      y=airlines_billion_1[30],
                      ntrees=1,
                      distribution="bernoulli",
                      max_depth=1)
        predictions = gbm.predict(airlines_billion_1)

        # Round-trip the predictions through a local CSV file and confirm the
        # re-imported frame has the same shape as the predictions frame.
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        airlines_billion_2 = h2o.import_file(csv)
        os.remove(csv)

        r1, c1 = predictions.dim
        r2, c2 = airlines_billion_2.dim
        assert r1 == r2 and c1 == c2, \
            "Expect rows to be equal. r1: {0} and r2: {1}. Expect cols to be equal c1: {2} " \
            "c2: {3}".format(r1, r2, c1, c2)
    else:
        print "Not running on H2O internal network. No access to HDFS."
def hdfs_basic():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_iris_dir = "/datasets/runit/iris_test_train"

        #----------------------------------------------------------------------
        # Single file cases.
        #----------------------------------------------------------------------
        print "Testing single file importHDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        iris_h2o.head()
        iris_h2o.tail()
        n = iris_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
        assert isinstance(iris_h2o, h2o.H2OFrame), \
            "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_h2o))
        print "Import worked"

        #----------------------------------------------------------------------
        # Directory file cases.
        #----------------------------------------------------------------------
        print "Testing directory importHDFS"
        urls = ["hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node, hdfs_iris_dir),
                "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node, hdfs_iris_dir)]
        iris_dir_h2o = h2o.import_file(urls)
        iris_dir_h2o.head()
        iris_dir_h2o.tail()
        n = iris_dir_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
        assert isinstance(iris_dir_h2o, h2o.H2OFrame), \
            "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_dir_h2o))
        print "Import worked"
    else:
        print "Not running on H2O internal network. No access to HDFS."
def pubdev_1421():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_airlines_test_file = "/datasets/airlines.test.csv"

        # The import itself is the regression test: completing without raising
        # is the pass condition, so no further checks are performed.
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_test_file)
        air_test = h2o.import_file(url)
def kmeans_mllib(ip, port):
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print "Import BigCross.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        # Reference within-cluster SSE values computed by Spark MLlib.
        err_mllib = np.genfromtxt(h2o.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
                                  delimiter=",", skip_header=1)
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print "Run k-means++ with k = {0} and max_iterations = 10".format(k)
            cross_km = h2o.kmeans(training_frame=cross_h2o,
                                  x=cross_h2o,
                                  k=k,
                                  init="PlusPlus",
                                  max_iterations=10,
                                  standardize=False)

            clust_mllib = np.genfromtxt(h2o.locate("smalldata/mllib_bench/bigcross_centers_" +
                                                   str(k) + ".csv"),
                                        delimiter=",").tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])
            print "\nMLlib Cluster Centers:\n"
            print clust_mllib
            print "\nH2O Cluster Centers:\n"
            print clust_h2o

            wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
            wcsse_h2o = cross_km.tot_withinss() / n
            print "\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib)
            print "H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o)
            assert wcsse_h2o == wcsse_mllib, \
                "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                "got {1}".format(wcsse_mllib, wcsse_h2o)
def hdfs_kmeans_converge():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print "Import BigCross.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        print "rows: {0}".format(n)

        ncent = 3
        miters = 10
        print "Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters)
        cross1_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               max_iterations=miters)
        print cross1_km

        print "Run k-means with init = final cluster centers and max_iterations = 1"
        init_centers = h2o.H2OFrame(cross1_km.centers())
        init_centers_key = init_centers.send_frame()
        cross2_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               user_points=init_centers_key,
                               max_iterations=1)
        print cross2_km

        print "Check k-means converged or maximum iterations reached"
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = ((c1 - c2) ** 2).sum() / ncent
        iters = cross1_km._model_json['output']['model_summary'].cell_values[0][3]
        assert avg_change < 1e-6 or iters > miters, \
            "Expected k-means to converge or reach max iterations. avg_change = " \
            "{0} and iterations = {1}".format(avg_change, iters)
    else:
        print "Not running on H2O internal network. No access to HDFS."
def hdfs_kmeans(ip, port):
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_covtype_file = "/datasets/runit/covtype.data"

        print "Import iris_wheader.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        n = iris_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

        print "Running KMeans on iris"
        iris_km = h2o.kmeans(training_frame=iris_h2o,
                             k=3,
                             x=iris_h2o[0:4],
                             max_iterations=10)
        print iris_km

        print "Importing covtype.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
        covtype_h2o = h2o.import_file(url)
        n = covtype_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

        print "Running KMeans on covtype"
        covtype_km = h2o.kmeans(training_frame=covtype_h2o,
                                x=covtype_h2o[0:55],
                                k=8,
                                max_iterations=10)
        print covtype_km
    else:
        print "Not running on H2O internal network. No access to HDFS."
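# Not part of the original file: a minimal standalone driver, assuming the
# `tests` helper module exposes a run_test(argv, test_func) entry point as in
# H2O's pyunit harness of this era. That helper name is an assumption; adapt
# it to the local test utilities if it differs. Swap in any of the test
# functions above.
if __name__ == "__main__":
    import sys
    tests.run_test(sys.argv, hdfs_basic)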