import os

import numpy as np

import h2o
import tests  # shared pyunit helpers: hadoop_namenode_is_accessible(), hadoop_namenode(), locate()


def directory_import():
    """Import a single iris file and the directory containing three identical
    copies of it; the directory frame should have 3x the rows and the same
    number of columns.
    """

    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        url1 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        url2 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/")
        print "Importing HDFS file {0} and directory {1}".format(url1, url2)
        frm_one = h2o.import_file(url1)
        frm_all = h2o.import_file(url2)

        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim

        assert r1*3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1,ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1,ca)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")

    small1 = tests.locate("smalldata/jira/identical_files/iris1.csv")
    small2 = small1.split("iris1.csv")[0]
    print "Importing smalldata file {0} and directory {1}".format(small1, small2)
    frm_one = h2o.import_file(small1)
    frm_all = h2o.import_file(small2)

    r1, c1 = frm_one.dim
    ra, ca = frm_all.dim

    assert r1*3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1,ra)
    assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1,ca)
def pubdev_1421():
    """Regression test for PUBDEV-1421: import a file from HDFS by its full
    hdfs:// URL.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        hdfs_airlines_test_file = "/datasets/airlines.test.csv"

        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_test_file)
        air_test = h2o.import_file(url)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
def pubdev_1431():
    """Regression test for PUBDEV-1431: train a single-tree bernoulli GBM on
    the billion-row airlines file from HDFS, predict on the same frame, and
    round-trip the predictions through a CSV download.
    """

    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1, distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(),"delete.csv")
        h2o.download_csv(predictions,csv)
        os.remove(csv)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
def hdfs_basic():
    """Import a single file and a list of files from HDFS and sanity-check the
    row counts and frame types of the results.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_iris_dir  = "/datasets/runit/iris_test_train"

        #----------------------------------------------------------------------
        # Single file cases.
        #----------------------------------------------------------------------

        print "Testing single file importHDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        iris_h2o.head()
        iris_h2o.tail()
        n = iris_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}, expected {1}.".format(n, 150)
        assert isinstance(iris_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_h2o))
        print "Import worked"

        #----------------------------------------------------------------------
        # Directory file cases.
        #----------------------------------------------------------------------

        print "Testing directory importHDFS"
        urls = ["hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node, hdfs_iris_dir),
                "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node, hdfs_iris_dir)]
        iris_dir_h2o = h2o.import_file(urls)
        iris_dir_h2o.head()
        iris_dir_h2o.tail()
        n = iris_dir_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}, expected {1}.".format(n, 150)
        assert isinstance(iris_dir_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".\
            format(type(iris_dir_h2o))
        print "Import worked"
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
def kmeans_mllib():
    """Compare H2O k-means++ (PlusPlus init, no standardization) against MLlib
    reference cluster centers and average within-cluster SSE on BigCross.data.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print "Import BigCross.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        err_mllib = np.genfromtxt(tests.locate("smalldata/mllib_bench/bigcross_wcsse.csv"), delimiter=",", skip_header=1)
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print "Run k-means++ with k = {0} and max_iterations = 10".format(k)
            cross_km = h2o.kmeans(training_frame = cross_h2o, x = cross_h2o, k = k, init = "PlusPlus",
                                  max_iterations = 10, standardize = False)

            clust_mllib = np.genfromtxt(tests.locate("smalldata/mllib_bench/bigcross_centers_" + str(k) + ".csv"),
                                        delimiter=",").tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print "\nMLlib Cluster Centers:\n"
            print clust_mllib
            print "\nH2O Cluster Centers:\n"
            print clust_h2o

            wcsse_mllib = err_mllib[ncent.index(k)][1]
            wcsse_h2o = cross_km.tot_withinss() / n
            print "\nMLlib Average Within-Cluster SSE: \n".format(wcsse_mllib)
            print "H2O Average Within-Cluster SSE: \n".format(wcsse_h2o)
            assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                             "got {1}".format(wcsse_mllib, wcsse_h2o)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
def hdfs_kmeans():
    """Run k-means (k = 3 on iris, k = 8 on covtype) on data imported from
    HDFS.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_covtype_file = "/datasets/runit/covtype.data"

        print "Import iris_wheader.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        n = iris_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}, expected {1}.".format(n, 150)

        print "Running KMeans on iris"
        iris_km = h2o.kmeans(training_frame = iris_h2o, k = 3, x = iris_h2o[0:4], max_iterations = 10)
        print iris_km

        print "Importing covtype.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
        covtype_h2o = h2o.import_file(url)
        n = covtype_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 581012, "Wrong number of rows. Got {0}, expected {1}.".format(n, 581012)

        print "Running KMeans on covtype"
        covtype_km = h2o.kmeans(training_frame = covtype_h2o, x = covtype_h2o[0:55], k = 8, max_iterations = 10)
        print covtype_km

    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
def hdfs_kmeans_airlines():
    """Run k-means with Furthest initialization (k = 7) on airlines_all.csv
    imported from HDFS.
    """

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        hdfs_file = "/datasets/airlines_all.csv"

        print "Import airlines_all.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print "rows: {0}".format(n)

        print "Run k-means++ with k = 7 and max_iterations = 10"
        myX = range(8) + range(11,16) + range(18,21) + range(24,29) + [9]
        airlines_km = h2o.kmeans(training_frame = airlines_h2o, x = airlines_h2o[myX], k = 7, init = "Furthest",
                                 max_iterations = 10, standardize = True)
        print airlines_km
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
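

# The functions above follow H2O's pyunit pattern and are normally invoked by
# the test harness. As a rough standalone sketch (assuming an H2O cluster is
# reachable with the default h2o.init() settings, and that the HDFS and
# smalldata paths used above exist), they can also be run directly:
if __name__ == "__main__":
    h2o.init()
    directory_import()
    pubdev_1421()
    pubdev_1431()
    hdfs_basic()
    kmeans_mllib()
    hdfs_kmeans()
    hdfs_kmeans_airlines()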