Example #1
def frame_math_ops(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    sin_cos_tan_atan_sinh_cosh_tanh_asinh_data = [[random.uniform(-10,10) for r in range(10)] for c in range(10)]
    asin_acos_atanh_data = [[random.uniform(-1,1) for r in range(10)] for c in range(10)]
    acosh_data = [[random.uniform(1,10) for r in range(10)] for c in range(10)]
    abs_data = [[random.uniform(-100000,0) for r in range(10)] for c in range(10)]
    signif_data = [[0.0000123456, 1], [2, 3]]

    h2o_data1 = h2o.H2OFrame(python_obj=sin_cos_tan_atan_sinh_cosh_tanh_asinh_data)
    h2o_data2 = h2o.H2OFrame(python_obj=asin_acos_atanh_data)
    h2o_data3 = h2o.H2OFrame(python_obj=acosh_data)
    h2o_data4 = h2o.H2OFrame(python_obj=abs_data)
    h2o_data5 = h2o.H2OFrame(python_obj=signif_data)

    np_data1 = np.array(sin_cos_tan_atan_sinh_cosh_tanh_asinh_data)
    np_data2 = np.array(asin_acos_atanh_data)
    np_data3 = np.array(acosh_data)
    np_data4 = np.array(abs_data)

    for d in range(1,6):
        h2o_signif = h2o_data5.signif(digits=d)
        h2o_round = h2o_data5.round(digits=d+4)
        s = h2o_signif[0,0]
        r = h2o_round[0,0]
        assert s == r, "Expected these to be equal, but signif: {0}, round: {1}".format(s, r)
    h2o_transposed = h2o_data1[0:5].transpose()
    r, c = h2o_transposed.dim()
    assert r == 5 and c == 10, "Expected 5 rows and 10 columns, but got {0} rows and {1} columns".format(r,c)
    h2o.np_comparison_check(h2o_transposed, np.transpose(np_data1[:,0:5]), 10)
    h2o.np_comparison_check(h2o_data1.cos(), np.cos(np_data1), 10)
    h2o.np_comparison_check(h2o_data1.sin(), np.sin(np_data1), 10)
    h2o.np_comparison_check(h2o_data1.tan(), np.tan(np_data1), 10)
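Example #1 relies on h2o.np_comparison_check, a helper from the old h2o testing utilities that is not shown here. As a hedged sketch of the idea only (not the actual h2o implementation), it amounts to comparing every entry of the H2OFrame against the corresponding numpy entry to a given number of digits:

# Hedged sketch of an entry-by-entry comparison in the spirit of h2o.np_comparison_check;
# not the actual h2o source.
def np_comparison_check_sketch(h2o_frame, np_array, digits):
    rows, cols = np_array.shape
    for r in range(rows):
        for c in range(cols):
            h2o_val = h2o_frame[r, c]
            np_val = np_array[r, c]
            assert round(h2o_val, digits) == round(np_val, digits), \
                "mismatch at ({0},{1}): h2o {2} vs numpy {3}".format(r, c, h2o_val, np_val)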
Example #2
def expr_slicing(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    ###################################################################

    # H2OFrame[int] (column slice)
    res = 2 - iris
    res2 = res[0]
    assert abs(res2[3,:] - -2.6) < 1e-10 and abs(res2[17,:] - -3.1) < 1e-10 and abs(res2[24,:] - -2.8) < 1e-10, \
        "incorrect values"

    # H2OFrame[int,int]
    assert abs(res[13, 3] - 1.9) < 1e-10, "incorrect values"

    # H2OFrame[int, slice]
    res4 = res[12, 0:4]
    assert abs(res4[0,0] - -2.8) < 1e-10 and abs(res4[0,1] - -1.0) < 1e-10 and abs(res4[0,2] - 0.6) < 1e-10 and \
        abs(res4[0,3] - 1.9) < 1e-10, "incorrect values"

    # H2OFrame[slice, int]
    res5 = res[5:9, 1]
    assert abs(res5[0,:] - -1.9) < 1e-10 and abs(res5[1,:] - -1.4) < 1e-10 and abs(res5[2,:] - -1.4) < 1e-10 and \
           abs(res5[3,:] - -0.9) < 1e-10, "incorrect values"

    # H2OFrame[slice, slice]
    res = iris * 2
    res6 = res[5:9, 0:4]
    assert abs(res6[0,0] - 10.8) < 1e-10 and abs(res6[1,1] - 6.8) < 1e-10 and abs(res6[2,2] - 3.0) < 1e-10 and \
           abs(res6[3,3] - 0.4) < 1e-10, "incorrect values"
def link_functions_binomial(ip,port):
	# Connect to h2o
	h2o.init(ip,port)

	print("Read in prostate data.")
	h2o_data = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
	h2o_data.head()

	sm_data = pd.read_csv(zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).open("prostate_complete.csv")).as_matrix()
	sm_data_response = sm_data[:,2]
	sm_data_features = sm_data[:,[1,3,4,5,6,7,8,9]]

	print("Testing for family: BINOMIAL")
	print("Set variables for h2o.")
	myY = "CAPSULE"
	myX = ["ID","AGE","RACE","GLEASON","DCAPS","PSA","VOL","DPROS"]

	print("Create models with canonical link: LOGIT")
	h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY].asfactor(), family="binomial", link="logit",alpha=[0.5], Lambda=[0])
	sm_model = sm.GLM(endog=sm_data_response, exog=sm_data_features, family=sm.families.Binomial(sm.families.links.logit)).fit()

	print("Compare model deviances for link function logit")
	h2o_deviance = h2o_model._model_json['output']['residual_deviance'] / h2o_model._model_json['output']['null_deviance']
	sm_deviance = sm_model.deviance / sm_model.null_deviance
	assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measures"
def link_correct_default(ip,port):
	# Connect to h2o
	h2o.init(ip,port)

	print("Reading in original prostate data.")
	h2o_data = h2o.upload_file(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))

	print("Compare models with link unspecified and canonical link specified.")
	print("GAUSSIAN: ")
	h2o_model_unspecified = h2o.glm(x=h2o_data[1:8], y=h2o_data[8], family="gaussian")
	h2o_model_specified = h2o.glm(x=h2o_data[1:8], y=h2o_data[8], family="gaussian", link="identity")
	assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
		   h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"

	print("BINOMIAL: ")
	h2o_model_unspecified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="binomial")
	h2o_model_specified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="binomial", link="logit")
	assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
		   h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"

	print("POISSON: ")
	h2o_model_unspecified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson")
	h2o_model_specified = h2o.glm(x=h2o_data[2:9], y=h2o_data[1], family="poisson", link="log")
	assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
		   h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"

	print("GAMMA: ")
	h2o_model_unspecified = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma")
	h2o_model_specified = h2o.glm(x=h2o_data[3:9], y=h2o_data[2], family="gamma", link="inverse")
	assert h2o_model_specified._model_json['output']['coefficients_table'].cell_values == \
		   h2o_model_unspecified._model_json['output']['coefficients_table'].cell_values, "coefficient should be equal"
Example #5
def https_import(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    url = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv.zip"
    aa = h2o.import_frame(path=url)
    aa.show()
    def test_parse_covtype20x_loop_s3n_hdfs(self):
        bucket = 'home-0xdiag-datasets'
        importFolderPath = "standard"
        csvFilename = "covtype20x.data"
        csvPathname = importFolderPath + "/" + csvFilename
        timeoutSecs = 500
        trialMax = 3
        for tryHeap in [4,12]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import folder,", \
                "then parse 'covtype20x.data'"
            h2o.init(java_heap_GB=tryHeap)
            # don't raise exception if we find something bad in h2o stdout/stderr?
            h2o.nodes[0].sandboxIgnoreErrors = True

            for trial in range(trialMax):
                hex_key = csvFilename + ".hex"
                start = time.time()
                parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='s3n', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start
                print "parse result:", parseResult['destination_key']
                print "Trial #", trial, "completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                removeKeyResult = h2o.nodes[0].remove_key(key=hex_key)

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
def iris_h2o_vs_sciKmeans(ip,port):
  # Connect to a pre-existing cluster
  h2o.init(ip,port)  # connect to localhost:54321

  iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))
  iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',')
  iris_sci = iris_sci[:,0:4]

  s =[[4.9,3.0,1.4,0.2],
  [5.6,2.5,3.9,1.1],
  [6.5,3.0,5.2,2.0]]

  start = h2o.H2OFrame(s)
  start_key = start.send_frame()

  h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False)

  sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1)
  sci_km.fit(iris_sci)

  # Log.info("Cluster centers from H2O:")
  print "Cluster centers from H2O:"
  h2o_centers = h2o_km.centers()
  print h2o_centers

  # Log.info("Cluster centers from scikit:")
  print "Cluster centers from scikit:"
  sci_centers = sci_km.cluster_centers_.tolist()
  print sci_centers

  for hcenter, scenter in zip(h2o_centers, sci_centers):
    for hpoint, spoint in zip(hcenter,scenter):
      assert abs(hpoint - spoint) < 1e-10, "expected centers to be the same"
  def __init__(self, model_name, model_base_path, verbose=False):
    """
    Initialize the service.
        
    Args:
      model_name: The name of the model.
      model_base_path: The file path of the model.
    Return:
      None
    """
    super(H2oInferenceService, self).__init__()

    self.model_name = model_name
    self.model_base_path = model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "H2o"
    self.verbose = verbose

    import h2o

    logging.info("Try to initialize and connect the h2o server")
    h2o.init()

    logging.info("Try to load the h2o model")
    model = h2o.load_model(model_base_path)

    self.model = model
    # TODO: Update the signature with readable string
    self.model_graph_signature = "{}".format(self.model.full_parameters)
Example #9
def center_scale(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))[0:4]

    # frame (default args)
    foo = iris.scale()
    # TODO: the below assertion fails. Should it?
    #assert abs(foo[0,0] - -0.8976739) < 1e-6 and  abs(foo[0,1] - 1.01560199) < 1e-6 and abs(foo[0,2] - -1.335752) < 1e-6 \
    #       and abs(foo[0,3] - -1.311052) < 1e-6, "h2o differed from r. h2o got {0}, {1}, {2}, and {3}" \
    #                                             "".format(foo[0,0],foo[0,1],foo[0,2],foo[0,3])

    # frame (centers=True, scale=False)
    foo = iris.scale(center=True, scale=False)

    # frame (centers=False, scale=True)
    foo = iris.scale(center=False, scale=True)

    # frame (centers=False, scale=False)
    foo = iris.scale(center=False, scale=False)

    # vec (default args)
    foo = iris[0].scale()

    # vec (centers=True, scale=False)
    foo = iris[1].scale(center=True, scale=False)

    # vec (centers=False, scale=True)
    foo = iris[2].scale(center=False, scale=True)

    # vec (centers=False, scale=False)
    foo = iris[3].scale(center=False, scale=False)
def covtype_get_model(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    #Log.info("Importing covtype.20k.data...\n")
    covtype = h2o.import_frame(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    X = range(0,20) + range(29,54)

    # Set response to be indicator of a particular class
    res_class = random.randint(1,4)
    # Log.info(paste("Setting response column", myY, "to be indicator of class", res_class, "\n"))
    covtype[54] = (covtype[54] == res_class)

    #covtype_data.summary()

    # L2: alpha = 0, lambda = 0
    covtype_mod1 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0], Lambda=[0])
    covtype_mod1.show()
    covtype_mod1 = h2o.get_model(covtype_mod1._id)
    covtype_mod1.show()

    # Elastic: alpha = 0.5, lambda = 1e-4
    covtype_mod2 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[0.5], Lambda=[1e-4])
    covtype_mod2.show()
    covtype_mod2 = h2o.get_model(covtype_mod2._id)
    covtype_mod2.show()

    # L1: alpha = 1, lambda = 1e-4
    covtype_mod3 = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[1], Lambda=[1e-4])
    covtype_mod3.show()
    covtype_mod3 = h2o.get_model(covtype_mod3._id)
    covtype_mod3.show()
Example #11
def trim_check(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    frame = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_trim.csv"))

    # single column (frame)
    trimmed_frame = frame["name"].trim()
    assert trimmed_frame[0, 0] == "AMC Ambassador Brougham", "Expected 'AMC Ambassador Brougham', but got {}".format(
        trimmed_frame[0, 0]
    )
    assert trimmed_frame[1, 0] == "AMC Ambassador DPL", "Expected 'AMC Ambassador DPL', but got {}".format(
        trimmed_frame[1, 0]
    )
    assert trimmed_frame[2, 0] == "AMC Ambassador SST", "Expected 'AMC Ambassador SST', but got {}".format(
        trimmed_frame[2, 0]
    )

    # single column (vec)
    vec = frame["name"]
    trimmed_vec = vec.trim()
    assert trimmed_vec[0, 0] == "AMC Ambassador Brougham", "Expected 'AMC Ambassador Brougham', but got {}".format(
        trimmed_vec[0, 0]
    )
    assert trimmed_vec[1, 0] == "AMC Ambassador DPL", "Expected 'AMC Ambassador DPL', but got {}".format(
        trimmed_vec[1, 0]
    )
    assert trimmed_vec[2, 0] == "AMC Ambassador SST", "Expected 'AMC Ambassador SST', but got {}".format(
        trimmed_vec[2, 0]
    )
Example #12
def group_by(ip,port):
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    h2o_iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    pd_iris = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    h2o_agg_funcs = ["count","count_unique","first","last","min","max","mean","avg","sd","stdev","var","sum","ss"]
    na_handling = ["ignore","rm","all"]
    col_names = h2o_iris.col_names()[0:4]

    print "Running smoke test"

    # smoke test
    for a in h2o_agg_funcs:
       for n in na_handling:
           for c in col_names:
               print "group by : " + str(a) + "; " + str(n) + "; " + str(c)
               h2o.group_by(h2o_iris, ["class"], {"foo":[a,c,n]})

    # h2o/pandas/numpy comparison test
    h2o_np_agg_dict = {"min":np.min, "max":np.max, "mean":np.mean, "sum":np.sum}
    for k in h2o_np_agg_dict.keys():
        for c in col_names:
            print "group by comparison: " + str(k) + "; " + str(c)
            h2o_res = h2o.group_by(h2o_iris, ["class"], {"foo":[k,c,"all"]})
            pd_res = pd_iris.groupby("class")[c].aggregate(h2o_np_agg_dict[k])
            for i in range(3):
                h2o_val = h2o_res[i,1]
                pd_val = pd_res[h2o_res[i,0]]
                assert abs(h2o_val - pd_val) < 1e-06, \
                    "check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} " \
                    "values between h2o and pandas on column {3}".format(h2o_val,pd_val,k,c)
Example #13
    def test_parse_nflx_loop_hdfs_fvec(self):
        print "Using the -.gz files from hdfs"
        # hdfs://<name node>/datasets/manyfiles-nflx-gz/file_1.dat.gz
        csvFilename = "file_10.dat.gz"
        csvFilepattern = "file_1[0-9].dat.gz"

        trialMax = 2
        for tryHeap in [24]:
            print "\n", tryHeap,"GB heap, 1 jvm per host, import mr-0x6 hdfs, then parse"
            h2o.init(java_heap_GB=tryHeap, random_udp_drop=RANDOM_UDP_DROP, use_hdfs=True, hdfs_name_node='mr-0x6', hdfs_version='cdh4')

            timeoutSecs = 500
            importFolderPath = "datasets/manyfiles-nflx-gz"
            for trial in range(trialMax):
                hex_key = csvFilename + "_" + str(trial) + ".hex"
                csvFilePattern = 'file_1.dat.gz'
                # "key": "hdfs://172.16.2.176/datasets/manyfiles-nflx-gz/file_99.dat.gz", 

                csvPathname = importFolderPath + "/" + csvFilePattern
                start = time.time()
                parseResult = h2i.import_parse(path=csvPathname, schema='hdfs', hex_key=hex_key,
                    timeoutSecs=timeoutSecs, retryDelaySecs=10, pollTimeoutSecs=60)
                elapsed = time.time() - start

                print "parse result:", parseResult['destination_key']
                print "Parse #", trial, "completed in", "%6.2f" % elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_cmd.runStoreView()

            h2o.tear_down_cloud()
            # sticky ports? wait a bit.
            time.sleep(5)
Example #14
def sdev(ip,port):
  # Connect to h2o
  h2o.init(ip,port)

  iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
  iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                          delimiter=',',
                          skip_header=1,
                          usecols=(0, 1, 2, 3))

  sd_np = np.std(iris_np, axis=0, ddof=1)
  for i in range(4):
    sd_h2o = iris_h2o[i].sd()
    assert abs(sd_np[i] - sd_h2o) < 1e-10, "expected standard deviations to be the same"

  try:
    iris_h2o[4].sd()
    assert False, "expected an error. column is categorical."
  except EnvironmentError:
    assert True

  try:
    iris_h2o[0:2].sd()
    assert False, "expected an error. more than one column."
  except EnvironmentError:
    assert True
Example #15
def swpredsRF(ip,port):
    # Training set has two predictor columns
    # X1: 10 categorical levels, 100 observations per level; X2: Unif(0,1) noise
    # Ratio of y = 1 per Level: cat01 = 1.0 (strong predictor), cat02 to cat10 = 0.5 (weak predictors)

    # Connect to h2o
    h2o.init(ip,port)

    #Log.info("Importing swpreds_1000x3.csv data...\n")
    swpreds = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/swpreds_1000x3.csv"))
    swpreds["y"] = swpreds["y"].asfactor()

    #Log.info("Summary of swpreds_1000x3.csv from H2O:\n")
    #swpreds.summary()

    # Train H2O DRF without Noise Column
    #Log.info("Distributed Random Forest with only Predictor Column")
    model1 = h2o.random_forest(x=swpreds[["X1"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model1.show()
    perf1 = model1.model_performance(swpreds)
    print(perf1.auc())

    # Train H2O DRF Model including Noise Column:
    #Log.info("Distributed Random Forest including Noise Column")
    model2 = h2o.random_forest(x=swpreds[["X1","X2"]], y=swpreds["y"], ntrees=50, max_depth=20, nbins=500)
    model2.show()
    perf2 = model2.model_performance(swpreds)
    print(perf2.auc())
Example #16
def table_check(ip,port):
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    # single column (frame)
    table1 = h2o.table(iris[["C5"]])
    assert table1[0,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[0,0], table1[0,1])
    assert table1[1,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[1,0], table1[1,1])
    assert table1[2,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[2,0], table1[2,1])

    # single column (vec)
    table1 = h2o.table(iris["C5"])
    assert table1[0,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[0,0], table1[0,1])
    assert table1[1,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[1,0], table1[1,1])
    assert table1[2,1] == 50, "Expected 50 of {0}, but got {1}".format(table1[2,0], table1[2,1])

    # two-column (one argument)
    table2 = h2o.table(iris[["C1", "C5"]])
    assert table2[0,2] == 4, "Expected 4, but got {0}".format(table2[0,2])
    assert table2[1,2] == 5, "Expected 5, but got {0}".format(table2[1,2])
    assert table2[2,2] == 3, "Expected 3, but got {0}".format(table2[2,2])

    # two columns (separate arguments (frames))
    table3 = h2o.table(iris[["C1"]],iris[["C5"]])
    assert table3[0,2] == 4, "Expected 4, but got {0}".format(table3[0,2])
    assert table3[1,2] == 5, "Expected 5, but got {0}".format(table3[1,2])
    assert table3[2,2] == 3, "Expected 3, but got {0}".format(table3[2,2])

    # two columns (separate arguments (vecs))
    table3 = h2o.table(iris["C1"],iris["C5"])
    assert table3[0,2] == 4, "Expected 4, but got {0}".format(table3[0,2])
    assert table3[1,2] == 5, "Expected 5, but got {0}".format(table3[1,2])
    assert table3[2,2] == 3, "Expected 3, but got {0}".format(table3[2,2])
def link_functions_tweedie_vpow(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    # Load example data from HDtweedie, y = aggregate claim loss
    hdf = h2o.upload_file(h2o.locate("smalldata/glm_test/auto.csv"))
    y = "y"
    x = list(set(hdf.names()) - set(["y"]))

    print "Testing for family: TWEEDIE"
    print "Create models with canonical link: TWEEDIE"
    # Iterate over different variance powers for tweedie
    vpower = [0, 1, 1.5]
    r_dev = [0.7516627, 0.6708826, 0.7733762]
    r_null = [221051.88369951, 32296.29783702, 20229.47425307]
    for ridx, vpow in enumerate(vpower):
        print "Fit h2o.glm:"
        h2ofit = h2o.glm(x=hdf[x], y=hdf[y], family="tweedie", link="tweedie", tweedie_variance_power=vpow, tweedie_link_power=1-vpow,
                         alpha=[0.5], Lambda=[0])

        print "Testing Tweedie variance power: {0}".format(vpow)

        print "Compare model deviances for link function tweedie"
        deviance_h2o_tweedie = h2ofit.residual_deviance() / h2ofit.null_deviance()

        assert r_dev[ridx] - deviance_h2o_tweedie <= 0.01, "h2o's residual/null deviance is more than 0.01 lower than " \
                                                           "R's. h2o: {0}, r: {1}".format(deviance_h2o_tweedie, r_dev[ridx])

        print "compare null and residual deviance between R glm and h2o.glm for tweedie"
        assert abs(r_null[ridx] - h2ofit.null_deviance()) < 1e-6, "h2o's null deviance is not equal to R's. h2o: {0}, r: " \
                                                                   "{1}".format(h2ofit.null_deviance(), r_null[ridx])
def deep_learning_metrics_test(ip, port):
    h2o.init(ip, port)  # connect to existing cluster

    df = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    df.drop("ID")  # remove ID
    df["CAPSULE"] = df["CAPSULE"].asfactor()  # make CAPSULE categorical
    vol = df["VOL"]
    vol[vol == 0] = float("nan")  # 0 VOL means 'missing'

    r = vol.runif()  # random train/test split
    train = df[r < 0.8]
    test = df[r >= 0.8]

    # See that the data is ready
    train.describe()
    train.head()
    train.tail()
    test.describe()
    test.head()
    test.tail()

    # Run DeepLearning
    print "Train a Deeplearning model: "
    dl = h2o.deeplearning(x=train[1:], y=train["CAPSULE"], epochs=100, hidden=[10, 10, 10], loss="CrossEntropy")
    print "Binomial Model Metrics: "
    print
    dl.show()
    dl.model_performance(test).show()
Example #19
def smallcatGBM(ip,port):
  # Training set has 26 categories from A to Z
  # Categories A, C, E, G, ... are perfect predictors of y = 1
  # Categories B, D, F, H, ... are perfect predictors of y = 0

  # Connect to h2o
  h2o.init(ip,port)

  #Log.info("Importing alphabet_cattest.csv data...\n")
  alphabet = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"))
  alphabet["y"] = alphabet["y"].asfactor()
  #Log.info("Summary of alphabet_cattest.csv from H2O:\n")
  #alphabet.summary()

  # Prepare data for scikit use
  trainData = np.loadtxt(h2o.locate("smalldata/gbm_test/alphabet_cattest.csv"), delimiter=',', skiprows=1,
                         converters={0:lambda s: ord(s.split("\"")[1])})
  trainDataResponse = trainData[:,1]
  trainDataFeatures = trainData[:,0]
  
  # Train H2O GBM Model:
  #Log.info("H2O GBM (Naive Split) with parameters:\nntrees = 1, max_depth = 1, nbins = 100\n")
  gbm_h2o = h2o.gbm(x=alphabet[['X']], y=alphabet["y"], distribution="bernoulli", ntrees=1, max_depth=1, nbins=100)
  gbm_h2o.show()
  
  # Train scikit GBM Model:
  # Log.info("scikit GBM with same parameters:")
  gbm_sci = ensemble.GradientBoostingClassifier(n_estimators=1, max_depth=1, max_features=None)
  gbm_sci.fit(trainDataFeatures[:,np.newaxis],trainDataResponse)
Example #20
def pca_prostate(ip, port):
    h2o.init(ip, port)

    print "Importing prostate.csv data...\n"
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print "Converting CAPSULE, RACE, DPROS and DCAPS columns to factors"
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate["RACE"] = prostate["RACE"].asfactor()
    prostate["DPROS"] = prostate["DPROS"].asfactor()
    prostate["DCAPS"] = prostate["DCAPS"].asfactor()
    prostate.describe()

    print "PCA on columns 3 to 9 with k = 3, retx = FALSE, transform = 'STANDARDIZE'"
    fitPCA = h2o.prcomp(x=prostate[2:9], k=3, transform="NONE", pca_method="Power")
    pred1 = fitPCA.predict(prostate)
    pred2 = h2o.get_frame(fitPCA._model_json['output']['loading_key']['name'])

    print "Compare dimensions of projection and loading matrix"
    print "Projection matrix:\n"
    print pred1.head()
    print "Loading matrix:\n"
    print pred2.head()
    assert pred1.nrow() == pred2.nrow(), "Expected same number of rows, but got {0} and {1}".format(pred1.nrow(),
                                                                                                    pred2.nrow())
    assert pred1.ncol() == pred2.ncol(), "Expected same number of columns, but got {0} and {1}".format(pred1.ncol(),
                                                                                                       pred2.ncol())
Example #21
def slicing_shape(ip,port):
    # Connect to a pre-existing cluster
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))
    rows, cols = prostate.dim()

    #foo = prostate[0:0] # TODO: empty frame allowed?
    #foo.show()

    # prostate[slice]
    for ncols in range(1,cols+1):
        r, c = prostate[0:ncols].dim()
        assert r == rows, "incorrect number of rows. correct: {0}, computed: {1}".format(rows, r)
        assert c == ncols, "incorrect number of cols. correct: {0}, computed: {1}".format(ncols, c)

    # prostate[int,slice]
    for ncols in range(1,cols+1):
        r, c = prostate[random.randint(0,rows-1),0:ncols].dim()
        assert r == 1, "incorrect number of rows. correct: {0}, computed: {1}".format(1, r)
        assert c == ncols, "incorrect number of cols. correct: {0}, computed: {1}".format(ncols, c)

    # prostate[slice,int] # TODO: there's a bug here: HEXDEV-266
    for nrows in range(1,10):
       r, c = prostate[0:nrows,random.randint(0,cols-1)].dim()
       assert r == nrows, "incorrect number of rows. correct: {0}, computed: {1}".format(nrows, r)
       assert c == 1, "incorrect number of cols. correct: {0}, computed: {1}".format(1, c)

    # prostate[slice,slice] # TODO: there's a bug here: HEXDEV-266
    for nrows in range(1,10):
       for ncols in range(1,cols+1):
           r, c = prostate[0:nrows,0:ncols].dim()
           assert r == nrows, "incorrect number of rows. correct: {0}, computed: {1}".format(nrows, r)
           assert c == ncols, "incorrect number of cols. correct: {0}, computed: {1}".format(ncols, c)
Example #22
def wide_dataset_large(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    print("Reading in Arcene training data for binomial modeling.")
    trainDataResponse = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_train_labels.labels"), delimiter=' ')
    trainDataResponse = np.where(trainDataResponse == -1, 0, 1)
    trainDataFeatures = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_train.data"), delimiter=' ')
    trainData = h2o.H2OFrame(np.column_stack((trainDataResponse, trainDataFeatures)).tolist())

    print("Run model on 3250 columns of Arcene with strong rules off.")
    model = h2o.glm(x=trainData[1:3250], y=trainData[0].asfactor(), family="binomial", lambda_search=False, alpha=[1])

    print("Test model on validation set.")
    validDataResponse = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_valid_labels.labels"), delimiter=' ')
    validDataResponse = np.where(validDataResponse == -1, 0, 1)
    validDataFeatures = np.genfromtxt(h2o.locate("smalldata/arcene/arcene_valid.data"), delimiter=' ')
    validData = h2o.H2OFrame(np.column_stack((validDataResponse, validDataFeatures)).tolist())
    prediction = model.predict(validData)

    print("Check performance of predictions.")
    performance = model.model_performance(validData)

    print("Check that prediction AUC better than guessing (0.5).")
    assert performance.auc() > 0.5, "predictions should be better than pure chance"
Example #23
def frame_slicing(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    iris.show()
    prostate.show()
    airlines.show()

    ###################################################################

    # H2OFrame[int] (column slice)
    res1 = iris[0]
    assert abs(res1[8] - 4.4) < 1e-10, "incorrect values"

    # H2OFrame[int,int]
    res2 = prostate[13, 3]
    assert abs(res2 - 1) < 1e-10, "incorrect values"

    # H2OFrame[int, slice]
    res3 = airlines[12, 0:3]
    assert abs(res3[0,0] - 1987) < 1e-10 and abs(res3[0,1] - 10) < 1e-10 and abs(res3[0,2] - 29) < 1e-10, \
        "incorrect values"

    # H2OFrame[slice, int]
    res4 = iris[5:8, 1]
    assert abs(res4[0] - 3.9) < 1e-10 and abs(res4[1] - 3.4) < 1e-10 and abs(res4[2] - 3.4) < 1e-10, "incorrect values"

    # H2OFrame[slice, slice]
    res5 = prostate[5:8, 0:3]
    assert abs(res5[0,0] - 6) < 1e-10 and abs(res5[1,1] - 0) < 1e-10 and abs(res5[2,2] - 61) < 1e-10, "incorrect values"
Example #24
def fiftycatGBM(ip,port):
  # Connect to h2o
  h2o.init(ip,port)

  # Training set has only 45 categories cat1 through cat45
  #Log.info("Importing 50_cattest_train.csv data...\n")
  train = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
  train["y"] = train["y"].asfactor()

  #Log.info("Summary of 50_cattest_train.csv from H2O:\n")
  #train.summary()
  
  # Train H2O GBM Model:
  #Log.info(paste("H2O GBM with parameters:\nntrees = 10, max_depth = 20, nbins = 20\n", sep = ""))
  model = h2o.gbm(x=train[["x1","x2"]], y=train["y"], loss="bernoulli", ntrees=10, max_depth=5, nbins=20)
  model.show()
 
  # Test dataset has all 50 categories cat1 through cat50
  #Log.info("Importing 50_cattest_test.csv data...\n")
  test = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))
  #Log.info("Summary of 50_cattest_test.csv from H2O:\n")
  #test.summary()
  
  # Predict on test dataset with GBM model:
  #Log.info("Performing predictions on test dataset...\n")
  predictions = model.predict(test)
  predictions.show()
  
  # Get the confusion matrix and AUC
  #Log.info("Confusion matrix of predictions (max accuracy):\n")
  performance = model.model_performance(test)
  test_cm = performance.confusion_matrices()
  test_auc = performance.auc()
Example #25
def expr_show(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    print "iris:"
    iris.show()

    ###################################################################

    # expr[int], expr._data is pending
    res = 2 - iris
    res2 = res[0]
    print "res2:"
    res2.show()

    # expr[int], expr._data is remote
    res3 = res[0]
    print "res3:"
    res3.show()

    # expr[int], expr._data is local
    expr = Expr([1,2,3])
    print "expr:"
    expr.show()

    # expr[tuple], expr._data is local
    expr = Expr([[1,2,3], [4,5,6]])
    print "expr:"
    expr.show()
Example #26
def ls_test(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv"))

    h2o.ls()
Example #27
def h2oinitname():
    """
    Python API test for h2o.init
    :return:
    """
    try:
        h2o.init(strict_version_check=False, name="test")  # Should initialize
        h2o.init(strict_version_check=False, name="test")  # Should just connect
        assert h2o.cluster().cloud_name == "test"
    except H2OConnectionError as e:  # some errors are okay like version mismatch
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=54321, name="test2", as_port=True)
        assert False, "Should fail to connect and the port should be used by previous invocation."
    except H2OServerError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=54321, name="test2")  # Should bump the port to next one
        assert h2o.cluster().cloud_name == "test2"
    except H2OConnectionError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))

    try:
        h2o.init(strict_version_check=False, port=60000, name="test3", as_port=True)
        assert h2o.cluster().cloud_name == "test3"
    except H2OConnectionError as e:
        print("error message type is {0} and the error message is {1}\n".format(e.__class__.__name__, e.args[0]))
        assert_is_type(e, H2OConnectionError)
        h2o.cluster().shutdown()
Example #28
 def setUpClass(cls):
     # aws_credentials='~/.ec2/AwsCredentials.properties',
     # hdfs_config="~/.ec2/core-site.xml",
     # java_extra_args='-XX:+PrintGCDetails')
     # use_hdfs=True,
     # Uses your username specific json: pytest_config-<username>.json
     h2o.init(1, java_heap_GB=28)
Example #29
def fiftycatRF(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    # Training set has only 45 categories cat1 through cat45
    # Log.info("Importing 50_cattest_train.csv data...\n")
    train = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_train.csv"))
    train["y"] = train["y"].asfactor()

    # Log.info("Summary of 50_cattest_train.csv from H2O:\n")
    # train.summary()

    # Train H2O DRF Model:
    # Log.info(paste("H2O DRF with parameters:\nclassification = TRUE, ntree = 50, depth = 20, nbins = 500\n", sep = ""))
    model = h2o.random_forest(x=train[["x1", "x2"]], y=train["y"], ntrees=50, max_depth=20, nbins=500)

    # Test dataset has all 50 categories cat1 through cat50
    # Log.info("Importing 50_cattest_test.csv data...\n")
    test = h2o.import_frame(path=h2o.locate("smalldata/gbm_test/50_cattest_test.csv"))

    # Log.info("Summary of 50_cattest_test.csv from H2O:\n")
    # test.summary()

    # Predict on test dataset with DRF model:
    # Log.info("Performing predictions on test dataset...\n")
    preds = model.predict(test)
    preds.head()

    # Get the confusion matrix and AUC
    # Log.info("Confusion matrix of predictions (max accuracy):\n")
    perf = model.model_performance(test)
    perf.show()
    cm = perf.confusion_matrix()
    print(cm)
def link_functions_gaussian(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    print("Read in prostate data.")
    h2o_data = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()

    sm_data = pd.read_csv(
        zipfile.ZipFile(h2o.locate("smalldata/prostate/prostate_complete.csv.zip")).open("prostate_complete.csv")
    ).as_matrix()
    sm_data_response = sm_data[:, 9]
    sm_data_features = sm_data[:, 1:9]

    print("Testing for family: GAUSSIAN")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    print("Create models with canonical link: IDENTITY")
    h2o_model = h2o.glm(x=h2o_data[myX], y=h2o_data[myY], family="gaussian", link="identity", alpha=[0.5], Lambda=[0])
    sm_model = sm.GLM(
        endog=sm_data_response, exog=sm_data_features, family=sm.families.Gaussian(sm.families.links.identity)
    ).fit()

    print("Compare model deviances for link function identity")
    h2o_deviance = h2o_model.residual_deviance() / h2o_model.null_deviance()
    sm_deviance = sm_model.deviance / sm_model.null_deviance
    assert h2o_deviance - sm_deviance < 0.01, "expected h2o to have an equivalent or better deviance measure"
Example #31
 def setUpClass(cls):
     h2o.init()
Example #32
 def setUpClass(cls):
     global SEED
     SEED = h2o.setup_random_seed()
     h2o.init()
Example #33
import numpy as np
import pandas as pd
import h2o

print('Loading data')
h2o.init()
feats = ["id", 'era', 'data_type']
pred_columns = []
for i in range(50):
    pred_columns.append("feature" + str(i + 1).strip())
    feats.append("feature" + str(i + 1).strip())
feats.append("target")
df = h2o.import_file("../input/numerai_training_data.csv")

test = h2o.import_file('../input/numerai_tournament_data.csv')
#valid=test[test['data_type']=='validation']

from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

#GBM=H2OGradientBoostingEstimator(
#        ntrees=10,
#        learn_rate=0.2,
#        learn_rate_annealing = 0.99,
#        sample_rate = 0.8,
#        col_sample_rate = 0.8,
#        seed = 1234,
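The script above stops at a partially commented-out GBM configuration. A minimal hedged sketch (not part of the original kernel) of how one of the imported estimators could be trained on the frames loaded above; the hyperparameters are illustrative only:

# Hedged sketch, not from the original script: train a GBM on the Numerai frames loaded above.
# Assumes df, test and pred_columns exist as defined earlier; hyperparameters are illustrative.
df["target"] = df["target"].asfactor()  # binary target -> classification
gbm = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.2, seed=1234)
gbm.train(x=pred_columns, y="target", training_frame=df)
preds = gbm.predict(test)  # class probabilities for the tournament data
print(preds.head())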
Example #34
def load_h2o():
    h2o.init()
Example #35
# Customarily, we import and start H2O as follows:
import h2o

h2o.init()  # Will set up H2O cluster using all available cores

h2o.init(ip="123.45.67.89", port=54321)
# To create an H2OFrame object from a python tuple:
df = h2o.H2OFrame(zip(*((1, 2, 3), ('a', 'b', 'c'), (0.1, 0.2, 0.3))))
df
# To create an H2OFrame object from a python list:
df = h2o.H2OFrame(zip(*[[1, 2, 3], ['a', 'b', 'c'], [0.1, 0.2, 0.3]]))
df
# To create an H2OFrame object from a python dict (or collections.OrderedDict):
df = h2o.H2OFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c'], 'C': [0.1, 0.2, 0.3]})
df

# To create an H2OFrame object from a dict with specified column types:
df2 = h2o.H2OFrame.from_python(
    {
        'A': [1, 2, 3],
        'B': ['a', 'a', 'b'],
        'C': ['hello', 'all', 'world'],
        'D':
        ['12MAR2015:11:00:00', '13MAR2015:12:00:00', '14MAR2015:13:00:00']
    },
    column_types=['numeric', 'enum', 'string', 'time'])

df2

df2.types
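As a short follow-up not present in the original walkthrough, and assuming the cluster started by h2o.init() above is still running, an H2OFrame can be pulled back into pandas for local inspection:

# Hedged follow-up sketch: round-trip the frame back to a pandas DataFrame.
local_df2 = df2.as_data_frame(use_pandas=True)
print(local_df2.dtypes)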
Example #36
import pandas as pd
import time
import numpy as np
import pickle
import h2o
from sklearn.metrics import roc_auc_score
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

since = time.time()
h2o.init(nthreads=-1)

data_dir = '../data/'
save_dir = '../saves/'
read_from = save_dir

load_name = 'train_best.csv'
load_name = load_name[:-4]

dt = pickle.load(open(read_from+load_name+'_dict.save', "rb"))
df = pd.read_csv(read_from+load_name+".csv", dtype=dt)
del dt

print()
print('>'*20)
print('>'*20)
print('dtypes of df:')

print(df.dtypes)
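The script is cut off before any model is trained. A hedged sketch of how the pandas frame above could be handed to one of the imported estimators; the 'target' column name and the 80/20 split are hypothetical and not from the original script:

# Hedged sketch, not from the original script; 'target' is a hypothetical binary label column.
hf = h2o.H2OFrame(df)
hf['target'] = hf['target'].asfactor()
train_hf, valid_hf = hf.split_frame(ratios=[0.8], seed=42)
features = [c for c in hf.columns if c != 'target']
rf = H2ORandomForestEstimator(ntrees=100, seed=42)
rf.train(x=features, y='target', training_frame=train_hf, validation_frame=valid_hf)
print(rf.auc(valid=True))  # H2O's own AUC; roc_auc_score (imported above) could be applied to predictions instead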
Example #37
# Parse command-line args.
#
# usage:  python test_name.py --usecloud ipaddr:port
#

ip_port = sys.argv[2].split(":")
print ip_port
ip = ip_port[0]
port = int(ip_port[1])

######################################################
#
# Sample Running GBM on prostate.csv

# Connect to a pre-existing cluster
h2o.init(ip=ip, port=port)

df = h2o.import_frame(path="../../../smalldata/logreg/prostate.csv")
df.describe()

# Remove ID from training frame
del df['ID']

# For VOL & GLEASON, a zero really means "missing"
vol = df['VOL']
vol[vol == 0] = None
gle = df['GLEASON']
gle[gle == 0] = None

# Convert CAPSULE to a logical factor
df['CAPSULE'] = df['CAPSULE'].asfactor()
Example #38
def main():
    script = sys.argv[0]
    filename = sys.argv[1]
    remove_stops = int(sys.argv[2])
    lemmatize = int(sys.argv[3])
    window = int(sys.argv[4])
    vectorsize = int(sys.argv[5])

    # reading the main data file
    pap = pd.read_csv(filename)
    print 'Shape of data file: ', pap.shape
    print 'Column titles: ', pap.columns
    #pap.head(1)

    # In[4]:

    # Adding some manual stop words
    manuals = ['et', 'al', 'page']
    # Only keeping Nouns, adjectives and verbs
    keeplist = [
        'NNS', 'VBP', 'VBN', 'NN', 'VBD', 'VBZ', 'VBG', 'JJ', 'VB', 'JJR',
        'JJS', 'NNP', 'NNPS'
    ]
    msh = prep(pap,
               lemmatize=lemmatize,
               remove_stops=remove_stops,
               manuals=manuals,
               keeplist=keeplist)
    #print msh.shape
    #msh.head()

    # In[5]:

    # Making a list corpus of tokenized papers
    print 'Making a corpus of the tokenized papers'
    papers = []
    for i in msh.paper_id.unique():
        papers = papers + [list(msh[msh.paper_id == i].word)]

    # In[6]:

    # Make bigrams of words
    def make_bigrams(texts, bigram_mod):
        return [bigram_mod[doc] for doc in texts]

    # Make trigrams of words
    def make_trigrams(texts, bigram_mod, trigram_mod):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(
        papers, min_count=5, threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[papers], threshold=100)

    #trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # making Tri-bigrams
    print 'Making Trigrams of the corpus'
    papers_trigrams = make_trigrams(make_bigrams(papers,
                                                 bigram_mod=bigram_mod),
                                    bigram_mod=bigram_mod,
                                    trigram_mod=trigram_mod)

    # Just making a replica array of papers_trigrams but with the paper_ids
    j = 0
    paper_id = []
    for i in msh.paper_id.unique():
        paper_id = paper_id + [[i] * len(papers_trigrams[j])]
        j = j + 1

    # ### Training a FastTex embedding on our Tri-gram version of the corpus.
    # I choose a size of 1000, a window width of 5 and a minimum word count of 3. These choices are not optimized.
    # The result will be saved in the text file provided in the "name" variable.
    # (The train_fasttex helper is not defined in this snippet; a hedged sketch appears after this example.)

    # In[7]:

    print 'Training the FastTex embeddings and saving the result in "nips_fasttex.txt"'
    print 'window = ', window, 'vectorsize= ', vectorsize
    msh_fastex = train_fasttex(papers_trigrams,
                               min_count=3,
                               window=window,
                               size=vectorsize,
                               name='nips_fasttex.txt')

    # ### Preparing data
    # At this stage, I will use the fast implementation of a K-means clustering algorithm in the H2O machine learning platform
    # to cluster all extracted words in our corpus based on their trained embeddings. These clusters can be used for multiple purposes:
    # 1. The top most frequent keywords in each cluster can be used as potentially important keywords related to the context of the papers and the field.
    # In other words, using this technique, we should be able to extract the most relevant keywords of the corpus that represent the NIPS conference. We will also know the context of each cluster of keywords.
    # One advantage of this methodology is that not only will we capture the dominant and most frequent sets of keywords, we will also identify small subsets of keywords related to a technical field that only a few papers have talked about.
    # 2. We will later use these word clusters as a proxy for the probability of each paper belonging to each of these clusters.

    # In[9]:

    h2o.init()

    # In[10]:

    print 'Converting data to H2O dataframe.'
    X = msh_fastex[msh_fastex.wv.vocab]
    vocabs = np.array(list(msh_fastex.wv.vocab))
    result = pd.DataFrame(X)
    result['word'] = vocabs
    result = result.set_index('word')
    newwd_h = h2o.H2OFrame(result.reset_index(),
                           column_names=list(
                               result.reset_index().columns.astype(str)))
    #result.head()

    # ### Kmeans clustering on words
    # The clustering of keywords happens here. I have designed this code with the capability of optimizing the number of clusters based on the minimum value of the BIC parameter.
    # We can provide minimum and maximum values of k and the step interval. The algorithm then performs a grid search over the parameter space and chooses the k value that minimizes BIC.
    # However, for the sake of simplicity, here I just randomly choose 20 clusters and work with that.
    # The results will be saved in a data frame called "newpd".
    # (A hedged sketch of the diagnostics_from_clusteringmodel helper, which is not defined in this snippet, appears after this example.)

    # In[11]:

    print 'Running Kmeans clustering of word embeddings'
    minn = 15
    maxx = 16
    step = 1
    results = [
        H2OKMeansEstimator(k=clusters,
                           init="PlusPlus",
                           seed=2,
                           standardize=True)
        for clusters in range(minn, maxx, step)
    ]
    for estimator in results:
        estimator.train(x=list(pd.DataFrame(result).columns),
                        training_frame=newwd_h[1:])

    diagnostics = pd.DataFrame(
        [diagnostics_from_clusteringmodel(model) for model in results])
    diagnostics.set_index('Clusters', inplace=True)

    best_cluster = diagnostics[diagnostics['BIC'] ==
                               diagnostics['BIC'].min()].index[0]
    print 'Number of clusters used, K: ', best_cluster
    # print results
    predicted = results[(best_cluster - minn) / step].predict(newwd_h)
    newwd_h["Cluster"] = predicted["predict"].asnumeric()
    newpd = newwd_h.as_data_frame(True)
    #newpd.head()

    # In[12]:
    """
    Here is a list of the word clusters, their unique word counts and their share of unique words in the whole corpus.
    """
    print 'Results: '
    member_count = newpd['Cluster'].value_counts()

    def report(a1, a2, a3):
        return 'Cluster: ' + str(a1) + ' ,Member counts: ' + str(
            a2) + ' ,Member share: %' + "%.2f" % a3

    print[
        report(member_count.index[i], member_count.iloc[i],
               100.0 * member_count.iloc[i] / member_count.sum())
        for i in range(member_count.shape[0])
    ]

    # ### Word Cloud
    # To show the result of our clustering I use the word cloud visualization and save the output in a pdf file with the provided filename.
    # I also calculate the weight of each word within its own cluster and output the result in a new dataframe.
    # Words with the highest weights in each cluster are the most frequent ones within their clusters and are shown with larger fonts.
    # Note: Although most of these words are very relevant and give us useful intuition and information about the context and content of the corpus,
    # this does not guarantee every single one of them is useful! So, overall, there will be clusters or words that might not necessarily carry any useful information about the content of the corpus. However, it only takes 10 seconds for a human to identify them.

    # In[38]:

    print 'Generating a pdf Wordcloud diagram "Clusters.pdf" with the top 10 words in each cluster'
    newpd2 = cloud_plot(newpd, papers_trigrams, 'Clusters.pdf')

    # In[39]:

    #Here I output the top 10 words of each cluster and save them in the file "top10words.csv". Clusters are sorted with respect to their size.
    print 'Saving the list of top 10 words for each cluster in "top10words.csv" file'

    def top10(df):
        return df.sort_values('weight', ascending=False).index.values[0:11]

    top10words = newpd2.groupby(['Cluster'
                                 ]).apply(top10)[member_count.index.values]
    top10words.to_csv('top10words.csv')
    print top10words

    # In[40]:

    print 'Generating a new corpus with Trigrams'
    # Since I made trigrams of the original corpus, I need to make a new corpus so I can use it in the following.
    flat_paper_id = pd.Series(
        [item for sublist in paper_id for item in sublist])
    flat_word = pd.Series(
        [item for sublist in papers_trigrams for item in sublist])
    newmsh = pd.DataFrame({'paper_id': flat_paper_id, 'word': flat_word})
    # Here I join the result of the clustering to the new paper corpus.
    newmsh = newmsh.set_index('word').join(newpd2, how='inner')

    # Here, I go through each paper and identify what percentage of its content is associated with each word-cluster. I output a csv file "pap_cluster_share.csv"
    # that has paper_id, cluster id and the percentage for each cluster.
    # This is an ad hoc approximation for understanding the content of each paper. For example, we can say the probability of assigning paper 5633 to clusters 12, 2 and 4 is ..., ... and ...
    # Note: The best method for doing probabilistic topic modeling is to use the LDA or LDAtoVec algorithms. Here I'm doing probabilistic topic modeling with an ad hoc method.

    # In[41]:

    print 'Calculating the share of each word-cluster in each paper. Saving the results in "pap_cluster_share.csv". The top 30 rows are shown in following. '
    pap_cluster_share = newmsh.groupby(
        ['paper_id',
         'Cluster']).weight.sum().reset_index().rename(columns={
             'weight': 'weight_sum'
         }).sort_values(['paper_id', 'weight_sum', 'Cluster'], ascending=False)
    sharepct = pap_cluster_share.groupby('paper_id').weight_sum.sum()
    pap_cluster_share = pap_cluster_share.set_index('paper_id').join(
        sharepct, rsuffix='_cumsum')
    pap_cluster_share['share_pct'] = 100.0 * pap_cluster_share[
        'weight_sum'] / pap_cluster_share['weight_sum_cumsum']
    pap_cluster_share = pap_cluster_share.drop(
        ['weight_sum', 'weight_sum_cumsum'], axis=1)
    pap_cluster_share.to_csv('pap_cluster_share.csv', index=True)
    print pap_cluster_share.head(30)

    # ## Another approach: Vector representation of papers.
    # #### Trying PCA on each paper.
    # The above approach has some caveats, and if we don't use it intelligently it might make our life more complicated. For example, the cluster with the majority of words will always be cluster number 1 for each paper.
    # To avoid these complications and simply cluster papers into a few topics, I use a different approach.
    # I leverage our FastTex embeddings and use PCA to make vector representations of papers. Then I simply run a k-means clustering on the vector representations of the papers.
    # I also use the BIC parameter for model selection and automatically find the optimal number of topics among papers.
    # In the following I treat each word as a feature and each FastTex column as a data row.
    # The result is an embedding vector for each paper.

    # In[42]:

    print 'Another approach for clustering papers: Generating paper embedding vectors using the PCA analysis.'
    cols = ['paper_id'] + list(np.arange(1000).astype(str))
    msh2_pca = pd.DataFrame([])
    pca = PCA(n_components=1)
    for paper_id in newmsh.paper_id.unique():
        result = pca.fit_transform(
            newmsh[newmsh.paper_id == paper_id].loc[:, cols].T.iloc[1:, :])
        msh2_pca = pd.concat([
            msh2_pca,
            pd.DataFrame(result).rename(columns={0: str(paper_id)})
        ],
                             axis=1)
    msh2_pca = msh2_pca.T.reset_index().rename(columns={'index': 'paper_id'})
    #msh2_pca.head()

    # In[43]:

    print 'Running Kmeans clustering on paper embedings and automatically finding the optimal number of clusters in the provided range.'
    newwd_h = h2o.H2OFrame(msh2_pca,
                           column_names=list(msh2_pca.columns.astype(str)))
    minn = 2
    maxx = 20
    step = 2
    results = [
        H2OKMeansEstimator(k=clusters,
                           init="PlusPlus",
                           seed=2,
                           standardize=True)
        for clusters in range(minn, maxx, step)
    ]
    for estimator in results:
        estimator.train(x=list(pd.DataFrame(msh2_pca.iloc[:, 1:]).columns),
                        training_frame=newwd_h[1:])

    diagnostics = pd.DataFrame(
        [diagnostics_from_clusteringmodel(model) for model in results])
    diagnostics.set_index('Clusters', inplace=True)
    diagnostics.plot(kind='line')

    best_cluster = diagnostics[diagnostics['BIC'] ==
                               diagnostics['BIC'].min()].index[0]
    print 'Number of topics K ', best_cluster
    # print results
    predicted = results[(best_cluster - minn) / step].predict(newwd_h)
    newwd_h["Cluster_PCA"] = predicted["predict"].asnumeric()
    newdocs2 = newwd_h.as_data_frame(True)
    #newdocs2.head()

    # #### The results of clustering, cluster ID and its member count

    # In[44]:

    print 'Results of paper clustering: '
    member_count = newdocs2['Cluster_PCA'].value_counts()
    print[
        report(member_count.index[i], member_count.iloc[i],
               100.0 * member_count.iloc[i] / member_count.sum())
        for i in range(member_count.shape[0])
    ]

    # #### Final results
    # I join the dataframe newdocs2 to our original dataframe pap to get paper titles and abstracts. I save the final clustering result as well as the paper embeddings in the Paper_Embedding_Cluster_PCA.csv file.

    # In[45]:

    print 'Joining the results to the original data file and saving the results in "Paper_Embedding_Cluster_PCA.csv" file. This file contains paper embeddings, associated clusters, titles and abstracts.'
    newdocs2 = newdocs2.set_index('paper_id').join(
        pap[['Id', 'Title', 'Abstract']].set_index('Id'), how='inner')
    newdocs2 = newdocs2.reset_index().rename(columns={'index': 'paper_id'})
    newdocs2.to_csv('Paper_Embedding_Cluster_PCA.csv', index=False)

    # In[46]:

    print 'Example: Top 20 rows of saved dataframe'
    print newdocs2[['Cluster_PCA',
                    'Title']].sort_values('Cluster_PCA').head(20)

    # #### Making a 2-dimensional TSNE plot with cluster IDs

    # In[47]:

    print 'Generating a 2D TSNE visualization of paper clusters, "TSNE_papers_PCA2.pdf". Cluster IDs are used as data point labels.'
    tsne_plot(newdocs2.iloc[:, :-2].set_index('Cluster_PCA').reset_index(),
              'TSNE_papers_PCA2.pdf')

    print 'Done!'
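Example #38 calls several helpers (prep, train_fasttex, diagnostics_from_clusteringmodel, cloud_plot, tsne_plot) that are not defined in the snippet. Below is a hedged sketch of two of them, under stated assumptions: train_fasttex assumes a pre-4.0 gensim FastText API (the snippet later reads model.wv.vocab, which only exists there), and diagnostics_from_clusteringmodel assumes H2O's standard clustering accessors plus a rough BIC-style score. These are illustrations, not the author's actual code.

# Hedged sketches of two helpers used above but not shown in the snippet.
import math
from gensim.models import FastText


def train_fasttex(sentences, min_count=3, window=5, size=1000, name='nips_fasttex.txt'):
    # Assumes gensim < 4.0 (size= keyword, model.wv.vocab available downstream).
    model = FastText(sentences, size=size, window=window, min_count=min_count)
    model.wv.save_word2vec_format(name)  # plain-text embedding file, as described in the comments
    return model


def diagnostics_from_clusteringmodel(model):
    # Pull clustering summaries from a trained H2OKMeansEstimator and compute a
    # rough BIC-style score; the exact BIC formula here is an assumption.
    k = len(model.centers())
    n = sum(model.size())                 # total rows, from per-cluster sizes
    d = len(model.centers()[0])           # embedding dimensionality
    within_ss = model.tot_withinss()
    bic = n * math.log(within_ss / n) + math.log(n) * k * d
    return {'Clusters': k,
            'Total_Within_SS': within_ss,
            'Between_SS': model.betweenss(),
            'Total_SS': model.totss(),
            'BIC': bic}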
Example #39
 def setUpClass(cls):
     # assume we're at 0xdata with it's hdfs namenode
     h2o.init(1, java_heap_GB=14)
Example #40
 def setUpClass(cls):
     global SEED
     SEED = h2o.setup_random_seed()
     h2o.init(1, java_heap_GB=14)
Example #41
    i = 1
    while i <= 6:
        arg = str(argv[i])
        if arg == "--trainDataFile":
            train_data_file = str(argv[i+1])
        elif arg == "--memory":
            memory = str(argv[i+1])
        elif arg == "--target":
            target = str(argv[i+1])
        i += 2


if __name__ == "__main__":
    parse_args(sys.argv)

h2o.init(ip=socket.gethostbyname(socket.gethostname()), port="54321", start_h2o=False)

train = h2o.import_file(train_data_file)

x = train.columns
y = target
x.remove(y)

train[y] = train[y].asfactor()

aml = H2OAutoML(max_runtime_secs=60)
aml.train(x=x, y=y, training_frame=train)

lb = aml.leaderboard
print(lb)
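As a hedged follow-up not in the original snippet: once training finishes, the best model is available as aml.leader and can be used for predictions or persisted; the path below is illustrative.

# Hedged follow-up sketch: use and persist the AutoML leader.
best = aml.leader
preds = best.predict(train)  # ideally a held-out frame rather than the training data
model_path = h2o.save_model(model=best, path="/tmp/automl_leader", force=True)
print(model_path)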
Example #42
                                                             codes)
    data_test.loc[:, categorical_columns] = data_test.loc[:, categorical_columns].apply(
        lambda x: x.cat.codes)
labels_train = data_train.pop(target_column_name)
features_train = data_train
labels_test = data_test.pop(target_column_name)
features_test = data_test

# Train the model.
if args.library == 'h2o':
    import h2o
    from h2o.estimators import H2OGradientBoostingEstimator
    h2o.init(max_mem_size=20480000 * 1000)
    data_train = pd.concat([features_train, labels_train], axis=1)
    data_test = pd.concat([features_test, labels_test], axis=1)
    data_train = h2o.H2OFrame(python_obj=data_train)
    data_test = h2o.H2OFrame(python_obj=data_test)
    feature_column_names = [
        column for column in data_train.columns if column != target_column_name
    ]
    model = H2OGradientBoostingEstimator(
        distribution="gaussian",
        learn_rate=0.1,
        ntrees=100,
    )
    model.train(
        training_frame=data_train,
        y=target_column_name,
Example #43
 def setUpClass(cls):
     global SEED
     SEED = h2o.setup_random_seed()
     java_extra_args='-XX:+PrintGCDetails'
     h2o.init(1, java_heap_GB=10, java_extra_args=java_extra_args)
Ejemplo n.º 44
0
'''
A1 Benchmark
------------

This event flow generates a gradient boosting model for the
classification approach.
'''

import h2o
from h2o.estimators import H2OGradientBoostingEstimator

print('A1 Benchmark')
print('------------')

# Initialize H2O server
h2o.init(max_mem_size_GB=5)

# Load train and test data as H2O frames
train = h2o.import_file('processed-data/A1Benchmark_train.csv')
test = h2o.import_file('processed-data/A1Benchmark_test.csv')

# Define input and response columns
response_column = 'is_anomaly'
input_columns = train.col_names
input_columns.remove(response_column)
input_columns.remove('timestamp')

print('Input columns   :', input_columns)
print('Response column :', response_column)

# Explicitly mark the response column as a factor so it is treated as label data
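# (Added sketch, not part of the original snippet: the example is cut off at this point.
#  Given the comment above, a plausible continuation is to convert the response column to a
#  factor and fit the gradient boosting model; the hyperparameters below are assumptions.)
train[response_column] = train[response_column].asfactor()
test[response_column] = test[response_column].asfactor()

gbm = H2OGradientBoostingEstimator(ntrees=100, max_depth=5, learn_rate=0.1, seed=42)
gbm.train(x=input_columns, y=response_column, training_frame=train)
print(gbm.model_performance(test_data=test))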
Ejemplo n.º 45
0
 def setUpClass(cls):
     h2o.init(1, java_heap_GB=14)
Ejemplo n.º 46
0
def binop_plus(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader_65_rows.csv"))
    rows, cols = iris.dim()
    iris.show()

    ###################################################################

    # LHS: scalar, RHS: H2OFrame
    res = 2 + iris
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == cols, "dimension mismatch"
    for x, y in zip([res[c].sum() for c in range(cols-1)], [469.9, 342.6, 266.9, 162.2]):
        assert abs(x - y) < 1e-1,  "expected same values"

    # LHS: scalar, RHS: H2OVec
    res = 2 + iris[1]
    assert abs(sum([res[i] for i in range(rows)]) - 342.6) < 1e-1, "expected same values"

    # LHS: scalar, RHS: scalar
    res = 2 + iris[0]
    res2 = 1.1 + res[21]
    assert abs(res2 - 8.2) < 1e-1, "expected same values"

    ###################################################################

    # LHS: scalar, RHS: H2OFrame
    res = 1.2 + iris[2]
    res2 = res[21] + iris
    res2.show()


    # LHS: scalar, RHS: H2OVec
    res = 1.2 + iris[2]
    res2 = res[21] + iris[1]
    res2.show()

    # LHS: scalar, RHS: scalar
    res = 1.1 + iris[2]
    res2 = res[21] + res[10]
    assert abs(res2 - 5.2) < 1e-1, "expected same values"

    # LHS: scalar, RHS: scalar
    res = 2 + iris[0]
    res2 = res[21] + 3
    assert abs(res2 - 10.1) < 1e-1, "expected same values"

    ###################################################################

    # LHS: H2OVec, RHS: H2OFrame
    #try:
    #    res = iris[2] + iris
    #    res.show()
    #    assert False, "expected error. objects with different dimensions not supported."
    #except EnvironmentError:
    #    pass

    # LHS: H2OVec, RHS: H2OVec
    res = iris[0] + iris[1]
    assert abs(sum([res[i] for i in range(rows)]) - 552.5) < 1e-1, "expected same values"

    res = iris[2] + iris[1]
    assert abs(sum([res[i] for i in range(rows)]) - 349.5) < 1e-1, "expected same values"

    # LHS: H2OVec, RHS: scalar
    res = 1.2 + iris[2]
    res2 = iris[1] + res[21]
    res2.show()

    # LHS: H2OVec, RHS: scalar
    res = iris[0] + 2
    assert abs(sum([res[i] for i in range(rows)]) - 469.9) < 1e-2, "expected same column sum"

    ###################################################################

    # LHS: H2OFrame, RHS: H2OFrame
    res = iris + iris
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == cols, "dimension mismatch"

    res = iris[0:2] + iris[1:3]
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == 2, "dimension mismatch"

    #try:
    #    res = iris + iris[0:3]
    #    res.show()
    #    assert False, "expected error. frames are different dimensions."
    #except EnvironmentError:
    #    pass

    # LHS: H2OFrame, RHS: H2OVec
    #try:
    #    res = iris + iris[0]
    #    res.show()
    #    assert False, "expected error. objects of different dimensions not supported."
    #except EnvironmentError:
    #    pass

    # LHS: H2OFrame, RHS: scalar
    res = 1.2 + iris[2]
    res2 = iris + res[21]
    res2.show()

    # LHS: H2OFrame, RHS: scalar
    res = iris + 2
    res_rows, res_cols = res.dim()
    assert res_rows == rows and res_cols == cols, "dimension mismatch"
    for x, y in zip([res[c].sum() for c in range(cols-1)], [469.9, 342.6, 266.9, 162.2]):
        assert abs(x - y) < 1e-1,  "expected same values"
Ejemplo n.º 47
0
df = df.loc[:, ["file", "jrepublican"] + [x for x in pvars1 + pvars2]]

## For randomization check, we want to estimate two models, a benchmark model
## and a full pretreatment model. See paper for details.
models = ["bench", "full"]

## Create dataframes to store ROC curve data, performance data and predictions
roc = pd.DataFrame({"model": [], "algorithm": [], "fpr": [], "tpr": []})
perfs = pd.DataFrame({"model": [], "algorithm": [], "mse_tf_cv": [],
                      "auc_tf_cv": [], "mse_vf": [], "auc_vf": [], "obs_tf": [], "obs_vf": []})
preds = pd.DataFrame({"model": [], "file": [], "outcome": [],
                      "my_rf": [], "my_lasso": [], "my_ols": [], "my_ensemble": []})

try:
    ## Initialize the cluster and convert data into h2o format
    h2o.init(max_mem_size="32G")
    hf = h2o.H2OFrame(df)
    hf["jrepublican"] = hf["jrepublican"].asfactor()

    for model in models:

        ## Create empty dataframes to collect cross-validation holdout
        ## predictions and out-of-sample predictions
        pr0 = pd.DataFrame(df.loc[:, ["file", "jrepublican"]])
        pr1 = pd.DataFrame(df.loc[:, ["file", "jrepublican"]])
        pr0 = pr0.reset_index().loc[:, ["file", "jrepublican"]]
        pr1 = pr1.reset_index().loc[:, ["file", "jrepublican"]]

        tf = hf
        vf = hf
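        ## (Added sketch, not part of the original snippet: the example is cut off here. The
        ##  prediction columns in `preds` suggest several learners are fit next; as one
        ##  illustration, a cross-validated random forest could be trained on tf and its
        ##  holdout / out-of-sample predictions collected. Names and settings are assumptions.)
        # from h2o.estimators.random_forest import H2ORandomForestEstimator
        # x_cols = [c for c in hf.columns if c not in ("file", "jrepublican")]
        # rf = H2ORandomForestEstimator(nfolds=5, keep_cross_validation_predictions=True, seed=1)
        # rf.train(x=x_cols, y="jrepublican", training_frame=tf)
        # pr0["my_rf"] = rf.cross_validation_holdout_predictions().as_data_frame()["p1"].values
        # pr1["my_rf"] = rf.predict(vf).as_data_frame()["p1"].values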
Ejemplo n.º 48
0
def vec_math_ops(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    sin_cos_tan_atan_sinh_cosh_tanh_asinh_data = [[random.uniform(-10,10) for r in range(10)] for c in range(10)]
    asin_acos_atanh_data = [[random.uniform(-1,1) for r in range(10)] for c in range(10)]
    acosh_data = [[random.uniform(1,10) for r in range(10)] for c in range(10)]
    abs_data = [[random.uniform(-100000,0) for r in range(10)] for c in range(10)]
    zero_one_data = [random.randint(0,1) for c in range(10)]
    zero_one_data = [zero_one_data, zero_one_data]

    h2o_data1 = h2o.H2OFrame(python_obj=sin_cos_tan_atan_sinh_cosh_tanh_asinh_data)
    h2o_data2 = h2o.H2OFrame(python_obj=asin_acos_atanh_data)
    h2o_data3 = h2o.H2OFrame(python_obj=acosh_data)
    h2o_data4 = h2o.H2OFrame(python_obj=abs_data)
    h2o_data5 = h2o.H2OFrame(python_obj=zero_one_data)

    np_data1 = np.array(sin_cos_tan_atan_sinh_cosh_tanh_asinh_data)
    np_data2 = np.array(asin_acos_atanh_data)
    np_data3 = np.array(acosh_data)
    np_data4 = np.array(abs_data)
    np_data5 = np.array(zero_one_data)

    row, col = h2o_data1.dim()

    c = random.randint(0,col-1)
    for d in range(1,6):
        h2o_signif = h2o_data5[c].signif(digits=d)
        h2o_round = h2o_data5[c].round(digits=d+4)
        s = h2o_signif[0]
        r = h2o_round[0]
        assert s == r, "Expected these to be equal, but signif: {0}, round: {1}".format(s, r)
    h2o_transposed = h2o_data1[c].transpose()
    x, y = h2o_transposed.dim()
    assert x == 1 and y == 10, "Expected 1 row and 10 columns, but got {0} rows and {1} columns".format(x,y)
    h2o.np_comparison_check(h2o_data1[:,c].cos(), np.cos(np_data1[:,c]), 10)
    h2o.np_comparison_check(h2o_data1[:,c].sin(), np.sin(np_data1[:,c]), 10)
    h2o.np_comparison_check(h2o_data1[:,c].tan(), np.tan(np_data1[:,c]), 10)
    h2o.np_comparison_check(h2o_data2[:,c].acos(), np.arccos(np_data2[:,c]), 10)
    h2o.np_comparison_check(h2o_data2[:,c].asin(), np.arcsin(np_data2[:,c]), 10)
    h2o.np_comparison_check(h2o_data1[:,c].atan(), np.arctan(np_data1[:,c]), 10)
    h2o.np_comparison_check(h2o_data1[:,c].cosh(), np.cosh(np_data1[:,c]), 10)
    h2o.np_comparison_check(h2o_data1[c].sinh(), np.sinh(np_data1[:,c]), 10)
    h2o.np_comparison_check(h2o_data1[c].tanh(), np.tanh(np_data1[:,c]), 10)
    h2o.np_comparison_check(h2o_data3[c].acosh(), np.arccosh(np_data3[:,c]), 10)
    h2o.np_comparison_check(h2o_data1[c].asinh(), np.arcsinh(np_data1[:,c]), 10)
    h2o_val = h2o_data3[c].gamma()[5,:]
    num_val = math.gamma(h2o_data3[5,c])
    assert abs(h2o_val - num_val) <  max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal gamma values between h2o and" \
        "math".format(h2o_val,num_val)
    h2o_val = h2o_data3[c].lgamma()[5,:]
    num_val = math.lgamma(h2o_data3[5,c])
    assert abs(h2o_val - num_val) <  max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal lgamma values between h2o and " \
        "math".format(h2o_val,num_val)
    h2o_val = h2o_data3[c].digamma()[5,:]._scalar()
    num_val = scipy.special.polygamma(0,h2o_data3[5,c])
    assert abs(h2o_val - num_val) <  max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal digamma values between h2o and " \
        "math".format(h2o_val,num_val)
    h2o_val = h2o_data3[c].trigamma()[5,:]
    num_val = scipy.special.polygamma(1,h2o_data3[5,c])
    assert abs(h2o_val - float(num_val)) <  max(abs(h2o_val), abs(num_val)) * 1e-6, \
        "check unsuccessful! h2o computed {0} and math computed {1}. expected equal trigamma values between h2o and " \
        "math".format(h2o_val,num_val)
Ejemplo n.º 49
0
def vec_scaler_comparisons(ip, port):
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    air = h2o.import_frame(
        path=h2o.locate("smalldata/airlines/allyears2k_headers.zip"))
    rows, cols = air.dim()

    ## H2OVec/scalar
    # ==
    row_sum = 0
    for level in air.levels(16):
        if level == "ANC": continue  # TODO: there's a bug here
        r, c = air[air["Origin"] == str(level)].dim()
        row_sum = row_sum + r
    assert row_sum == rows - 1, "expected equal number of rows"

    # ==, !=
    jan = air[air["Month"] == 1]
    not_jan = air[air["Month"] != 1]
    no_rows, no_cols = not_jan.dim()
    yes_rows, yes_cols = jan.dim()
    assert (
        no_rows + yes_rows
    ) == rows and no_cols == yes_cols == cols, "expected equal number of rows and cols"

    # >, <=
    g = air[air["Year"] > 1990]
    L = air[air["Year"] <= 1990]
    g_rows, g_cols = g.dim()
    L_rows, L_cols = L.dim()
    assert (
        L_rows + g_rows
    ) == rows and L_cols == g_cols == cols, "expected equal number of rows and cols"

    # >=, <
    G = air[air["DayofMonth"] >= 15]
    l = air[air["DayofMonth"] < 15]
    G_rows, G_cols = G.dim()
    l_rows, l_cols = l.dim()
    assert (
        l_rows + G_rows
    ) == rows and l_cols == G_cols == cols, "expected equal number of rows and cols"

    ## scalar/H2OVec
    # ==
    row_sum = 0
    for level in air.levels(16):
        if level == "ANC": continue
        r, c = air[str(level) == air["Origin"]].dim()
        row_sum = row_sum + r
    assert row_sum == rows - 1, "expected equal number of rows"

    # ==, !=
    jan = air[1 == air["Month"]]
    not_jan = air[1 != air["Month"]]
    no_rows, no_cols = not_jan.dim()
    yes_rows, yes_cols = jan.dim()
    assert (
        no_rows + yes_rows
    ) == rows and no_cols == yes_cols == cols, "expected equal number of rows and cols"

    # >, <=
    g = air[1990 <= air["Year"]]
    L = air[1990 > air["Year"]]
    g_rows, g_cols = g.dim()
    L_rows, L_cols = L.dim()
    assert (
        L_rows + g_rows
    ) == rows and L_cols == g_cols == cols, "expected equal number of rows and cols"

    # >=, <
    G = air[15 < air["DayofMonth"]]
    l = air[15 >= air["DayofMonth"]]
    G_rows, G_cols = G.dim()
    l_rows, l_cols = l.dim()
    assert (
        l_rows + G_rows
    ) == rows and l_cols == G_cols == cols, "expected equal number of rows and cols"
Ejemplo n.º 50
0
    def exec(self):

        log.info('[START] {}'.format("exec"))

        try:

            if (platform.system() == 'Windows'):

                globalVar['inpPath'] = 'E:/DATA/OUTPUT'
                globalVar['outPath'] = 'E:/DATA/OUTPUT'
                globalVar['modelPath'] = 'E:/DATA'

                # Option settings
                sysOpt = {
                    # Start/end dates
                    'srtDate': '2020-09-01',
                    'endDate': '2021-11-01',
                    # Model version (date)
                    'modelVer': '*',
                    # 'modelVer': '20220220'
                }

            else:

                # Option settings
                sysOpt = {
                    # Start/end dates
                    'srtDate': globalVar['srtDate'],
                    'endDate': globalVar['endDate'],
                    # Model version (date)
                    'modelVer': '*',
                    # 'modelVer': '20220220'
                }

            # modelDirKeyList = ['AI_2Y']
            # figActDirKeyList = ['ACT_2Y']
            # figForDirKeyList = ['FOR_2Y']
            #
            # for k, modelDirKey in enumerate(modelDirKeyList):
            #     figActDirKey = figActDirKeyList[k]
            #     figForDirKey = figForDirKeyList[k]

            modelDirKey = 'AI_2Y'
            figActDirKey = 'ACT_2Y'
            figForDirKey = 'FOR_2Y'
            modelVer = sysOpt['modelVer']

            isDlModelInit = False

            # DB connection info
            pymysql.install_as_MySQLdb()

            # Read configuration settings
            config = configparser.ConfigParser()
            config.read(globalVar['sysPath'], encoding='utf-8')
            dbUser = config.get('mariadb', 'user')
            dbPwd = config.get('mariadb', 'pwd')
            dbHost = config.get('mariadb', 'host')
            dbPort = config.get('mariadb', 'port')
            dbName = config.get('mariadb', 'dbName')

            import sqlalchemy
            from sqlalchemy.ext.declarative import declarative_base

            # dbCon = create_engine('mysql://{0}:{1}@{2}:{3}/{4}'.format(dbUser, dbPwd, dbHost, dbPort, dbName))
            dbCon = create_engine('mariadb://{0}:{1}@{2}:{3}/{4}'.format(
                dbUser, dbPwd, dbHost, dbPort, dbName))

            # Station info
            # inpPosFile = '{}/{}'.format(globalVar['cfgPath'], 'stnInfo/GA_STN_INFO.xlsx')
            # posData = pd.read_excel(inpPosFile)
            # posDataL1 = posData[['id', 'lat', 'lon']]

            res = dbCon.execute("""
                SELECT *
                FROM TB_STN_INFO
                """).fetchall()

            posDataL1 = pd.DataFrame(res).rename(
                {
                    'ID': 'id',
                    'dtDateKst': 'DATE_TIME_KST',
                    'LAT': 'lat',
                    'LON': 'lon'
                },
                axis='columns')

            lat1D = np.array(posDataL1['lat'])
            lon1D = np.array(posDataL1['lon'])

            # *******************************************************
            # Read UM (Unified Model) data
            # *******************************************************
            dtSrtDate = pd.to_datetime(sysOpt['srtDate'], format='%Y-%m-%d')
            dtEndDate = pd.to_datetime(sysOpt['endDate'], format='%Y-%m-%d')
            dtIncDateList = pd.date_range(start=dtSrtDate,
                                          end=dtEndDate,
                                          freq=Day(1))

            # posLon = posInfo['lon']
            # posLat = posInfo['lat']
            # lon1D = np.array(posLon).reshape(1)
            # lat1D = np.array(posLat).reshape(1)

            cfgFile = '{}/{}'.format(
                globalVar['cfgPath'],
                'modelInfo/UMKR_l015_unis_H000_202110010000.grb2')
            # log.info("[CHECK] cfgFile : {}".format(cfgFile))

            cfgInfo = pygrib.open(cfgFile).select(name='Temperature')[1]
            lat2D, lon2D = cfgInfo.latlons()

            # =======================================================================
            # Nearest-neighbor grid coordinates
            # =======================================================================
            posList = []

            # Seed points for the k-d tree
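            # (Added note: `cartesian` is assumed to be a helper defined elsewhere in this
            #  module that converts (lat, lon) in degrees to 3-D Cartesian coordinates, so
            #  that Euclidean distance inside the k-d tree approximates great-circle
            #  proximity. Every UM grid cell goes into the tree, and each station is later
            #  matched to its nearest grid cell via tree.query().)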
            for i in range(0, lon2D.shape[0]):
                for j in range(0, lon2D.shape[1]):
                    coord = [lat2D[i, j], lon2D[i, j]]
                    posList.append(cartesian(*coord))

            tree = spatial.KDTree(posList)

            # coord = cartesian(posInfo['lat'], posInfo['lon'])
            row1D = []
            col1D = []
            for ii, posInfo in posDataL1.iterrows():
                coord = cartesian(posInfo['lat'], posInfo['lon'])
                closest = tree.query([coord], k=1)
                cloIdx = closest[1][0]
                row = int(cloIdx / lon2D.shape[1])
                col = cloIdx % lon2D.shape[1]

                row1D.append(row)
                col1D.append(col)

            row2D, col2D = np.meshgrid(row1D, col1D)

            # dtIncDateInfo = dtIncDateList[0]
            dsDataL2 = xr.Dataset()
            for ii, dtIncDateInfo in enumerate(dtIncDateList):
                log.info("[CHECK] dtIncDateInfo : {}".format(dtIncDateInfo))

                # UMKR_l015_unis_H001_202110010000.grb2
                # saveFile = '{}/TEST/MODEL/UMKR_l015_unis_{}_{}.nc'.format(globalVar['outPath'], pd.to_datetime(dtSrtDate).strftime('%Y%m%d'), pd.to_datetime(dtEndDate).strftime('%Y%m%d'))

                # if (os.path.exists(saveFile)):
                #     continue

                dtDateYm = dtIncDateInfo.strftime('%Y%m')
                dtDateDay = dtIncDateInfo.strftime('%d')
                dtDateHour = dtIncDateInfo.strftime('%H')
                dtDateYmd = dtIncDateInfo.strftime('%Y%m%d')
                dtDateHm = dtIncDateInfo.strftime('%H%M')
                dtDateYmdHm = dtIncDateInfo.strftime('%Y%m%d%H%M')

                # UMKR_l015_unis_H001_202110010000.grb2
                inpFilePattern = 'MODEL/{}/{}/{}/UMKR_l015_unis_*_{}.grb2'.format(
                    dtDateYm, dtDateDay, dtDateHour, dtDateYmdHm)
                inpFile = '{}/{}'.format(globalVar['inpPath'], inpFilePattern)
                fileList = sorted(glob.glob(inpFile))

                if (len(fileList) < 1): continue
                # raise Exception("[ERROR] fileInfo : {} : {}".format("Please check the input data.", inpFile))

                # fileInfo = fileList[2]
                for jj, fileInfo in enumerate(fileList):
                    log.info("[CHECK] fileInfo : {}".format(fileInfo))

                    try:
                        grb = pygrib.open(fileInfo)
                        grbInfo = grb.select(name='Temperature')[1]

                        validIdx = int(
                            re.findall('H\d{3}', fileInfo)[0].replace('H', ''))
                        dtValidDate = grbInfo.validDate
                        dtAnalDate = grbInfo.analDate

                        uVec = grb.select(
                            name='10 metre U wind component')[0].values[row2D,
                                                                        col2D]
                        vVec = grb.select(
                            name='10 metre V wind component')[0].values[row2D,
                                                                        col2D]
                        WD = (270 - np.rad2deg(np.arctan2(vVec, uVec))) % 360
                        WS = np.sqrt(np.square(uVec) + np.square(vVec))
                        PA = grb.select(
                            name='Surface pressure')[0].values[row2D, col2D]
                        TA = grbInfo.values[row2D, col2D]
                        TD = grb.select(
                            name='Dew point temperature')[0].values[row2D,
                                                                    col2D]
                        HM = grb.select(
                            name='Relative humidity')[0].values[row2D, col2D]
                        lowCA = grb.select(
                            name='Low cloud cover')[0].values[row2D, col2D]
                        medCA = grb.select(
                            name='Medium cloud cover')[0].values[row2D, col2D]
                        higCA = grb.select(
                            name='High cloud cover')[0].values[row2D, col2D]
                        CA_TOT = np.mean([lowCA, medCA, higCA], axis=0)
                        SS = grb.select(name='unknown')[0].values[row2D, col2D]

                        dsDataL1 = xr.Dataset(
                            {
                                'uVec':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (uVec).reshape(1, 1, len(lat1D), len(lon1D))),
                                'vVec':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (vVec).reshape(1, 1, len(lat1D), len(lon1D))),
                                'WD':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (WD).reshape(1, 1, len(lat1D), len(lon1D))),
                                'WS':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (WS).reshape(1, 1, len(lat1D), len(lon1D))),
                                'PA':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (PA).reshape(1, 1, len(lat1D), len(lon1D))),
                                'TA':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (TA).reshape(1, 1, len(lat1D), len(lon1D))),
                                'TD':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (TD).reshape(1, 1, len(lat1D), len(lon1D))),
                                'HM':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (HM).reshape(1, 1, len(lat1D), len(lon1D))),
                                'lowCA': (('anaTime', 'time', 'lat', 'lon'),
                                          (lowCA).reshape(
                                              1, 1, len(lat1D), len(lon1D))),
                                'medCA': (('anaTime', 'time', 'lat', 'lon'),
                                          (medCA).reshape(
                                              1, 1, len(lat1D), len(lon1D))),
                                'higCA': (('anaTime', 'time', 'lat', 'lon'),
                                          (higCA).reshape(
                                              1, 1, len(lat1D), len(lon1D))),
                                'CA_TOT': (('anaTime', 'time', 'lat', 'lon'),
                                           (CA_TOT).reshape(
                                               1, 1, len(lat1D), len(lon1D))),
                                'SS':
                                (('anaTime', 'time', 'lat', 'lon'),
                                 (SS).reshape(1, 1, len(lat1D), len(lon1D)))
                            },
                            coords={
                                'anaTime': pd.date_range(dtAnalDate,
                                                         periods=1),
                                'time': pd.date_range(dtValidDate, periods=1),
                                'lat': lat1D,
                                'lon': lon1D
                            })

                    except Exception as e:
                        log.error("Exception : {}".format(e))

                    for kk, posInfo in posDataL1.iterrows():
                        posId = int(posInfo['id'])
                        posLat = posInfo['lat']
                        posLon = posInfo['lon']

                        log.info(
                            "[CHECK] posId (posLon, posLat) : {} ({}, {})".
                            format(posId, posLon, posLat))

                        # umData = dsDataL2
                        umData = dsDataL1
                        dtAnaTimeList = umData['anaTime'].values
                        # umDataL8 = pd.DataFrame()
                        for ll, dtAnaTimeInfo in enumerate(dtAnaTimeList):
                            log.info("[CHECK] dtAnaTimeInfo : {}".format(
                                dtAnaTimeInfo))

                            try:
                                umDataL2 = umData.sel(lat=posLat,
                                                      lon=posLon,
                                                      anaTime=dtAnaTimeInfo)
                                umDataL3 = umDataL2.to_dataframe().dropna(
                                ).reset_index(drop=True)
                                # umDataL3['dtDate'] = pd.to_datetime(dtAnaTimeInfo) + (umDataL3.index.values * datetime.timedelta(hours=1))
                                umDataL3['dtDate'] = pd.to_datetime(
                                    dtAnaTimeInfo) + (
                                        validIdx * datetime.timedelta(hours=1))
                                # umDataL3['dtDateKst'] = umDataL3.index.tz_localize(tzUtc).tz_convert(tzKst)
                                umDataL3[
                                    'dtDateKst'] = umDataL3['dtDate'] + dtKst
                                umDataL4 = umDataL3.rename({'SS': 'SWR'},
                                                           axis='columns')
                                umDataL5 = umDataL4[[
                                    'dtDateKst', 'dtDate', 'CA_TOT', 'HM',
                                    'PA', 'TA', 'TD', 'WD', 'WS', 'SWR'
                                ]]
                                umDataL5['SRV'] = 'SRV{:05d}'.format(posId)
                                umDataL5['TA'] = umDataL5['TA'] - 273.15
                                umDataL5['TD'] = umDataL5['TD'] - 273.15
                                umDataL5['PA'] = umDataL5['PA'] / 100.0
                                umDataL5['CA_TOT'] = np.where(
                                    umDataL5['CA_TOT'] < 0, 0,
                                    umDataL5['CA_TOT'])
                                umDataL5['CA_TOT'] = np.where(
                                    umDataL5['CA_TOT'] > 1, 1,
                                    umDataL5['CA_TOT'])

                                umDataL6 = umDataL5
                                for i in umDataL6.index:
                                    lat = posLat
                                    lon = posLon
                                    pa = umDataL6._get_value(i, 'PA') * 100.0
                                    ta = umDataL6._get_value(i, 'TA')
                                    # dtDateTime = umDataL6._get_value(i, 'dtDateKst')
                                    dtDateTime = umDataL6._get_value(
                                        i, 'dtDate')

                                    solPosInfo = pvlib.solarposition.get_solarposition(
                                        dtDateTime,
                                        lat,
                                        lon,
                                        pressure=pa,
                                        temperature=ta,
                                        method='nrel_numpy')
                                    umDataL6._set_value(
                                        i, 'sza', solPosInfo['zenith'].values)
                                    umDataL6._set_value(
                                        i, 'aza', solPosInfo['azimuth'].values)
                                    umDataL6._set_value(
                                        i, 'et',
                                        solPosInfo['equation_of_time'].values)

                                # umDataL7 = umDataL6.merge(pvDataL2, how='left', left_on=['dtDateKst'], right_on=['dtDateKst'])
                                umDataL7 = umDataL6
                                umDataL7['anaTime'] = pd.to_datetime(
                                    dtAnaTimeInfo)

                                # umDataL8 = umDataL8.append(umDataL7)

                            except Exception as e:
                                log.error("Exception : {}".format(e))

                        # log.info("[CHECK] modelDirKey : {}".format(modelDirKey))
                        # log.info("[CHECK] figActDirKey : {}".format(figActDirKey))

                        # *******************************************************
                        # Read observation data
                        # *******************************************************
                        # inpData = pd.read_excel(fileInfo, engine='openpyxl')
                        # inpData = umDataL7
                        inpData = umDataL7
                        inpDataL1 = inpData.rename({'dtDate_x': 'dtDate'},
                                                   axis='columns')
                        # log.info("[CHECK] inpDataL1 : {}".format(inpDataL1))

                        # log.info("[CHECK] inpDataL1['SRV'] : {}".format(inpDataL1['SRV'][0]))
                        # log.info("[CHECK] inpDataL1['anaTime'] : {}".format(inpDataL1['anaTime'][0]))
                        # log.info("[CHECK] inpDataL1['dtDate'] : {}".format(inpDataL1['dtDate'][0]))

                        iAnaYear = int(inpDataL1['anaTime'][0].strftime("%Y"))

                        # Create the table if it does not exist
                        dbCon.execute(
                            """
                            create table IF NOT EXISTS TB_FOR_DATA_%s
                            (
                                SRV           varchar(10) not null comment 'station info',
                                ANA_DATE      date        not null comment 'forecast issue date',
                                DATE_TIME     datetime    not null comment 'forecast time (UTC)',
                                DATE_TIME_KST datetime    null comment 'forecast time (KST)',
                                CA_TOT        float       null comment 'total cloud cover',
                                HM            float       null comment 'relative humidity',
                                PA            float       null comment 'station pressure',
                                TA            float       null comment 'air temperature',
                                TD            float       null comment 'dew point temperature',
                                WD            float       null comment 'wind direction',
                                WS            float       null comment 'wind speed',
                                SZA           float       null comment 'solar zenith angle',
                                AZA           float       null comment 'solar azimuth angle',
                                ET            float       null comment 'solar hour angle',
                                SWR           float       null comment 'solar irradiance',
                                ML            float       null comment 'machine learning prediction',
                                DL            float       null comment 'deep learning prediction',
                                REG_DATE      datetime    null comment 'registration date',
                                MOD_DATE      datetime    null comment 'modification date',
                                primary key (SRV, DATE_TIME, ANA_DATE)
                            )
                                comment 'weather forecast table_%s';
                            """, (iAnaYear, iAnaYear))

                        keyChk = dbCon.execute(
                            """
                            SELECT COUNT(*) AS CNT
                            FROM TB_FOR_DATA_%s
                            WHERE  SRV = %s AND ANA_DATE = %s AND DATE_TIME = %s
                            """, (iAnaYear, inpDataL1['SRV'][0],
                                  inpDataL1['anaTime'][0],
                                  inpDataL1['dtDate'][0])).fetchone()

                        # log.info("[CHECK] keyChk['CNT'] : {}".format(keyChk['CNT']))

                        if (keyChk['CNT'] > 0): continue

                        # **********************************************************************************************************
                        # Machine learning
                        # **********************************************************************************************************
                        # saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(globalVar['modelPath'], modelDirKey, serviceName, posId, 'final', 'pycaret', 'for', '*')
                        # saveMlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model.pkl'.format(globalVar['modelPath'], modelDirKey, serviceName, posId, 'final', 'pycaret', 'for', '20220220')
                        # saveMlModelList = sorted(glob.glob(saveMlModel), reverse=True)
                        #
                        # # from pycaret.regression import *
                        #
                        # if (len(saveMlModelList) > 0):
                        #     saveMlModelInfo = saveMlModelList[0]
                        #     log.info("[CHECK] saveMlModelInfo : {}".format(saveMlModelInfo))
                        #
                        #     mlModel = load_model(os.path.splitext(saveMlModelInfo)[0])
                        #
                        # mlModelPred = predict_model(mlModel, data=inpDataL1).rename({'Label': 'ML'}, axis='columns')[['dtDateKst', 'anaTime', 'ML']]

                        # **********************************************************************************************************
                        # Deep learning
                        # **********************************************************************************************************
                        # saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(globalVar['modelPath'], modelDirKey, serviceName, posId, 'final', 'h2o', 'for', '*')
                        saveDlModel = '{}/{}/{}-SRV{:05d}-{}-{}-{}-{}.model'.format(
                            globalVar['modelPath'], modelDirKey, serviceName,
                            posId, 'final', 'h2o', 'for', modelVer)
                        saveDlModelList = sorted(glob.glob(saveDlModel),
                                                 reverse=True)

                        if (isDlModelInit == False):
                            h2o.init()
                            isDlModelInit = True

                        # Load the trained model
                        if (len(saveDlModelList) > 0):
                            saveDlModelInfo = saveDlModelList[0]
                            log.info("[CHECK] saveDlModelInfo : {}".format(
                                saveDlModelInfo))

                            dlModel = h2o.load_model(path=saveDlModelInfo)

                        tmpData = inpDataL1[[
                            'dtDateKst', 'anaTime', 'CA_TOT', 'HM', 'PA', 'TA',
                            'TD', 'WD', 'WS', 'SWR', 'sza', 'aza', 'et'
                        ]].dropna().reset_index(drop=True)
                        dlModelPred = dlModel.predict(
                            h2o.H2OFrame(tmpData)).as_data_frame().rename(
                                {'predict': 'DL'}, axis='columns')
                        dlModelPredL1 = pd.concat(
                            [tmpData[['dtDateKst', 'anaTime']], dlModelPred],
                            axis=1)

                        # Machine learning or deep learning
                        # inpDataL2 = inpDataL1.merge(mlModelPred, how='left', left_on=['dtDateKst', 'anaTime'],right_on=['dtDateKst', 'anaTime'])\
                        #     .merge(dlModelPredL1, how='left', left_on=['dtDateKst', 'anaTime'], right_on=['dtDateKst', 'anaTime'])

                        # Deep learning
                        inpDataL2 = inpDataL1.merge(
                            dlModelPredL1,
                            how='left',
                            left_on=['dtDateKst', 'anaTime'],
                            right_on=['dtDateKst', 'anaTime'])

                        # Drop duplicates based on dtDateKst and anaTime
                        inpDataL2.drop_duplicates(
                            subset=['dtDateKst', 'anaTime'], inplace=True)
                        inpDataL2 = inpDataL2.reset_index(drop=True)

                        dbData = inpDataL2.rename(
                            {
                                'anaTime': 'ANA_DATE',
                                'dtDateKst': 'DATE_TIME_KST',
                                'dtDate': 'DATE_TIME',
                                'sza': 'SZA',
                                'aza': 'AZA',
                                'et': 'ET'
                            },
                            axis='columns')

                        res = dbCon.execute(
                            """
                            SELECT COUNT(*) AS CNT
                            FROM TB_FOR_DATA_%s
                            WHERE  SRV = %s AND ANA_DATE = %s AND DATE_TIME = %s
                            """,
                            (iAnaYear, dbData['SRV'][0], dbData['ANA_DATE'][0],
                             dbData['DATE_TIME'][0])).fetchone()

                        log.info("[CHECK] res['CNT'] : {}".format(res['CNT']))

                        # Insert or update
                        if (res['CNT'] == 0):
                            dbData['REG_DATE'] = datetime.datetime.now()
                        else:
                            dbData['MOD_DATE'] = datetime.datetime.now()

                        # Insert
                        selDbTable = 'TB_FOR_DATA_{}'.format(iAnaYear)
                        dbData.to_sql(name=selDbTable,
                                      con=dbCon,
                                      if_exists='append',
                                      index=False)

        except Exception as e:
            log.error("Exception : {}".format(e))
            raise e

        finally:
            log.info('[END] {}'.format("exec"))
Ejemplo n.º 51
0
# Trains and saves a Random Forest model to a local directory

import pandas as pd
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

# Set Root Directory
directory = '/Users/liamroberts/Desktop/Datasets/Rossmann/'
save_folder = '/Users/LiamRoberts/rossmann_retail/models'

# Load Data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Initialize h2o cluster
h2o.init(nthreads=-1, max_mem_size='6G', enable_assertions=False)
h2o.remove_all()

# Convert DataFrames to h2o frames
train = h2o.H2OFrame(python_obj=train)
test = h2o.H2OFrame(python_obj=test)

# Encode Categorical Variables
categorical = ['Store', 'DayOfWeek', 'Month', 'WeekOfYear']

for label in categorical:
    train[label] = train[label].asfactor()
    test[label] = test[label].asfactor()

# Log transform sales
train['log_sales'] = train['Sales'].log()
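# (Added sketch, not part of the original snippet: the example is cut off before the model is
#  actually trained. One way to finish it, reusing the frames defined above; the predictor
#  list and hyperparameters are assumptions.)
predictors = [c for c in train.columns if c not in ('Sales', 'log_sales')]
rf = H2ORandomForestEstimator(ntrees=200, max_depth=20, seed=42)
rf.train(x=predictors, y='log_sales', training_frame=train)
model_path = h2o.save_model(model=rf, path=save_folder, force=True)
print(model_path)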
Ejemplo n.º 52
0
def init_server_connection(host_addr, port):
    h2o.init(ip=host_addr, port=port)
Ejemplo n.º 53
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        X = dt.Frame(X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None:
                sample_weight_eval_set = [
                    (sample_weight_eval_set[0] != 0).astype(int)
                ]

        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(
            y,
            column_names=[self.target],
            column_types=[
                'categorical' if self.num_classes >= 2 else 'numeric'
            ])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(),
                                   column_types=self.col_types)
            valid_y = h2o.H2OFrame(
                eval_set[0][1],
                column_names=[self.target],
                column_types=[
                    'categorical' if self.num_classes >= 2 else 'numeric'
                ])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs')
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight
            model = self.make_instance(**params)

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            for col in list(train_X.names):
                if not col.lower() == "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            # Models that can use an offset column
            if isinstance(model, H2OGBMModel) | isinstance(
                    model, H2ODLModel) | isinstance(model, H2OGLMModel):
                model.train(x=cols_to_train,
                            y=self.target,
                            training_frame=train_frame,
                            offset_column=offset_col,
                            **train_kwargs)
            else:
                model.train(x=train_X.names,
                            y=self.target,
                            training_frame=train_frame,
                            **train_kwargs)

            if isinstance(model, H2OAutoML):
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(user_dir(),
                                      "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
            for xx in [
                    train_frame, train_X, train_y, model, valid_frame, valid_X,
                    valid_y
            ]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
Ejemplo n.º 54
0
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(lat=40.7, lon=-73.9),
        pitch=0,
        zoom=8.5,
        style=mapbox_style,
    ),
    xaxis=dict(domain=[0.6, 1]),
)

fig = dict(data=data, layout=layout)

iplot(fig)

# In[1]:

# Multivariate analysis & Modeling part
## Import library for modeling part
# Library for splitting data into training and testing datasets
from sklearn.model_selection import train_test_split

# Library for h2o cloud
import h2o
h2o.remove_all()  # clean slate, in case a cluster was already running
h2o.init(max_mem_size="16g")

# Library for generalized linear modeling (GLM)
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
Ejemplo n.º 55
0
    def _cluster(request):
        """
        Cluster the incoming rows with H2O K-means. Tensor function.
        :param request: an iterable sequence of RowData
        :return: the same iterable sequence of row data as received
        """
        # Iterate over bundled rows
        for request_rows in request:
            response_rows = []
            params2 = []
            # Iterating over rows
            for row in request_rows.rows:
                # Retrieve the numerical value of the parameters
                # Several numeric columns are sent from the client; the fifth one carries the requested number of clusters
                params = [d.numData for d in row.duals]
                params2.append(params)

                #logging.info('Params2.{}'.format([d.numData for d in row.duals]))

                # Sum over each row
                #result = sum(params)

                # Create an iterable of Dual with a numerical value
                #duals = iter([SSE.Dual(numData=result)])

                # Append the row data constructed to response_rows
                #response_rows.append(SSE.Row(duals=duals))

            h2o.init()

            iris = h2o.H2OFrame(params2)

            logging.info('Params2.Edo.{}'.format(params2))

            clustersk = int(params2[0][4])
            logging.info('clustersk {}'.format(clustersk))

            results = H2OKMeansEstimator(k=clustersk,
                                         init="Random",
                                         seed=2,
                                         standardize=True)
            results.train(training_frame=iris)

            predicted = results.predict(iris)

            predicted_as_list = h2o.as_list(predicted, use_pandas=False)
            predicted_as_list.pop(0)

            logging.info('Params2.Edo.{}'.format(predicted))

            for result in predicted_as_list:
                # Create an iterable of Dual with a numerical value
                duals = iter([SSE.Dual(numData=int(result[0]))])
                # Append the row data constructed to response_rows
                response_rows.append(SSE.Row(duals=duals))

            results.model_id = "kmeans_iris"
            model_path = h2o.save_model(model=results, force=True)
            print(model_path)

            # Yield Row data as Bundled rows
            yield SSE.BundledRows(rows=response_rows)
Ejemplo n.º 56
0
data_0 = full_data.copy(deep=True)
data_0_columns = data_0.columns.tolist()

scaler = MinMaxScaler(feature_range=(0, 1))
data_0 = scaler.fit_transform(data_0)

#Data Prep
#for column_name in data_0.columns:
#    if max(data_0[column_name]) > 1:
#        scaler = MinMaxScaler(feature_range=(0,1))
#        data_0[column_name] = scaler.fit_transform(data_0[column_name])

############################################################################
#Train Test Split
h2o.init(nthreads=-1, max_mem_size=6)

data_0_h2o = h2o.H2OFrame(data_0)
data_0_h2o.columns = data_0_columns
data_0_h2o.shape
data_0_h2o_shape = data_0_h2o.shape

#data = data.drop(["Unnamed: 0"],axis=1)

data_0_h2o['Renewed'] = data_0_h2o['Renewed'].asfactor(
)  #encode the binary repsonse as a factor
data_0_h2o['Renewed'].levels()

splits = data_0_h2o.split_frame(ratios=[0.65, 0.00], seed=1)

train = splits[0]
Ejemplo n.º 57
0
 def setUpClass(cls):
     h2o.init(java_heap_GB=10)
Ejemplo n.º 58
0
 def setUpClass(cls):
     # assume we're at 0xdata with its HDFS namenode
     h2o.init(1,
              use_hdfs=True,
              hdfs_version='cdh4',
              hdfs_name_node='mr-0x6')
Ejemplo n.º 59
0
 def setUpClass(cls):
     global SEED
     SEED = h2o.setup_random_seed()
     h2o.init(2, java_heap_MB=1300, use_flatfile=True)
Ejemplo n.º 60
0
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":

        # TODO: Download files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=20000000)

        # TODO: Adapt paths accordingly
        fname_train_csv = "tmp/landsat_train_small_%lu.csv" % train_size
        fname_test_csv = "tmp/landsat_test.csv"

    traingen.to_csv(fname_train_csv, cache=False, remove=True)
    testgen.to_csv(fname_test_csv, cache=False, remove=True)

    import h2o
    from skutil.h2o import h2o_col_to_numpy
    h2o.init(max_mem_size="12G", nthreads=param['n_jobs'])
    h2o.remove_all()
    from h2o.estimators.random_forest import H2ORandomForestEstimator

    if dkey == "landsat_small" or dkey == "landsat":
        train_df = h2o.import_file(fname_train_csv)
        test_df = h2o.import_file(fname_test_csv)
        Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1]
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % train_df.shape[0])
    print("Number of test patterns:\t%i" % test_df.shape[0])
    print("Dimensionality of the data:\t%i\n" % train_df.shape[1])

    if param['max_features'] is None:
        mtries = train_df.shape[1] - 2
    elif param['max_features'] == "sqrt":
        mtries = int(math.sqrt(train_df.shape[1] - 2))

    if param['bootstrap'] == False:
        sample_rate = 1.0
    else:
        sample_rate = 0.632

    model = H2ORandomForestEstimator(
        mtries=mtries,
        sample_rate=sample_rate,
        #nbins=1000, #crash
        min_rows=1,
        build_tree_one_node=True,
        max_depth=20,
        balance_classes=False,
        ntrees=param['n_estimators'],
        seed=seed)

    # training
    fit_start_time = time.time()
    model.train(Xcols, ycol, training_frame=train_df)
    fit_end_time = time.time()

    # testing
    test_start_time = time.time()
    ypreds_test = model.predict(test_df)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(numpy.rint(ypreds_test.as_data_frame().values),
             test_df[ycol].as_data_frame().values, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
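
# Example invocation (hypothetical parameter values):
# single_run("landsat", train_size=1000000,
#            param={'n_estimators': 24, 'max_features': None, 'n_jobs': 4,
#                   'bootstrap': True, 'tree_type': 'h2o'},
#            seed=0)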