Example #1
def concat():
    df1 = h2o.create_frame(integer_fraction=1,binary_fraction=0,categorical_fraction=0,seed=1)
    df2 = h2o.create_frame(integer_fraction=1,binary_fraction=0,categorical_fraction=0,seed=2)
    df3 = h2o.create_frame(integer_fraction=1,binary_fraction=0,categorical_fraction=0,seed=3)

    print(df1)
    print(df2)
    print(df3)

    #Frame to Frame concat (column-wise)
    df123 = df1.concat([df2,df3])
    rows, cols = df123.dim
    print(rows,cols)
    print(df123)
    assert rows == 10000 and cols == 30, "unexpected dimensions in column concatenation for a Frame"

    #Frame to Frame concat (row wise)
    df123_row = df1.concat([df2,df3], axis = 0)
    rows2, cols2 = df123_row.dim
    print(rows2,cols2)
    print(df123_row)
    assert rows2 == 30000 and cols2 == 10, "unexpected dimensions in row concatenation for a Frame"

    #Frame to Vec concat (column wise)
    yy = df2[0]
    zz = df3[0]
    hdf = df1.concat([yy,zz])
    rows3, cols3 = hdf.dim
    print(rows3,cols3)
    print(hdf)
    assert rows3 == 10000 and cols3 == 12, "unexpected dimensions in Frame to Vec concatenation"

    #Vec to Vec concat (column wise)
    xx = df1[0]
    yy = df2[0]
    zz = df3[0]
    hdf2 = xx.concat([yy,zz])
    rows4, cols4 = hdf2.dim
    print(rows4,cols4)
    print(hdf2)
    assert rows4 == 10000 and cols4 == 3, "unexpected dimensions in Vec to Vec concatenation"

    #Frame to Vec concat (row wise)
    yy = df2[0,:]
    zz = df3[0,:]
    hdf3 = df1.concat([yy,zz],axis=0)
    rows5, cols5 = hdf3.dim
    print(rows5,cols5)
    print(hdf3)
    assert rows5 == 10002 and cols5 == 10, "unexpected dimensions in Frame to Vec concatenation"

    #Vec to Vec concat (row wise)
    xx = df1[0,:]
    yy = df2[0,:]
    zz = df3[0,:]
    hdf4 = xx.concat([yy,zz],axis=0)
    rows6, cols6 = hdf4.dim
    print(rows6,cols6)
    print(hdf4)
    assert rows6 == 3 and cols6 == 10, "unexpected dimensions in Vec to Vec concatenation"
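
create_frame defaults to 10000 rows and 10 columns, which is where the dimension assertions above come from. A minimal sketch of the same concat axis semantics on smaller frames, assuming a reachable H2O cluster started with h2o.init():

import h2o

h2o.init()
a = h2o.create_frame(rows=100, cols=5, integer_fraction=1,
                     binary_fraction=0, categorical_fraction=0, seed=1)
b = h2o.create_frame(rows=100, cols=5, integer_fraction=1,
                     binary_fraction=0, categorical_fraction=0, seed=2)
print(a.concat([b]).dim)          # axis=1 (default): columns stack -> [100, 10]
print(a.concat([b], axis=0).dim)  # axis=0: rows stack -> [200, 5]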
Example #2
def check_big_merge():
    h2o.remove_all()
    nrow = 1000000
    ncol = 2
    iRange = 100000
    frame1 = h2o.create_frame(rows=nrow,
                              cols=ncol,
                              integer_fraction=1,
                              seed=12345,
                              integer_range=iRange,
                              missing_fraction=0.0)
    frame2 = h2o.create_frame(rows=nrow,
                              cols=ncol,
                              integer_fraction=1,
                              seed=54321,
                              integer_range=iRange,
                              missing_fraction=0.0)

    frame1.set_names(["C1", "C2"])
    frame2.set_names(["C1", "C3"])

    mergedExact = frame1.merge(frame2,
                               by_x=["C1"],
                               by_y=["C1"],
                               all_x=False,
                               all_y=False)
    mergedLeft = frame1.merge(frame2, by_x=["C1"], by_y=["C1"], all_x=True)

    assert mergedExact.nrow < mergedLeft.nrow, "Expected the inner merge to return fewer rows than the left merge"
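
A tiny illustration of why the assertion holds: an inner merge keeps only rows whose key appears in both frames, while a left merge (all_x=True) keeps every row of the left frame. The toy frames below are an assumption for the sketch, not part of the original test:

import h2o

h2o.init()
left = h2o.H2OFrame({"C1": [1, 2, 3], "C2": [10, 20, 30]})
right = h2o.H2OFrame({"C1": [2, 3, 4], "C3": [200, 300, 400]})
inner = left.merge(right, by_x=["C1"], by_y=["C1"], all_x=False, all_y=False)
left_join = left.merge(right, by_x=["C1"], by_y=["C1"], all_x=True)
print(inner.nrow, left_join.nrow)  # 2 matching keys vs. all 3 left rows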
def pubdev_6304():
    fractions = dict()
    fractions["real_fraction"] = 0 # Right now we are dropping string columns, so no point in having them.
    fractions["categorical_fraction"] = 1
    fractions["integer_fraction"] = 0
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0 # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] = 0
    
    # this used to raise an error; it should now succeed
    try:
        traindata = h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False, factors=9999999, seed=12345, **fractions)
    except Exception as ex:
        sys.exit(1)

    # this call should raise an error
    try:
        traindata = h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False, factors=19999999, seed=12345, **fractions)
        sys.exit(1) # should have thrown an error
    except Exception as ex: # expect an error here
        print(ex)
        if 'Number of factors must be <= 10,000,000' in ex.args[0].dev_msg:
            sys.exit(0) # correct error message
        else:
            sys.exit(1) # something else is wrong.
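
The boundary exercised here is the server-side cap quoted in the error message: at most 10,000,000 factor levels per categorical column. A standalone sketch of the same check with the test's values, with the pyunit harness and sys.exit calls left out:

import h2o

h2o.init()
fractions = dict(real_fraction=0, categorical_fraction=1, integer_fraction=0,
                 time_fraction=0, string_fraction=0, binary_fraction=0)
# 9,999,999 levels sits under the cap and should be accepted.
h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False,
                 factors=9999999, seed=12345, **fractions)
try:
    # 19,999,999 levels exceeds the cap and should be rejected server-side.
    h2o.create_frame(rows=100, cols=2, missing_fraction=0, has_response=False,
                     factors=19999999, seed=12345, **fractions)
except Exception as ex:
    print(ex)  # expected to mention "Number of factors must be <= 10,000,000"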
Example #4
def test_parser_svmlight_column_skip():
    # generate a big frame with all datatypes and save it to svmlight
    nrow = 10000
    ncol = 50
    seed = 12345

    f1 = h2o.create_frame(rows=nrow,
                          cols=ncol,
                          real_fraction=0.5,
                          integer_fraction=0.5,
                          missing_fraction=0.2,
                          has_response=False,
                          seed=seed)
    f2 = h2o.create_frame(rows=nrow,
                          cols=1,
                          real_fraction=1,
                          integer_fraction=0,
                          missing_fraction=0,
                          has_response=False,
                          seed=seed)
    f2.set_name(0, "target")
    f1 = f2.cbind(f1)

    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results"))
    if not (os.path.isdir(tmpdir)):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'out.svm')
    pyunit_utils.write_H2OFrame_2_SVMLight(savefilenamewithpath,
                                           f1)  # write h2o frame to svm format

    skip_all = list(range(ncol))
    skip_even = list(range(0, ncol, 2))

    try:
        loadFileSkipAll = h2o.upload_file(savefilenamewithpath,
                                          skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:  # expected; a bare except would also swallow the sys.exit(1) above
        pass

    try:
        importFileSkipAll = h2o.import_file(savefilenamewithpath,
                                            skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    try:
        importFileSkipSome = h2o.import_file(savefilenamewithpath,
                                             skipped_columns=skip_even)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    # check for correct parsing only
    checkCorrectSkips(savefilenamewithpath, f1)
def word2vec_export():
    print("###### WORD2VEC ######")
    words = h2o.create_frame(rows=1000, cols=1, string_fraction=1.0, missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000, cols=100, real_fraction=1.0, missing_fraction=0.0)
    frame = words.cbind(embeddings)
    model = H2OWord2vecEstimator(pre_trained=frame)
    model.train(training_frame=frame)
    expect_error(model.download_pojo, model="Word2Vec", format="POJO")
    model.download_mojo(path=RESULT_DIR)
Example #7
def random_dataset(nrow,
                   ncol,
                   realFrac=0.4,
                   intFrac=0.3,
                   enumFrac=0.3,
                   factorR=10,
                   integerR=100,
                   responseFactor=1,
                   misFrac=0.01,
                   randSeed=None):
    fractions = dict()
    if (ncol == 1) and (realFrac >= 1.0):
        fractions["real_fraction"] = 1
        fractions["categorical_fraction"] = 0
        fractions["integer_fraction"] = 0
        fractions["time_fraction"] = 0
        fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
        fractions["binary_fraction"] = 0

        return h2o.create_frame(rows=nrow,
                                cols=ncol,
                                missing_fraction=misFrac,
                                has_response=True,
                                response_factors=responseFactor,
                                integer_range=integerR,
                                seed=randSeed,
                                **fractions)

    real_part = pyunit_utils.random_dataset_real_only(nrow,
                                                      int(realFrac * ncol),
                                                      misFrac=misFrac,
                                                      randSeed=randSeed)
    cnames = ['c_' + str(ind) for ind in range(real_part.ncol)]
    real_part.set_names(cnames)
    enumFrac = enumFrac + (1 - realFrac) / 2
    intFrac = 1 - enumFrac
    fractions["real_fraction"] = 0
    fractions["categorical_fraction"] = enumFrac
    fractions["integer_fraction"] = intFrac
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] = 0

    df = h2o.create_frame(rows=nrow,
                          cols=(ncol - real_part.ncol),
                          missing_fraction=misFrac,
                          has_response=True,
                          response_factors=responseFactor,
                          integer_range=integerR,
                          seed=randSeed,
                          **fractions)
    return real_part.cbind(df)
Example #8
def word2vec_get_model():
    print("Test retrieving a word2vec model by a key")

    words = h2o.create_frame(rows=1000,cols=1,string_fraction=1.0,missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000,cols=100,real_fraction=1.0,missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
    w2v_model.train(training_frame=word_embeddings)

    model_id = w2v_model.model_id
    model = h2o.get_model(model_id)

    assert model, "Model was not retrieved"
def word2vec_to_frame():
    print("Test converting a word2vec model to a Frame")

    words = h2o.create_frame(rows=1000,cols=1,string_fraction=1.0,missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=1000,cols=100,real_fraction=1.0,missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator(pre_trained=word_embeddings)
    w2v_model.train(training_frame=word_embeddings)

    w2v_frame = w2v_model.to_frame()

    word_embeddings.names = w2v_frame.names
    assert word_embeddings.as_data_frame().equals(w2v_frame.as_data_frame()), "Source and generated embeddings should match"
Example #10
def create_frame_test():
    """Test `h2o.create_frame()`."""
    for _ in range(10):
        r = random.randint(1, 1000)
        c = random.randint(1, 1000)

        frame = h2o.create_frame(rows=r, cols=c)
        assert frame.nrow == r and frame.ncol == c, \
            "Expected {0} rows and {1} cols, but got {2} rows and {3} cols.".format(r, c, frame.nrow, frame.ncol)

    def assert_coltypes(frame, freal, fenum, fint, fbin, ftime, fstring):
        # The server does not report columns as binary -- instead they are integer.
        fint += fbin
        fbin = 0
        type_counts = defaultdict(int)
        for ft in viewvalues(frame.types):
            type_counts[ft] += 1
        print("Created table with column counts: {%s}" % ", ".join("%s: %d" % t for t in type_counts.items()))
        for ct in ["real", "enum", "int", "time", "string"]:
            assert abs(type_counts[ct] - locals()["f" + ct] * frame.ncol) < 1, \
                "Wrong column count of type %s: %d" % (ct, type_counts[ct])

    f1 = h2o.create_frame(rows=10, cols=1000, real_fraction=1)
    assert_coltypes(f1, 1, 0, 0, 0, 0, 0)

    f2 = h2o.create_frame(rows=10, cols=1000, binary_fraction=0.5, time_fraction=0.5)
    assert_coltypes(f2, 0, 0, 0, 0.5, 0.5, 0)

    f3 = h2o.create_frame(rows=10, cols=1000, string_fraction=0.2, time_fraction=0.8)
    assert_coltypes(f3, 0, 0, 0, 0, 0.8, 0.2)

    f4 = h2o.create_frame(rows=10, cols=1000, real_fraction=0.9)
    assert_coltypes(f4, 0.9, 0.04, 0.04, 0.02, 0, 0)

    f5 = h2o.create_frame(rows=2, cols=1000, integer_fraction=0.75000000000001, string_fraction=0.25000000000001)
    assert_coltypes(f5, 0, 0, 0.75, 0, 0, 0.25)

    try:
        h2o.create_frame(rows=10, cols=1000, real_fraction=0.1, categorical_fraction=0.1, integer_fraction=0.1,
                         binary_fraction=0.1, time_fraction=0.1, string_fraction=0.1)
        assert False, "The data frame should not have been created!"
    except H2OValueError:
        pass

    try:
        h2o.create_frame(rows=10, cols=1000, real_fraction=0.5, categorical_fraction=0.5, integer_fraction=0.1)
        assert False, "The data frame should not have been created!"
    except H2OValueError:
        pass
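
The f4 case above shows that unspecified fractions fill the remainder proportionally, rounded to whole columns. A sketch of inspecting the resulting type mix directly; Counter here stands in for the defaultdict bookkeeping in assert_coltypes:

from collections import Counter

import h2o

h2o.init()
f = h2o.create_frame(rows=10, cols=100, real_fraction=0.5,
                     integer_fraction=0.5, seed=7)
print(Counter(f.types.values()))  # expect roughly {'real': 50, 'int': 50}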
Example #11
def test_transform():
    valid_values = ["none", "standardize", "normalize", "demean", "descale"]
    df = h2o.create_frame(rows=100,
                          cols=4,
                          categorical_fraction=0.4,
                          integer_fraction=0,
                          binary_fraction=0,
                          real_range=100,
                          integer_range=100,
                          missing_fraction=0,
                          seed=1234)
    model = H2OAggregatorEstimator(target_num_exemplars=5)
    try:
        for val in valid_values:
            model.transform = val
            model.train(training_frame=df)
    except Exception:
        assert False, "Aggregator model should be able to process all valid transform values"

    # Try with invalid value
    try:
        model = H2OAggregatorEstimator(target_num_exemplars=5,
                                       transform="some_invalid_value")
        raised = False
    except Exception:  # a bare except here would also swallow the failing assert
        raised = True
    assert raised, "Passing an invalid value of transform should throw an error"
def test_parser_svmlight_column_skip_not_supported():
    print("Test that functions calling fail if skipped_columns is passed with svm file.")
    # generate a frame
    nrow = 10
    ncol = 10
    seed = 12345

    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=0.5, integer_fraction=0.5, missing_fraction=0,
                          has_response=False, seed=seed)

    results_path = pyunit_utils.locate("results")

    savefilenamewithpath = os.path.join(results_path, 'out.svm')
    pyunit_utils.write_H2OFrame_2_SVMLight(savefilenamewithpath, f1)  # write h2o frame to svm format

    try:
        print("Test upload SVM file. "
              "Expected result is Java exception error: skipped_columns not supported for AVRO and SVMlight")
        h2o.upload_file(savefilenamewithpath, skipped_columns=[5])
        assert False, "Test should have thrown an exception due skipped_columns parameter is present"  # should have failed here
    except H2OResponseError as e:
        assert "skipped_columns are not supported" in str(e.args[0].exception_msg), "Exception message is different"
        print("Test OK, finished with H2OResponseError")

    try:
        print("Test import SVM file. "
              "Expected result is Java exception error: skipped_columns not supported for AVRO and SVMlight")
        h2o.import_file(savefilenamewithpath, skipped_columns=[5])
        assert False, "Test should have thrown an exception due skipped_columns parameter is present"  # should have failed here
    except H2OResponseError as e:
        assert "skipped_columns are not supported" in e.args[0].exception_msg, "Exception message is different"
        print("Test OK, finished with H2OResponseError")
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(range(5000,15001),1)[0]
    dataset_params['cols'] = random.sample(range(10,21),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print "Dataset parameters: {0}".format(dataset_params)

    append_response = False
    distribution = random.sample(['bernoulli','multinomial','gaussian','poisson','tweedie','gamma'], 1)[0]
    if   distribution == 'gaussian':  dataset_params['response_factors'] = 1
    elif distribution == 'bernoulli': dataset_params['response_factors'] = 2
    elif distribution == 'multinomial': dataset_params['response_factors'] = random.randint(3,100)
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1,1000)] for r in range(0,dataset_params['rows'])])
        append_response = True
    print "Distribution: {0}".format(distribution)

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0,"response")
    if distribution == 'bernoulli' or distribution == 'multinomial': train['response'] = train['response'].asfactor()
    train = train.impute("response", method="mode")
    print "Training dataset:"
    print train

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['ntrees'] = random.sample(range(1,21),1)[0]
    if random.randint(0,1): params['max_depth'] = random.sample(range(1,11),1)[0]
    if random.randint(0,1): params['min_rows'] = random.sample(range(1,11),1)[0]
    if random.randint(0,1): params['nbins'] = random.sample(range(2,21),1)[0]
    if random.randint(0,1): params['nbins_cats'] = random.sample(range(2,1025),1)[0]
    if random.randint(0,1): params['learn_rate'] = random.random()
    params['distribution'] = distribution
    print "Parameter list: {0}".format(params)

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="gbm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
Example #14
def createData(nrows, ncols):
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_airlines_file = "/datasets/airlines_all.05p.csv"

    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_file)
    airlines = h2o.import_file(url)

    myX = ["Year", "Month", "DayofMonth", "DayOfWeek", "Distance"]
    myY = "IsDepDelayed"

    allCols = list(myX)
    allCols.append(myY)

    airlines = airlines[allCols]

    num_new_features = ncols - airlines.ncol
    sample_data = h2o.create_frame(rows=nrows,
                                   cols=num_new_features,
                                   categorical_fraction=0,
                                   seed=1234,
                                   seed_for_column_types=1234)

    new_rows = nrows - airlines.nrow
    if new_rows > 0:
        extra_rows = airlines[0:new_rows, :]
        airlines = airlines.rbind(extra_rows)

    airlines = airlines[0:nrows, :]
    full_data = airlines.cbind(sample_data)

    return full_data
def random_dataset(response_type,
                   verbose=True,
                   NTESTROWS=200,
                   missing_fraction=0.0,
                   seed=None):
    """Create and return a random dataset."""
    if verbose: print("\nCreating a dataset for a %s problem:" % response_type)
    fractions = {
        'real_fraction': 0.925363793458878,
        'categorical_fraction': 0.9625390964218535,
        'integer_fraction': 0.5693588274554572,
        'time_fraction': 0.19987260017514685,
        'string_fraction': 0.893090913162827,
        'binary_fraction': 0.12909731789008272
    }
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] /= 3
    fractions["time_fraction"] /= 2

    sum_fractions = sum(fractions.values())
    for k in fractions:
        fractions[k] /= sum_fractions

    response_factors = 1
    df = h2o.create_frame(rows=25000 + NTESTROWS,
                          cols=20,
                          missing_fraction=missing_fraction,
                          has_response=True,
                          response_factors=response_factors,
                          positive_response=True,
                          factors=10,
                          seed=seed,
                          **fractions)
    return df
Example #16
def test_binary():
    df = h2o.create_frame(
        rows=1000,
        cols=10,
        categorical_fraction=0.6,
        integer_fraction=0,
        binary_fraction=0,
        real_range=100,
        integer_range=100,
        missing_fraction=0.1,
        factors=5,
        seed=1234
    )
    params = {
        "target_num_exemplars": 100,
        "rel_tol_num_exemplars": 0.5,
        "categorical_encoding": "binary",
        "transform": "normalize"
    }
    agg = H2OAggregatorEstimator(**params)
    agg.train(training_frame=df)
    assert agg.aggregated_frame is not None, "Trained model should produce a non-empty aggregated frame"
    assert is_consistent(df.nrows, agg.aggregated_frame), \
        "Exemplar counts should sum up to number of training rows"
    assert correct_num_exemplars(agg.aggregated_frame, **params), \
        "Generated number of exemplars should match target value"
def test_high_cardinality_eigen():
    df = h2o.create_frame(rows=10000,
                          cols=10,
                          categorical_fraction=0.6,
                          integer_fraction=0,
                          binary_fraction=0,
                          real_range=100,
                          integer_range=100,
                          missing_fraction=0,
                          factors=10,
                          seed=1234)
    autoencoder = H2OAutoEncoderEstimator(categorical_encoding="eigen",
                                          reproducible=True,
                                          hidden=[50, 30],
                                          epochs=5,
                                          seed=42)
    autoencoder.train(training_frame=df)

    mojo = pyunit_utils.download_mojo(autoencoder)
    autoencoder_mojo = h2o.import_mojo(mojo["mojo_zip_path"])

    preds_ae_h2o = autoencoder.predict(df)
    preds_ae_mojo = autoencoder_mojo.predict(df)
    assert_frame_equal(preds_ae_mojo.as_data_frame(),
                       preds_ae_h2o.as_data_frame())
Example #18
def test_show_time():

    df = h2o.H2OFrame.from_python(
        {"A": [1, 2, 3],
         "B": ["a", "a", "b"],
         "C": ["hello", "all", "world"],
         "D": ["12MAR2015:11:00:00", "13MAR2015:12:00:00", "14MAR2015:13:00:00"]},
        column_types={"A": "numeric", "B": "enum", "C": "string", "D": "time"}
    )
    out = df.__unicode__()
    print(out)
    assert "2015-03-12 11:00:00" in out
    assert "2015-03-13 12:00:00" in out
    assert "2015-03-14 13:00:00" in out

    df2 = h2o.create_frame(cols=6, rows=10, time_fraction=1, missing_fraction=0.1)
    out2 = df2.__unicode__()
    print(out2)
    assert "e+" not in out2
    assert "E+" not in out2

    lines = out2.splitlines()[2:-2]  # skip header (first 2 lines) + footer (last 2 lines)
    regex = re.compile(r"(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)")
    for l in lines:
        for entry in l.split("  "):
            entry = entry.strip()
            if entry == "": continue  # skip missing entries
            m = re.match(regex, entry)
            assert m is not None, "Failed to recognize time expression '%s'" % entry
            year = int(m.group(1))
            month = int(m.group(2))
            day = int(m.group(3))
            assert 1970 <= year <= 2020
            assert 1 <= month <= 12
            assert 1 <= day <= 31
Example #19
def test_transform():
    valid_values = ["none", "standardize", "normalize", "demean", "descale"]
    df = h2o.create_frame(
        rows=100,
        cols=4,
        categorical_fraction=0.4,
        integer_fraction=0,
        binary_fraction=0,
        real_range=100,
        integer_range=100,
        missing_fraction=0,
        seed=1234
    )
    model = H2OAggregatorEstimator(target_num_exemplars=5)
    try:
        for val in valid_values:
            model.transform = val
            model.train(training_frame=df)
    except Exception:
        assert False, "Aggregator model should be able to process all valid transform values"

    # Try with invalid value
    try:
        model = H2OAggregatorEstimator(target_num_exemplars=5, transform="some_invalid_value")
        raised = False
    except Exception:  # a bare except here would also swallow the failing assert
        raised = True
    assert raised, "Passing an invalid value of transform should throw an error"
Example #20
def random_dataset(nrow,
                   ncol,
                   realFrac=0.4,
                   intFrac=0.3,
                   enumFrac=0.3,
                   factorR=10,
                   integerR=100,
                   responseFactor=1,
                   misFrac=0.01,
                   randSeed=None):
    fractions = dict()
    fractions["real_fraction"] = realFrac
    fractions["categorical_fraction"] = enumFrac
    fractions["integer_fraction"] = intFrac
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] = 0

    df = h2o.create_frame(rows=nrow,
                          cols=ncol,
                          missing_fraction=misFrac,
                          has_response=True,
                          response_factors=responseFactor,
                          factors=factorR,
                          integer_range=integerR,
                          real_range=integerR,
                          seed=randSeed,
                          **fractions)
    print(df.types)
    return df
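
A usage sketch for the helper above. With has_response=True, create_frame is expected to append a response column on top of the requested ncol, so the returned frame should have ncol + 1 columns:

df = random_dataset(1000, 10, realFrac=0.5, intFrac=0.3, enumFrac=0.2,
                    randSeed=42)
print(df.dim)  # expected [1000, 11]: 10 generated columns plus "response"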
Example #21
def random_dataset(response_type, verbose=True):
    """Create and return a random dataset."""
    if verbose: print("\nCreating a dataset for a %s problem:" % response_type)
    fractions = {
        k + "_fraction": random.random()
        for k in "real categorical integer time string binary".split()
    }
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] /= 3
    fractions["time_fraction"] /= 2

    sum_fractions = sum(fractions.values())
    for k in fractions:
        fractions[k] /= sum_fractions
    response_factors = (1 if response_type == "regression" else 2 if
                        response_type == "binomial" else random.randint(3, 10))
    df = h2o.create_frame(rows=random.randint(15000, 25000) + NTESTROWS,
                          cols=random.randint(3, 20),
                          missing_fraction=random.uniform(0, 0.05),
                          has_response=True,
                          response_factors=response_factors,
                          positive_response=True,
                          factors=10,
                          **fractions)
    if verbose:
        print()
        df.show()
    return df
def createData(nrows, ncols):
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_airlines_file = "/datasets/airlines_all.05p.csv"

    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_file)
    airlines = h2o.import_file(url)

    myX = ["Year", "Month", "DayofMonth", "DayOfWeek", "Distance"]
    myY = "IsDepDelayed"

    allCols = list(myX)
    allCols.append(myY)

    airlines = airlines[allCols]

    num_new_features = ncols - airlines.ncol
    sample_data = h2o.create_frame(rows=nrows, cols=num_new_features, categorical_fraction=0,
                                   seed=1234, seed_for_column_types=1234)

    new_rows = nrows - airlines.nrow
    if new_rows > 0:
        extra_rows = airlines[0:new_rows, :]
        airlines = airlines.rbind(extra_rows)

    airlines = airlines[0:nrows, :]
    full_data = airlines.cbind(sample_data)

    return full_data
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100, 200)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(
        left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params[
            'categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params[
                'categorical_fraction']:
            dataset_params[
                'integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params[
                'categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 50)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)

    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(
        train, os.path.join(results_dir, "pca_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['transform'] = random.sample(
            ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"], 1)[0]
    realNcol = train.ncol - 1
    params['k'] = random.sample(list(range(1, min(realNcol, train.nrow))),
                                1)[0]

    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="pca",
                             equality=None,
                             train=train,
                             test=None,
                             x=x,
                             y=y,
                             compile_only=True,
                             **params)
Example #24
def test_cat_encoding():
    valid_values = [
        "auto", "enum", "one_hot_internal",
        "one_hot_explicit", "binary", "eigen",
        "label_encoder", "enum_limited",
        # "sort_by_response"    TODO: This is invalid parameter, remove it
    ]
    df = h2o.create_frame(
        rows=100,
        cols=4,
        categorical_fraction=0.4,
        integer_fraction=0,
        binary_fraction=0,
        real_range=100,
        integer_range=100,
        missing_fraction=0,
        seed=1234
    )
    model = H2OAggregatorEstimator(target_num_exemplars=5)
    try:
        for val in valid_values:
            model.categorical_encoding = val
            model.train(training_frame=df)
    except Exception:
        assert False, "Aggregator model should be able to process all valid categorical_encoding values"

    # Try with invalid value
    try:
        model = H2OAggregatorEstimator(target_num_exemplars=5, categorical_encoding="some_invalid_value")
        raised = False
    except Exception:  # a bare except here would also swallow the failing assert
        raised = True
    assert raised, "Passing an invalid value of categorical_encoding should throw an error"
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    family = random.sample(['binomial','gaussian','poisson','tweedie','gamma'], 1)[0]
    if   family == 'binomial':  dataset_params['response_factors'] = 2
    elif family == 'gaussian':  dataset_params['response_factors'] = 1
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1,1000)] for r in range(0,dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0,"response")
    if family == 'binomial': train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"glm_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"glm_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0,1):
            params['tweedie_variance_power'] = round(random.random()+1,6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))

    x = list(range(1,train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    family = random.sample(['binomial','gaussian','poisson','tweedie','gamma'], 1)[0]
    if   family == 'binomial':  dataset_params['response_factors'] = 2
    elif family == 'gaussian':  dataset_params['response_factors'] = 1
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1,1000)] for r in range(0,dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0,"response")
    if family == 'binomial': train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"glm_dynamic_preimputed_response.log"))
    train = train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"glm_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0,1):
            params['tweedie_variance_power'] = round(random.random()+1,6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))

    x = list(range(1,train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def test_csv_parser_column_skip():
    # generate a big frame with all datatypes and save it to csv.  Load it back with different skipped_columns settings
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=frac1, categorical_fraction=frac1, integer_fraction=frac1,
                          binary_fraction=frac1, time_fraction=frac1, string_fraction=frac2, missing_fraction=0.1,
                          has_response=False, seed=seed)
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results"))
    if not (os.path.isdir(tmpdir)):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    # load in whole dataset
    skip_all = list(range(f1.ncol))
    skip_even = list(range(0, f1.ncol, 2))
    skip_odd = list(range(1, f1.ncol, 2))
    skip_start_end = [0, f1.ncol - 1]
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, f1.ncol // 2):
        skip_random.append(temp[index])
    skip_random.sort()

    try:
        loadFileSkipAll = h2o.upload_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:  # expected; a bare except would also swallow the sys.exit(1) above
        pass

    try:
        importFileSkipAll = h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    # skip even columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
Example #28
def genMergedSeparaData(MergedRows, intUpper, intLow, doubleUpper, doubleLow,
                        bProb):
    # generate an integer frame whose columns can serve as merge keys
    merged = h2o.create_frame(rows=MergedRows,
                              cols=3,
                              integer_fraction=1,
                              integer_range=intUpper - intLow)
    print("Done, save with Flow")
Example #29
def create_frame_test(ip, port):

    # REALLY basic test TODO: add more checks
    r = random.randint(1, 1000)
    c = random.randint(1, 1000)

    frame = h2o.create_frame(rows=r, cols=c)
    assert frame.nrow == r and frame.ncol == c, "Expected {0} rows and {1} cols, but got {2} rows and {3} " \
                                                    "cols.".format(r,c,frame.nrow,frame.ncol)
Example #30
def pubdev_5112():
    words = h2o.create_frame(rows=10,
                             cols=1,
                             string_fraction=1.0,
                             missing_fraction=0.0)
    embeddings = h2o.create_frame(rows=10,
                                  cols=100,
                                  real_fraction=1.0,
                                  missing_fraction=0.0)
    word_embeddings = words.cbind(embeddings)

    w2v_model = H2OWord2vecEstimator.from_external(external=word_embeddings)

    model_id = w2v_model.model_id
    model = h2o.get_model(model_id)

    assert model, "Worder2Vec model without a training frame was retrived"

    # Only leading column should be of type String
    leading_column_string_error = False
    try:
        string_frame = h2o.create_frame(rows=10,
                                        cols=10,
                                        real_fraction=1.0,
                                        missing_fraction=0.0)
        H2OWord2vecEstimator.from_external(external=string_frame)
    except H2OValueError:
        leading_column_string_error = True

    assert leading_column_string_error, "Word2Vec pre-trained model should be checked for the leading column" \
                                        " to be string"
    # Other columns should be non-string type
    multiple_string_columns_error = False
    try:
        string_frame = h2o.create_frame(rows=10,
                                        cols=10,
                                        string_fraction=1.0,
                                        missing_fraction=0.0)
        H2OWord2vecEstimator.from_external(external=string_frame)
    except H2OValueError:
        multiple_string_columns_error = True

    assert multiple_string_columns_error, "Word2Vec pre-trained model should be checked for columns not to have a" \
                                          " String type except for the leading column"
def test_csv_parser_column_skip():
    # generate a big frame with all datatypes and save it to csv.  Load it back with different skipped_columns settings
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow,
                          cols=ncol,
                          real_fraction=frac1,
                          categorical_fraction=frac1,
                          integer_fraction=frac1,
                          binary_fraction=frac1,
                          time_fraction=frac1,
                          string_fraction=frac2,
                          missing_fraction=0.1,
                          has_response=False,
                          seed=seed)
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results"))
    if not (os.path.isdir(tmpdir)):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    # load in whole dataset
    skip_all = list(range(f1.ncol))
    skip_start_end = [0, f1.ncol - 1]
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, f1.ncol // 2):
        skip_random.append(temp[index])
    skip_random.sort()

    try:
        importFileSkipAll = h2o.import_file(savefilenamewithpath,
                                            skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:  # expected; a bare except would also swallow the sys.exit(1) above
        pass

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
Example #32
def pubdev_5180():

    frame = h2o.create_frame(binary_fraction=1,
                             binary_ones_fraction=0.5,
                             missing_fraction=0,
                             rows=1,
                             cols=1)
    exp_str = ExprNode("assign", 123456789123456789123456789,
                       frame)._get_ast_str()
    assert exp_str.find('123456789123456789L') == -1
Example #33
def isax():
    df = h2o.create_frame(rows=1,cols=256,real_fraction=1.0,missing_fraction=0.0,seed=123)
    df2 = df.cumsum(axis=1)
    res = df2.isax(num_words=10,max_cardinality=10)
    res.show()
    answer = "0^10_0^10_0^10_0^10_5^10_7^10_8^10_9^10_9^10_8^10"
    assert answer == res[0,0], "expected isax index to be " + answer + " but got " + res[0,0] + " instead."
    h2o.remove(df)
    h2o.remove(df2)
    h2o.remove(res)
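
Each underscore-separated token in the expected index is symbol^cardinality, one token per requested word. A pure-Python sketch that splits the index back apart:

index = "0^10_0^10_0^10_0^10_5^10_7^10_8^10_9^10_9^10_8^10"
symbols = [int(token.split("^")[0]) for token in index.split("_")]
print(symbols)  # one symbol per word: [0, 0, 0, 0, 5, 7, 8, 9, 9, 8]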
def javapredict_dynamic_data():

    dataset_params = {}
    dataset_params['rows'] = 13183
    dataset_params['cols'] = 13
    dataset_params['categorical_fraction'] = 0.4
    dataset_params['integer_fraction'] = 0.3
    dataset_params['missing_fraction'] = 0.27539154084819495
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = 819
    print("Dataset parameters: {0}".format(dataset_params))

    problem = 2
    print(
        "Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}"
        .format(problem))
    if problem == 1: dataset_params['response_factors'] = 2
    elif problem == 0: dataset_params['response_factors'] = 1
    else: dataset_params['response_factors'] = 16

    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2:
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(
        train["response"],
        os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(
        train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))

    params = {}
    params['nbins'] = 5
    params['min_rows'] = 7
    params['mtries'] = 4
    params['sample_rate'] = 0.7867986759373544
    params['seed'] = 1304644573760597606
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest",
                             equality=None,
                             train=train,
                             test=None,
                             x=x,
                             y=y,
                             compile_only=True,
                             **params)
def test_parser_svmlight_column_skip():
  # generate a big frame with all datatypes and save it to svmlight
  nrow = 10000
  ncol = 50
  seed=12345

  f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=0.5, integer_fraction=0.5, missing_fraction=0.2,
                         has_response=False, seed=seed)
  f2 = h2o.create_frame(rows=nrow, cols=1, real_fraction=1, integer_fraction=0, missing_fraction=0,
                         has_response=False, seed=seed)
  f2.set_name(0,"target")
  f1 = f2.cbind(f1)

  tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results"))
  if not(os.path.isdir(tmpdir)):
    os.mkdir(tmpdir)
  savefilenamewithpath = os.path.join(tmpdir, 'out.svm')
  pyunit_utils.write_H2OFrame_2_SVMLight(savefilenamewithpath, f1) # write h2o frame to svm format

  skip_all = list(range(ncol))
  skip_even = list(range(0, ncol, 2))

  try:
    loadFileSkipAll = h2o.upload_file(savefilenamewithpath, skipped_columns = skip_all)
    sys.exit(1) # should have failed here
  except Exception:  # expected; a bare except would also swallow the sys.exit(1) above
    pass

  try:
    importFileSkipAll = h2o.import_file(savefilenamewithpath, skipped_columns = skip_all)
    sys.exit(1) # should have failed here
  except Exception:
    pass

  try:
    importFileSkipSome = h2o.import_file(savefilenamewithpath, skipped_columns = skip_even)
    sys.exit(1) # should have failed here
  except Exception:
    pass

  # check for correct parsing only
  checkCorrectSkips(savefilenamewithpath, f1)
Example #36
def irf_tree_Test():
    cat_frame = h2o.create_frame(cols=10, categorical_fraction=1, seed=42)
    # check all columns are categorical
    assert set(cat_frame.types.values()) == set(['enum'])

    iso_model = H2OIsolationForestEstimator(seed=42)
    iso_model.train(training_frame=cat_frame)

    tree = H2OTree(iso_model, 5)
    check_tree(tree, 5, None)
    print(tree)
Example #37
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    problem = random.sample(list(range(0,3)),1)[0]
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if   problem == 1:  dataset_params['response_factors'] = 2
    elif problem == 0:  dataset_params['response_factors'] = 1
    else:               dataset_params['response_factors'] = random.randint(3,100)

    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2: train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"drf_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['ntrees'] = random.sample(list(range(1,21)),1)[0]
    if random.randint(0,1): params['max_depth'] = random.sample(list(range(1,11)),1)[0]
    if random.randint(0,1): params['min_rows'] = random.sample(list(range(1,11)),1)[0]
    if random.randint(0,1): params['nbins'] = random.sample(list(range(2,21)),1)[0]
    if random.randint(0,1): params['nbins_cats'] = random.sample(list(range(2,1025)),1)[0]
    if random.randint(0,1): params['mtries'] = random.sample(list(range(1,dataset_params['cols']+1)),1)[0]
    if random.randint(0,1): params['sample_rate'] = random.random()
    print("Parameter list: {0}".format(params))

    x = list(range(1,train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None, x=x, y=y, compile_only=True,
                             **params)
def h2o_H2OFrame_concat():
    """
    Python API test: h2o.frame.H2OFrame.concat(frames, axis=1)

    Copied from pyunit_concat.py
    """
    df1 = h2o.create_frame(integer_fraction=1,binary_fraction=0,categorical_fraction=0,seed=1)
    df2 = h2o.create_frame(integer_fraction=1,binary_fraction=0,categorical_fraction=0,seed=2)
    df3 = h2o.create_frame(integer_fraction=1,binary_fraction=0,categorical_fraction=0,seed=3)

    # frame to frame concat (column-wise)
    df123 = df1.concat([df2,df3])
    assert_is_type(df123, H2OFrame)     # check return type
    assert df123.shape==(df1.nrows, df1.ncols+df2.ncols+df3.ncols), "h2o.H2OFrame.concat command is not working."

    #Frame to Frame concat (row wise)
    df123_row = df1.concat([df2,df3], axis = 0)
    assert_is_type(df123_row, H2OFrame)     # check return type
    assert df123_row.shape==(df1.nrows+df2.nrows+df3.nrows, df1.ncols), \
        "h2o.H2OFrame.concat command is not working."
Example #40
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    problem = random.sample(list(range(0,3)),1)[0]
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if   problem == 1:  dataset_params['response_factors'] = 2
    elif problem == 0:  dataset_params['response_factors'] = 1
    else:               dataset_params['response_factors'] = random.randint(3,100)

    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2: train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"drf_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['ntrees'] = random.sample(list(range(1,21)),1)[0]
    if random.randint(0,1): params['max_depth'] = random.sample(list(range(1,11)),1)[0]
    if random.randint(0,1): params['min_rows'] = random.sample(list(range(1,11)),1)[0]
    if random.randint(0,1): params['nbins'] = random.sample(list(range(2,21)),1)[0]
    if random.randint(0,1): params['nbins_cats'] = random.sample(list(range(2,1025)),1)[0]
    if random.randint(0,1): params['mtries'] = random.sample(list(range(1,dataset_params['cols']+1)),1)[0]
    if random.randint(0,1): params['sample_rate'] = random.random()
    print("Parameter list: {0}".format(params))

    x = list(range(1,train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None, x=x, y=y, compile_only=True,
                             **params)
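# The fraction arithmetic above keeps categorical_fraction + integer_fraction
# strictly below 1, so create_frame can fill the remainder with real columns.
# A standalone sketch of the same normalization (hypothetical helper name):
import random

def random_column_fractions():
    cat = round(random.random(), 1)
    left = 1 - cat
    integer = round(left - round(random.uniform(0, left), 1), 1)
    if cat + integer == 1:      # no room left for real columns,
        if integer > cat:       # so shave 0.1 off the larger fraction
            integer -= 0.1
        else:
            cat -= 0.1
    return cat, integer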
Example #43
0
def create_frame_test(ip,port):

    # REALLY basic test TODO: add more checks
    r = random.randint(1,1000)
    c = random.randint(1,1000)

    frame = h2o.create_frame(rows=r, cols=c)
    assert frame.nrow == r and frame.ncol == c, "Expected {0} rows and {1} cols, but got {2} rows and {3} " \
                                                "cols.".format(r,c,frame.nrow,frame.ncol)
Example #44
0
def h2ocreate_frame():
    """
    Python API test: h2o.create_frame(frame_id=None, rows=10000, cols=10, randomize=True, real_fraction=None,
     categorical_fraction=None, integer_fraction=None, binary_fraction=None, time_fraction=None,
      string_fraction=None, value=0, real_range=100, factors=100, integer_range=100,
      binary_ones_fraction=0.02, missing_fraction=0.01, has_response=False, response_factors=2,
      positive_response=False, seed=None, seed_for_column_types=None)

    Copied from pyunit_NOPASS_javapredict_dynamic_data_paramsDL.py
    """

    try:
        # Generate random dataset
        dataset_params = {}
        dataset_params['rows'] = random.sample(list(range(50, 150)), 1)[0]
        dataset_params['cols'] = random.sample(list(range(3, 6)), 1)[0]
        dataset_params['categorical_fraction'] = round(random.random(), 1)
        left_over = (1 - dataset_params['categorical_fraction'])
        dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0, left_over), 1), 1)
        if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
            if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
                dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
            else:
                dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
        dataset_params['missing_fraction'] = random.uniform(0, 0.5)
        dataset_params['has_response'] = False
        dataset_params['randomize'] = True
        dataset_params['factors'] = random.randint(2, 5)
        print("Dataset parameters: {0}".format(dataset_params))

        distribution = random.sample(['bernoulli', 'multinomial', 'gaussian', 'poisson', 'gamma'], 1)[0]
        if distribution == 'bernoulli': dataset_params['response_factors'] = 2
        elif distribution == 'gaussian': dataset_params['response_factors'] = 1
        elif distribution == 'multinomial': dataset_params['response_factors'] = random.randint(3, 5)
        else: dataset_params['has_response'] = False
        print("Distribution: {0}".format(distribution))

        train = h2o.create_frame(**dataset_params)
        assert_is_type(train, H2OFrame)
        assert train.ncol == dataset_params['cols'], "h2o.create_frame() created a frame with the wrong column count."
        assert train.nrow == dataset_params['rows'], "h2o.create_frame() created a frame with the wrong row count."
    except Exception as e:
        assert False, "h2o.create_frame() command is not working: {0}".format(e)
Example #45
0
def whichmaxmin():

    #Make H2O frame
    f1 = h2o.create_frame(rows = 10000, cols = 100, categorical_fraction = 0, missing_fraction = 0,seed=1234)

    #Make comparable pandas frame
    f2 = f1.as_data_frame(use_pandas=True)

    #############################################################
    #Col wise max
    which_max_col = f1.idxmax()
    which_max_col = which_max_col.transpose()

    which_max_col_pd = f2.idxmax(axis=0)
    which_max_col_pd = h2o.H2OFrame(pd.DataFrame(which_max_col_pd,columns=["C1"]))

    diff_max_col_idx = which_max_col - which_max_col_pd

    assert diff_max_col_idx.sum() == 0

    #Col wise min
    which_min_col = f1.idxmin()
    which_min_col = which_min_col.transpose()

    which_min_col_pd = f2.idxmin(axis=0)
    which_min_col_pd = h2o.H2OFrame(pd.DataFrame(which_min_col_pd,columns=["C1"]))

    diff_min_col_idx = which_min_col - which_min_col_pd

    assert diff_min_col_idx.sum() == 0

    #############################################################
    #Row wise max
    which_max_row = f1.idxmax(axis=1)

    which_max_row_pd = f2.idxmax(axis=1)
    which_max_row_pd = h2o.H2OFrame(pd.DataFrame(which_max_row_pd,columns=["C1"]))
    which_max_row_pd = which_max_row_pd.ascharacter().lstrip("C").asnumeric() - 1  # convert pandas "C<n>" labels to 0-based indices before comparison

    diff_max_row_idx = which_max_row - which_max_row_pd

    assert diff_max_row_idx.sum() == 0

    #Row wise min
    which_min_row = f1.idxmin(axis=1)

    which_min_row_pd = f2.idxmin(axis=1)
    which_min_row_pd = h2o.H2OFrame(pd.DataFrame(which_min_row_pd,columns=["C1"]))
    which_min_row_pd = which_min_row_pd.ascharacter().lstrip("C").asnumeric() - 1  # convert pandas "C<n>" labels to 0-based indices before comparison

    diff_min_row_idx = which_min_row - which_min_row_pd

    assert diff_min_row_idx.sum() == 0
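# H2O's row-wise idxmax/idxmin return 0-based column positions, while pandas'
# return column labels ("C1", "C2", ...); the ascharacter().lstrip("C") chain
# above maps labels to positions. A pandas-only toy sketch of that mapping:
import pandas as pd

toy = pd.DataFrame({"C1": [1, 9], "C2": [5, 2]})
labels = toy.idxmax(axis=1)                         # ["C2", "C1"]
positions = labels.str.lstrip("C").astype(int) - 1  # [1, 0], matching H2O's 0-based indices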
Example #46
0
def generate_models(n_models, n_rows, n_cols, n_rows_per_model, n_trees, max_depth, target_dir):
    target_dir = os.path.abspath(target_dir)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    assert n_rows_per_model <= n_rows, "Not enough rows to train any model"
    assert n_rows <= n_rows_per_model * n_models, "Too many rows"
    assert os.path.isdir(target_dir), "%s is not a directory" % target_dir

    genmodel_jar = os.path.abspath("../../../h2o-genmodel/build/libs/h2o-genmodel-all.jar")
    assert os.path.exists(genmodel_jar), "Cannot find " + genmodel_jar

    # Step 1: generate the dataset.
    df = h2o.create_frame(rows=n_rows, cols=n_cols, missing_fraction=0, integer_fraction=1,
                          has_response=True, response_factors=1, positive_response=True)
    assert df.names == ["response"] + ["C%d" % n for n in range(1, n_cols + 1)]
    assert df.types["response"] == "real"
    assert all(v == "int" for k, v in df.types.items() if k != "response")
    print("Dataset created (%d x %d).\n" % (df.nrow, df.ncol))

    # Step 2: train and save the models
    for i in range(n_models):
        estimator = random.choice([H2ORandomForestEstimator, H2OGradientBoostingEstimator])
        start_row = random.randint(0, n_rows - n_rows_per_model)
        end_row = start_row + n_rows_per_model

        # Step 2.a: train a model on a random subset of the frame `df`
        time0 = time.time()
        print("%-4d  %-30s" % (i + 1, estimator.__name__), end="")
        model = estimator(ntrees=n_trees, max_depth=max_depth)
        model.train(training_frame=df[start_row:end_row, :])
        print(" %.3fs" % (time.time() - time0), end="")

        # Step 2.b: save the model to a file
        model_file = h2o.api("GET /3/Models/%s/data" % model.model_id, save_to=target_dir)
        assert os.path.exists(model_file)
        simple_file = model_file[len(target_dir) + 1:] if model_file.startswith(target_dir + "/") else model_file
        print(" => %s  (%d bytes)" % (simple_file, os.stat(model_file).st_size))

        # Step 2.c: free the model from the cluster
        h2o.remove(model)
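# The loop above saves each model as a raw binary file via the
# /3/Models/<model_id>/data endpoint; h2o.load_model() is the documented way
# to read such a file back. A sketch (the path below is hypothetical, and it
# assumes client and server share a filesystem, as in local testing):
import os
import h2o

model_path = os.path.join("/tmp/models", "GBM_model_python_1")  # hypothetical path
reloaded = h2o.load_model(model_path)
print(reloaded.model_id)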
def pubdev_6304():
    fractions = dict()
    fractions["real_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["categorical_fraction"] = 1
    fractions["integer_fraction"] = 0
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] = 0

    # this used to get an error message
    try:
        traindata = h2o.create_frame(rows=100,
                                     cols=2,
                                     missing_fraction=0,
                                     has_response=False,
                                     factors=9999999,
                                     seed=12345,
                                     **fractions)
    except Exception as ex:
        sys.exit(1)

    # this should get an error message
    try:
        traindata = h2o.create_frame(rows=100,
                                     cols=2,
                                     missing_fraction=0,
                                     has_response=False,
                                     factors=19999999,
                                     seed=12345,
                                     **fractions)
        sys.exit(1)  # should have thrown an error
    except Exception as ex:  # expect an error here
        print(ex)
        if 'Number of factors must be <= 10,000,000' in ex.args[0].dev_msg:
            sys.exit(0)  # correct error message
        else:
            sys.exit(1)  # something else is wrong.
def h2o_H2OFrame_isax():
    """
    Python API test: h2o.frame.H2OFrame.isax(num_words, max_cardinality, optimize_card=False)

    Copied from pyunit_isax.py
    """
    df = h2o.create_frame(rows=1,cols=256,real_fraction=1.0,missing_fraction=0.0,seed=123)
    df2 = df.cumsum(axis=1)
    res = df2.isax(num_words=10,max_cardinality=10, optimize_card=False)
    res.show()
    answer = "0^10_0^10_0^10_0^10_5^10_7^10_8^10_9^10_9^10_8^10"

    assert_is_type(res, H2OFrame)       # check return type
    assert answer == res[0,0], "expected isax index to be " + answer + " but got " + res[0,0] + " instead."
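# Each iSAX word appears to have the form "<symbol>^<cardinality>", with words
# joined by "_". A small sketch (hypothetical helper) splitting an index
# string into (symbol, cardinality) pairs:
def parse_isax_word(index_string):
    return [tuple(int(p) for p in word.split("^")) for word in index_string.split("_")]

print(parse_isax_word("0^10_5^10_9^10"))  # [(0, 10), (5, 10), (9, 10)]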
Example #49
0
def fillna():
    NUM_COLS = 3
    df = h2o.create_frame(rows=1000000,
                          cols=NUM_COLS,
                          real_fraction=1.0,
                          real_range=100,
                          missing_fraction=0.2,
                          seed=123)
    # Pandas comparison
    pdf = df.as_data_frame()
    filledpdf = pdf.fillna(method="ffill",axis=0,limit=3)
    filledpdfh2o = h2o.H2OFrame(filledpdf, column_types=["float"]*NUM_COLS)
    filled = df.fillna(method="forward",axis=0,maxlen=3)
    assert abs((filled - filledpdfh2o).sum(return_frame=False)) < 1e-11, "Difference between pandas and H2O fillna results too high"
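# H2O's fillna(method="forward", maxlen=3) mirrors pandas' fillna(
# method="ffill", limit=3): at most 3 consecutive missing values are filled
# from the value above. A pandas-only toy illustrating the limit:
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, np.nan, np.nan, 2.0])
print(s.fillna(method="ffill", limit=3))  # fills the first 3 NaNs with 1.0; the 4th stays NaN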
def javapredict_dynamic_data():

    dataset_params = {}
    dataset_params["rows"] = 13183
    dataset_params["cols"] = 13
    dataset_params["categorical_fraction"] = 0.4
    dataset_params["integer_fraction"] = 0.3
    dataset_params["missing_fraction"] = 0.27539154084819495
    dataset_params["has_response"] = True
    dataset_params["randomize"] = True
    dataset_params["factors"] = 819
    print("Dataset parameters: {0}".format(dataset_params))

    problem = 2
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if problem == 1:
        dataset_params["response_factors"] = 2
    elif problem == 0:
        dataset_params["response_factors"] = 1
    else:
        dataset_params["response_factors"] = 16

    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2:
        train["response"] = train["response"].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))

    params = {}
    params["nbins"] = 5
    params["min_rows"] = 7
    params["mtries"] = 4
    params["sample_rate"] = 0.7867986759373544
    params["seed"] = 1304644573760597606
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(
        algo="random_forest", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params
    )
Example #51
0
def pyunit_unique():

    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    uniques = iris[4].unique()
    rows, cols = uniques.dim
    assert rows == 3 and cols == 1, "Expected 3 rows and 1 column, but got {0} rows and {1} columns".format(rows,cols)
    assert "Iris-setosa" in uniques[0], "Expected Iris-setosa to be in the set of unique species, but it wasn't"
    assert "Iris-virginica" in uniques[0], "Expected Iris-virginica to be in the set of unique species, but it wasn't"
    assert "Iris-versicolor" in uniques[0], "Expected Iris-versicolor to be in the set of unique species, but it wasn't"

    fr = h2o.create_frame(rows=5, cols=1, time_fraction=1)
    assert fr.type(0) == "time"
    uf = fr.unique()
    assert uf.type(0) == "time"
    uf.refresh()
    assert uf.type(0) == "time"
Example #52
0
def sort():
    df = h2o.create_frame(rows=10,
                          cols=3,
                          factors=10,
                          categorical_fraction=1.0/3,
                          time_fraction=1.0/3,
                          real_fraction=1.0/3,
                          real_range=100,
                          missing_fraction=0.0,
                          seed=123)
    df1 = df.sort("C1")
    assert df1[0,0] == 433225652950 # 1983-09-24 04:27:32
    assert df1[9,0] == 1532907020199 # 2018-07-29 23:30:20
    df2 = df.sort("C2")
    assert df2[0,1] == "c1.l1"
    assert df2[9,1] == "c1.l9"
    h2o.remove_all()
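# H2O time columns hold epoch milliseconds, so the sorted values above can be
# checked by hand (a sketch; utcfromtimestamp expects seconds):
import datetime

print(datetime.datetime.utcfromtimestamp(433225652950 / 1000.0))   # 1983-09-24 04:27:32.950000
print(datetime.datetime.utcfromtimestamp(1532907020199 / 1000.0))  # 2018-07-29 23:30:20.199000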
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print "Dataset parameters: {0}".format(dataset_params)

    train = h2o.create_frame(**dataset_params)

    print "Training dataset:"
    print train

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"kmeans_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['k'] = random.sample(list(range(1,10)),1)[0]
    if random.randint(0,1): params['max_iterations'] = random.sample(list(range(1,1000)),1)[0]
    if random.randint(0,1): params['standardize'] = random.sample([True, False],1)[0]
    if random.randint(0,1): params['seed'] = random.sample(list(range(1,1000)),1)[0]
    if random.randint(0,1): params['init'] = random.sample(['Random','PlusPlus','Furthest'],1)[0]
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="kmeans", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100,200)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,50)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)

    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"pca_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['max_iterations'] = random.sample(list(range(1,1000)),1)[0]
    if random.randint(0,1): params['transform'] = random.sample(["NONE","STANDARDIZE","NORMALIZE","DEMEAN","DESCALE"],1)[0]
    params['k'] = random.sample(list(range(1,min(train.ncol,train.nrow))),1)[0]

    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="pca", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    dataset_params['response_factors'] = random.randint(3,100)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)

    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"nb_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0,1): params['laplace'] = random.uniform(0,11)
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)