Esempio n. 1
0
def build_audit_h2o(classifier, name):
	"""Fit, verify and export an H2O classifier pipeline on the audit data.

	The pipeline maps the audit feature columns through domain decorators,
	uploads the result as an H2O frame and trains ``classifier`` on it. The
	fitted estimator, the pipeline and the predictions are then handed to
	the ``store_mojo`` / ``store_pkl`` / ``store_csv`` helpers under ``name``.

	``audit_X`` and ``audit_y`` are read from the enclosing module scope.

	:param classifier: the H2O estimator used as the final pipeline step
	:param name: identifier passed to the ``store_*`` helpers
	"""
	continuous_cols = ["Age", "Hours", "Income"]
	categorical_cols = ["Employment", "Education", "Marital", "Occupation", "Gender", "Deductions"]

	features = [([col], ContinuousDomain()) for col in continuous_cols]
	features += [([col], CategoricalDomain()) for col in categorical_cols]

	steps = [
		("mapper", DataFrameMapper(features)),
		("uploader", H2OFrameCreator()),
		("classifier", classifier),
	]
	pipeline = PMMLPipeline(steps)

	# The response is uploaded as a single categorical column.
	target = H2OFrame(audit_y.to_frame(), column_types = ["categorical"])
	pipeline.fit(audit_X, target)

	# Fixed random_state keeps the verification sample reproducible.
	verification_sample = audit_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_sample)

	fitted = pipeline._final_estimator
	store_mojo(fitted, name)
	store_pkl(pipeline, name)

	predictions = pipeline.predict(audit_X)
	predictions.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
	store_csv(predictions.as_data_frame(), name)
Esempio n. 2
0
def build_auto_h2o(regressor, name):
	"""Fit, verify and export an H2O regressor pipeline on the auto data.

	Categorical columns are listed before continuous ones in the column
	transformer; the uploader is given the column names and H2O column
	types in that same order. The fitted estimator, the pipeline and the
	predictions are handed to ``store_mojo`` / ``store_pkl`` / ``store_csv``
	under ``name``.

	``auto_X`` and ``auto_y`` are read from the enclosing module scope.

	:param regressor: the H2O estimator used as the final pipeline step
	:param name: identifier passed to the ``store_*`` helpers
	"""
	categorical_cols = ["cylinders", "model_year", "origin"]
	continuous_cols = ["displacement", "horsepower", "weight", "acceleration"]

	transformer_entries = [(col, CategoricalDomain(), [col]) for col in categorical_cols]
	transformer_entries += [(col, ContinuousDomain(), [col]) for col in continuous_cols]
	transformer = ColumnTransformer(transformer_entries)

	# Names and types follow the transformer's output order: enums first, numerics after.
	uploader = H2OFrameCreator(
		column_names = categorical_cols + continuous_cols,
		column_types = ["enum"] * len(categorical_cols) + ["numeric"] * len(continuous_cols)
	)

	pipeline = PMMLPipeline([
		("transformer", transformer),
		("uploader", uploader),
		("regressor", regressor)
	])

	target = H2OFrame(auto_y.to_frame())
	pipeline.fit(auto_X, target)

	# Fixed random_state keeps the verification sample reproducible.
	verification_sample = auto_X.sample(frac = 0.05, random_state = 13)
	pipeline.verify(verification_sample)

	fitted = pipeline._final_estimator
	store_mojo(fitted, name)
	store_pkl(pipeline, name)

	predictions = pipeline.predict(auto_X)
	predictions.set_names(["mpg"])
	store_csv(predictions.as_data_frame(), name)
Esempio n. 3
0
def predict_row(row, node):
    """Walk a tree of binary classifiers and return the class reached for ``row``.

    A node without a classifier is a leaf: the first element of its
    ``classes`` is returned. Otherwise the node's classifier predicts on
    ``row``; a first prediction equal to 0 descends into ``left_node``,
    anything else into ``right_node``.

    :param row: the data to classify; converted to an H2OFrame on first use
                by an H2O estimator and passed down in that form
    :param node: the current tree node (``classifier``, ``classes``,
                 ``left_node``, ``right_node``)
    :return: the class stored in the leaf that was reached
    """
    model = node.classifier
    if not model:
        # Leaf node: nothing left to decide.
        return list(node.classes)[0]

    if isinstance(model, h2o.estimators.H2OEstimator):
        if not isinstance(row, H2OFrame):
            row = H2OFrame(row, column_types=get_h2o_column_types(row.columns))
        predicted = model.predict(row)['predict'].as_data_frame().values
        outcome = np.concatenate(predicted)
    else:
        outcome = model.predict(row)

    # Only the first prediction steers the descent.
    child = node.left_node if outcome[0] == 0 else node.right_node
    return predict_row(row, child)
Esempio n. 4
0
def _prepare_one_hot(file, y, exclude_cols=None):
    """Import a data file, split it and one-hot encode its enum columns.

    The file is resolved relative to this module's directory, imported into
    H2O and split 95/5 with a fixed seed. Every ``enum`` column other than
    ``y`` and ``exclude_cols`` is one-hot encoded with an encoder fitted on
    the training part only; levels unseen during fitting are ignored when
    the test part is transformed.

    :param file: path of the data file, relative to this module's directory
    :param y: name of the response column
    :param exclude_cols: column names to leave out entirely (default: none)
    :return: - the training data as an H2OFrame (other columns, ``y``, then
               the encoded columns)
             - the test data as a pandas DataFrame (other columns and the
               encoded columns; ``y`` is not included)
    """
    if exclude_cols is None:
        exclude_cols = []
    dir_path = os.path.dirname(os.path.realpath(__file__))
    frame = h2o.import_file(dir_path + "/" + file)
    train, test = frame.split_frame([0.95], seed=42)

    cols_to_encode = []
    other_cols = []
    for name, ctype in test.types.items():
        if name == y or name in exclude_cols:
            continue
        if ctype == "enum":
            cols_to_encode.append(name)
        else:
            other_cols.append(name)

    train_frame = train.as_data_frame()
    train_encode = train_frame.loc[:, cols_to_encode]
    train_other = train_frame.loc[:, other_cols + [y]]
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    enc.fit(train_encode)

    # One output column per (source column, level) pair, named "<column>.<level>".
    # format() rather than "+" so that a non-string level (e.g. NaN for a
    # missing value) does not raise a TypeError.
    colnames = [
        "{}.{}".format(col, level)
        for col, levels in zip(cols_to_encode, enc.categories_)
        for level in levels
    ]

    train_encoded = enc.transform(train_encode.values).toarray()
    train_encoded = pd.DataFrame(train_encoded)
    train_encoded.columns = colnames
    train = train_other.join(train_encoded)
    train = H2OFrame(train)

    test_frame = test.as_data_frame()
    test_encode = test_frame.loc[:, cols_to_encode]
    test_other = test_frame.loc[:, other_cols]

    test_encoded = enc.transform(test_encode.values).toarray()
    test_encoded = pd.DataFrame(test_encoded)
    test_encoded.columns = colnames
    test = test_other.join(test_encoded)

    return train, test
Esempio n. 5
0
def classify(x_train, y_train, estimator, x_test):
    """
    Train the given estimator and predict the classes of the test set.
    H2O estimators are trained through ``fit_h2o`` and the test set is
    converted to an H2OFrame before predicting; any other estimator is
    cloned and then fitted, so the estimator passed in is not the one trained.
    :param x_train: the dataset for training
    :param y_train: the classes for training
    :param estimator: the estimator to be considered
    :param x_test: the dataset for testing
    :return: - the prediction for the x_test
             - the trained estimator
    """
    if not isinstance(estimator, h2o.estimators.H2OEstimator):
        fitted = clone(estimator)
        fitted.fit(x_train, y_train)
        return fitted.predict(x_test), fitted

    fitted = fit_h2o(x_train, y_train, estimator)
    test_frame = H2OFrame(
        x_test, column_types=get_h2o_column_types(x_test.columns))
    # Keep only the predicted label column, flattened to a 1-D array.
    labels = fitted.predict(test_frame)['predict'].as_data_frame().values
    return np.concatenate(labels), fitted
Esempio n. 6
0
def pubdev_6393():
    """Check that enum levels survive an ascharacter().asfactor() round trip.

    A one-column enum frame is built from two strings, the column is
    converted to character and back to factor, and the category levels of
    both frames are compared position by position.
    """
    locations = [
        ['location'],
        ['�X県 A市 '],  # First observation contains replacement character for unknown char
        ['X県 B市'],
    ]
    # The first entry is the header row, not an observation.
    expected_rows = len(locations) - 1

    frame = H2OFrame(locations, header=True, column_types=['enum'])
    assert frame.ncols == 1
    assert frame.nrows == expected_rows

    frame_categories = frame['location'].categories()
    print(frame_categories)

    frame_converted = frame['location'].ascharacter().asfactor()
    assert frame_converted.ncols == 1
    assert frame_converted.nrows == expected_rows

    frame_converted_categories = frame_converted.categories()
    print(frame_converted_categories)

    # No specific representation is required here; the levels obtained
    # directly and those obtained via asfactor() only have to agree.
    for idx, converted_level in enumerate(frame_converted_categories):
        assert frame_categories[idx] == converted_level
Esempio n. 7
0
def pubdev_6439():
    """Check that enum levels built from non-ASCII strings keep their exact text.

    Builds a one-column enum frame from two observations and asserts the
    number of levels, the length of each level and that each level equals
    the observation it came from (decoded from UTF-8 first on Python 2).
    """
    # NOTE(review): the original comment says the first observation contains
    # a replacement character for an unknown char; none is visible in this
    # literal -- confirm against the upstream source.
    data = [
        ['C1'],
        ['X県 A市 '],
        ['X県 B市'],
    ]

    frame = H2OFrame(data, header=True, column_types=['enum'])

    frame_categories = frame['C1'].categories()
    print(frame_categories)

    # Two observations
    assert len(frame_categories) == 2
    # First observation has six characters (space at the end)
    assert len(frame_categories[0]) == 6
    # Second observation has 5 characters (missing space at the end)
    assert len(frame_categories[1]) == 5

    first_observation = ''.join(data[1])
    second_observation = ''.join(data[2])

    # Python 2 and 3 handle strings differently
    major = sys.version_info[0]
    if major == 2:
        first_observation = first_observation.decode("utf-8")
        second_observation = second_observation.decode("utf-8")
    elif major != 3:
        assert False

    # First categorical level equals to first observation
    assert first_observation == frame_categories[0]
    # Second categorical level equals to second observation
    assert second_observation == frame_categories[1]
def train_w2v(df, epochs=None, save_dir=None):
    """ Train an H2O word2vec model on all text columns of df.

        The text columns are chosen by ``get_text_cols``, uploaded as an
        H2OFrame, cast to character and tokenized on single spaces.

        :param df: the dataframe holding the text columns
        :param epochs: optional number of training epochs (cast to int)
        :param save_dir: optional directory; checkpoints are exported to
                         its "h2o_model/" sub-directory
        :return: the trained w2v model, with the text column names attached
                 as ``text_columns``
    """
    print("training word2vec model ...")
    estimator_kwargs = {}
    if epochs is not None:
        estimator_kwargs['epochs'] = int(epochs)
    if save_dir is not None:
        checkpoint_dir = os.path.join(save_dir, "h2o_model/")
        estimator_kwargs['export_checkpoints_dir'] = checkpoint_dir

    # Work on a copy so the caller's dataframe is left alone.
    df = df.copy()
    text_columns = get_text_cols(df)
    print("Text columns are: ", text_columns)

    text_frame = H2OFrame(df[text_columns])
    for column in text_columns:
        text_frame[column] = text_frame[column].ascharacter()

    tokens = text_frame.tokenize(" ")
    w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, **estimator_kwargs)
    w2v_model.train(training_frame=tokens)
    w2v_model.text_columns = text_columns
    return w2v_model
Esempio n. 9
0
def yhat_h2o_classification(m, d):
    """Predict with H2O model ``m`` on ``d`` and return the third output column.

    ``d`` is uploaded as an H2OFrame using ``m._column_types``.
    NOTE(review): column index 2 is presumably the probability of the second
    class in a binary problem -- confirm against the model's output layout.
    """
    from h2o import H2OFrame
    frame = H2OFrame(d, column_types=m._column_types)
    scores = m.predict(frame).as_data_frame().to_numpy()
    return scores[:, 2]
Esempio n. 10
0
def yhat_h2o_regression(m, d):
    """Predict with H2O model ``m`` on ``d`` and return the result as a flat array.

    ``d`` is uploaded as an H2OFrame using ``m._column_types``.
    """
    from h2o import H2OFrame
    frame = H2OFrame(d, column_types=m._column_types)
    predicted = m.predict(frame).as_data_frame().to_numpy()
    return predicted.flatten()
Esempio n. 11
0
# usage:  python test_name.py --usecloud ipaddr:port
#

# The second command-line argument carries "ipaddr:port" (see usage above).
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the print statement is a syntax error on Python 3.
ip_port = sys.argv[2].split(":")
print(ip_port)
ip = ip_port[0]
port = int(ip_port[1])

######################################################
#
# Sample Running GBM on prostate.csv

# Connect to a pre-existing cluster
cluster = h2o.init(ip=ip, port=port)

df = H2OFrame(remote_fname="../../../smalldata/logreg/prostate.csv")
print(df.describe())

# Remove ID from training frame
del df['ID']

# For VOL & GLEASON, a zero really means "missing"
vol = df['VOL']
vol[vol == 0] = None
gle = df['GLEASON']
gle[gle == 0] = None

# Convert CAPSULE to a logical factor
df['CAPSULE'] = df['CAPSULE'].asfactor()

# Test/train split
Esempio n. 12
0
                          ("Income", ContinuousDomain()),
                          (["Hours", "Income"],
                           Alias(ExpressionTransformer("X[1] / (X[0] * 52)"),
                                 "Hourly_Income"))])
classifier = H2ORandomForestEstimator(ntrees=17)

# Post-processing applied to the predicted probabilities: take column 1 of
# the input and bin it into a "no" / "maybe" / "yes" decision.
# NOTE(review): column 1 is presumably the positive-class probability -- confirm.
predict_proba_transformer = Pipeline([
    ("expression", ExpressionTransformer("X[1]")),
    ("cut",
     Alias(CutTransformer(bins=[0.0, 0.75, 0.90, 1.0],
                          labels=["no", "maybe", "yes"]),
           "Decision",
           prefit=True))
])

# Features are mapped locally, uploaded as an H2O frame, then classified.
pipeline = PMMLPipeline([("local_mapper", mapper),
                         ("uploader", H2OFrameCreator()),
                         ("remote_classifier", classifier)],
                        predict_proba_transformer=predict_proba_transformer)
pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(),
                               column_types=["categorical"]))

pipeline.verify(audit_X.sample(100))

sklearn2pmml(pipeline, "pmml/RandomForestAudit.pmml")

if "--deploy" in sys.argv:
    from openscoring import Openscoring

    # Not named `os`: that would shadow the standard-library module name.
    openscoring_client = Openscoring("http://localhost:8080/openscoring")
    openscoring_client.deployFile("RandomForestAudit",
                                  "pmml/RandomForestAudit.pmml")
Esempio n. 13
0
async def serve(q: Q):
    """Handle one request of the example app and save the page.

    - ``q.args.train``: build a model on the stored training frame, then
      show its id and accuracy and enable the Predict button.
    - ``q.args.predict``: predict on the stored test frame and show the
      first three predictions and SHAP contribution rows.
    - otherwise: load the breast cancer dataset, split it 80/20, store both
      parts on ``q.client`` and render the initial form.
    """
    if q.args.train:
        # train WaveML Model using H2O-3 AutoML
        wave_model = build_model(
            train_df=q.client.train_df,
            target_column='target',
            model_type=ModelType.H2O3,
            _h2o3_max_runtime_secs=5,
            _h2o3_nfolds=2,
            _h2o3_include_algos=['DRF', 'XGBoost', 'GBM'])
        q.client.wave_model = wave_model
        model_id = wave_model.model.model_id
        accuracy = round(wave_model.model.accuracy()[0][1] * 100, 2)

        # show training details and prediction option
        card = q.page['example']
        card.items[1].buttons.items[1].button.disabled = False
        card.items[2].message_bar.type = 'success'
        card.items[2].message_bar.text = 'Training successfully completed!'
        card.items[3].text.content = f'''**H2O AutoML model id:** {model_id} <br />
            **Accuracy:** {accuracy}%'''
        card.items[4].text.content = ''
        card.items[5].text.content = ''
    elif q.args.predict:
        # predict on test data
        wave_model = q.client.wave_model
        preds = wave_model.predict(test_df=q.client.test_df)
        contributions = wave_model.model.predict_contributions(
            H2OFrame(q.client.test_df))
        shaps = contributions.as_data_frame()

        # show predictions
        card = q.page['example']
        card.items[2].message_bar.text = 'Prediction successfully completed!'
        card.items[4].text.content = f'''**Example predictions:** <br />
            {preds[0]} <br /> {preds[1]} <br /> {preds[2]}'''
        card.items[5].text.content = f'''**Example SHAP contributions:** <br />
            {shaps.head(3).to_html()}'''
    else:
        # prepare sample train and test dataframes
        data = load_breast_cancer(as_frame=True)['frame']
        splits = train_test_split(data, train_size=0.8)
        q.client.train_df, q.client.test_df = splits

        # display ui: the item positions below are the indices used by the
        # train and predict branches above
        dataset_note = ui.text(content='''The sample dataset used is the
                    <a href="https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer" target="_blank">breast cancer dataset</a>.''')
        action_buttons = ui.buttons(items=[
            ui.button(name='train', label='Train', primary=True),
            ui.button(name='predict',
                      label='Predict',
                      primary=True,
                      disabled=True),
        ])
        status_bar = ui.message_bar(type='warning',
                                    text='Training will take a few seconds')
        q.page['example'] = ui.form_card(
            box='1 1 -1 -1',
            items=[
                dataset_note,
                action_buttons,
                status_bar,
                ui.text(content=''),
                ui.text(content=''),
                ui.text(content='')
            ])

    await q.page.save()
Esempio n. 14
0
def pav(y, X, w):
    """Evaluate the "isotonic.pav" expression on (y, X, w) and return columns C1 and C2.

    The three inputs are stacked column-wise, in that order, into a single
    H2OFrame which is the only argument of the expression.
    """
    stacked = np.column_stack((y, X, w))
    result = H2OFrame._expr(expr=ExprNode("isotonic.pav", H2OFrame(stacked)))
    return result[["C1", "C2"]]
Esempio n. 15
0
def get_simple_preprocessed_input_test_frame():
    """Return a small fixed H2OFrame with one word per row.

    Columns: ``DocID`` (numeric) and ``Words`` (string), eleven rows
    covering document ids 0, 1 and 2.
    """
    columns = OrderedDict()
    columns['DocID'] = [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
    columns['Words'] = ['A', 'B', 'C', 'A', 'a', 'a', 'Z', 'C', 'c', 'B', 'C']
    return H2OFrame(columns, column_types=['numeric', 'string'])
Esempio n. 16
0
def get_simple_input_test_frame():
    """Return a small fixed H2OFrame with one document per row.

    Columns: ``DocID`` (numeric) and ``Document`` (string), three rows.
    """
    columns = OrderedDict()
    columns['DocID'] = [0, 1, 2]
    columns['Document'] = ['A B C', 'A a a Z', 'C c B C']
    return H2OFrame(columns, column_types=['numeric', 'string'])
Esempio n. 17
0
# Set dummy response var in test data: the column is filled with None and
# then cast to a factor. The missing values are what identifies the test
# rows in the isna() lookup further down.
test[response_name_fact] = None
test[response_name_fact] = test[response_name_fact].asfactor()

# Combine data into one set for simpler processing
all_data = train.rbind(test)
# Drop the references to the separate frames; this section only uses
# all_data from here on.
train = None
test = None

# Get train and test indexes: test rows are those with a missing response,
# train rows are the complement.
test_idx = all_data[response_name_fact].isna()
train_idx = test_idx.logical_negation()

# Process the data to create additional features. The frame is pulled down
# as a dataframe, processed by the helper and uploaded again.
all_data = H2OFrame(helpers.pre_pipeline_process(all_data.as_data_frame()))

# Predict Age: train a random forest on the rows whose Age is known and use
# it to predict Age for the rows where it is missing.
missing_ages_idx = all_data['Age'].isna()
unknown_ages_df = all_data[missing_ages_idx]
# Remove the (all-missing) Age column from the rows to be predicted.
unknown_ages_df.pop('Age')
known_ages_df = all_data[missing_ages_idx.logical_negation()]
age_model = H2ORandomForestEstimator(seed=42)
# First positional argument: the predictor columns; second: the response.
age_model.train(
    ['Title', 'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch', 'Fare'],
    'Age',
    training_frame=known_ages_df)
age_prediction = age_model.predict(unknown_ages_df)
# Attach PassengerId to the predictions before handing both to merge_ages.
# NOTE(review): presumably PassengerId is the join key -- confirm in helpers.
age_join_frame = age_prediction.cbind(unknown_ages_df['PassengerId'])
all_data = helpers.merge_ages(all_data, age_join_frame)
Esempio n. 18
0
def test_asserts():
    """Test type-checking functionality.

    Covers three helpers: ``assert_is_type`` (accepted values first, then
    values that must be rejected via ``assert_error``), ``assert_matches``
    and ``assert_satisfies``. Every negative check fails the test when the
    expected exception is not raised.

    NOTE(review): the call lines are deliberately left in their original
    form; per the inline note further down, the typechecks module inspects
    the arguments of the call, so reformatting them may change the error
    messages being asserted -- confirm before restyling.
    """
    def assert_error(*args, **kwargs):
        """Check that assert_is_type() with given arguments throws an error."""
        try:
            assert_is_type(*args, **kwargs)
            raise RuntimeError("Failed to throw an exception")
        except H2OTypeError as exc:
            # Check whether the message can stringify properly
            message = str(exc)
            assert len(message) < 1000
            return

    # Small diamond-shaped class hierarchy used for the instance/subclass checks.
    class A(object):
        """Dummy A."""

    class B(A):
        """Dummy B."""

    class C(A):
        """Dummy C."""

    class D(B, C):
        """Dummy D."""

    # --- values that must pass the type check ---
    assert_is_type(3, int)
    assert_is_type(2**100, int)
    assert_is_type("3", str)
    assert_is_type(u"3", str)
    assert_is_type("foo", u"foo")
    assert_is_type(u"foo", "foo")
    assert_is_type("I", *list("ABCDEFGHIJKL"))
    assert_is_type(False, bool)
    assert_is_type(43, str, bool, int)
    assert_is_type(4 / 3, int, float)
    assert_is_type(None, None)
    assert_is_type(None, A, str, None)
    assert_is_type([], [float])
    assert_is_type([1, 4, 5], [int])
    assert_is_type([1.0, 2, 5], [int, float])
    assert_is_type([[2.0, 3.1, 0], [2, 4.4, 1.1], [-1, 0]], [[int, float]])
    assert_is_type([1, None, 2], [int, float, None])
    assert_is_type({1, 5, 1, 1, 3}, {int})
    assert_is_type({1, "hello", 3}, {int, str})
    assert_is_type({"foo": 1, "bar": 2}, {str: int})
    assert_is_type({"foo": 3, "bar": [5], "baz": None}, {str: U(int, None, [int])})
    assert_is_type({"foo": 1, "bar": 2}, {"foo": int, "bar": U(int, float, None), "baz": bool})
    assert_is_type({}, {"spam": int, "egg": int})
    assert_is_type({"spam": 10}, {"spam": int, "egg": int})
    assert_is_type({"egg": 1}, {"spam": int, "egg": int})
    assert_is_type({"egg": 1, "spam": 10}, {"spam": int, "egg": int})
    assert_is_type({"egg": 1, "spam": 10}, Dict(egg=int, spam=int))
    assert_is_type({"egg": 1, "spam": 10}, Dict(egg=int, spam=int, ham=U(int, None)))
    assert_is_type((1, 3), (int, int))
    assert_is_type(("a", "b", "c"), (int, int, int), (str, str, str))
    assert_is_type((1, 3, 4, 7, 11, 18), Tuple(int))
    assert_is_type((1, 3, "spam", 3, "egg"), Tuple(int, str))
    assert_is_type([1, [2], [{3}]], [int, [int], [{3}]])
    assert_is_type(A(), None, A)
    assert_is_type(B(), None, A)
    assert_is_type(C(), A, B)
    assert_is_type(D(), I(A, B, C))
    assert_is_type(A, type)
    assert_is_type(B, lambda aa: issubclass(aa, A))
    for a in range(-2, 5):
        assert_is_type(a, -2, -1, 0, 1, 2, 3, 4)
    assert_is_type(1, numeric)
    assert_is_type(2.2, numeric)
    assert_is_type(1, I(numeric, object))
    assert_is_type(34, I(int, NOT(0)))
    assert_is_type(["foo", "egg", "spaam"], [I(str, NOT("spam"))])
    assert_is_type(H2OFrame(), h2oframe)
    assert_is_type([[2.0, 3.1, 0], [2, 4.4, 1.1], [-1, 0, 0]],
                   I([[numeric]], lambda v: all(len(vi) == len(v[0]) for vi in v)))
    assert_is_type([None, None, float('nan'), None, "N/A"], [None, "N/A", I(float, math.isnan)])

    # --- values that must be rejected ---
    assert_error(3, str)
    assert_error(0, float)
    assert_error("Z", *list("ABCDEFGHIJKL"))
    assert_error(u"Z", "a", "...", "z")
    assert_error("X", u"x")
    assert_error(0, bool)
    assert_error(0, float, str, bool, None)
    assert_error([1, 5], [float])
    assert_error((1, 3), (int, str), (str, int), (float, float))
    assert_error(A(), None, B)
    assert_error(A, A)
    assert_error(A, lambda aa: issubclass(aa, B))
    assert_error(135, I(int, lambda x: 0 <= x <= 100))
    assert_error({"foo": 1, "bar": "2"}, {"foo": int, "bar": U(int, float, None)})
    assert_error(3, 0, 2, 4)
    assert_error(None, numeric)
    assert_error("sss", numeric)
    assert_error(B(), I(A, B, C))
    assert_error(2, I(int, str))
    assert_error(0, I(int, NOT(0)))
    assert_error(None, NOT(None))
    assert_error((1, 3, "2", 3), Tuple(int))
    assert_error({"spam": 10}, Dict(spam=int, egg=int))
    assert_error({"egg": 5}, Dict(spam=int, egg=int))
    assert_error(False, h2oframe, pandas_dataframe, numpy_ndarray)
    assert_error([[2.0, 3.1, 0], [2, 4.4, 1.1], [-1, 0]],
                 I([[numeric]], lambda v: all(len(vi) == len(v[0]) for vi in v)))
    try:
        # Cannot use `assert_error` here because typechecks module cannot detect args in (*args, *kwargs)
        assert_is_type(10000000, I(int, lambda port: 1 <= port <= 65535))
        assert False, "Failed to throw an exception"
    except H2OTypeError as e:
        assert "integer & 1 <= port <= 65535" in str(e), "Bad error message: '%s'" % e

    # --- assert_matches: returns the match object on success ---
    url_regex = r"^(https?)://((?:[\w-]+\.)*[\w-]+):(\d+)/?$"
    assert_matches("Hello, world!", r"^(\w+), (\w*)!$")
    assert_matches("http://127.0.0.1:3233/", url_regex)
    m = assert_matches("https://localhost:54321", url_regex)
    assert m.group(1) == "https"
    assert m.group(2) == "localhost"
    assert m.group(3) == "54321"

    # --- assert_satisfies ---
    x = 5
    assert_satisfies(x, x < 1000)
    assert_satisfies(x, x ** x > 1000)
    assert_satisfies(url_regex, url_regex.lower() == url_regex)
    try:
        assert_satisfies(url_regex, url_regex.upper() == url_regex)
        # Without this line the check would pass silently if no error were raised.
        assert False, "Failed to throw an exception"
    except H2OValueError as e:
        assert "url_regex.upper() == url_regex" in str(e), "Error message is bad: " + str(e)

    # pandas / numpy checks only run when both packages are installed.
    try:
        import pandas
        import numpy
        assert_is_type(pandas.DataFrame(), pandas_dataframe)
        assert_is_type(numpy.ndarray(shape=(5,)), numpy_ndarray)
    except ImportError:
        pass
Esempio n. 19
0
def _chisq_labels(X, threshold):
    """Label each row of X +1 if its sum of squared entries exceeds threshold, else -1."""
    row_sums = [sum(r) for r in np.multiply(X, X).tolist()]
    return np.asarray([1 if rs > threshold else -1 for rs in row_sums])


def bernoulli_synthetic_data_mediumGBM(ip, port):
    """Compare the AUC of an H2O GBM with a scikit-learn GBM on synthetic data.

    Rows of standard-normal features are labelled +1 when their sum of
    squares exceeds the chi-square median and -1 otherwise. Both libraries
    are trained with the same hyper-parameters on 10000 rows and scored on
    2000 fresh rows; the two AUCs must differ by less than 5e-3.

    The data is drawn from the unseeded global NumPy generator, so each run
    uses different data.

    :param ip: address of the H2O cluster to connect to
    :param port: port of the H2O cluster to connect to
    """
    # Connect to h2o
    h2o.init(ip, port)

    # Generate training dataset (adaptation of http://www.stat.missouri.edu/~speckman/stat461/boost.R)
    train_rows = 10000
    train_cols = 10

    # Median of the chi-square distribution with one degree of freedom per
    # feature column. Computed once instead of once per row.
    chisq_median = scipy.stats.chi2.ppf(0.5, train_cols)

    #  Generate variables V1, ... V10
    X_train = np.random.randn(train_rows, train_cols)

    #  y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_train = _chisq_labels(X_train, chisq_median)

    # Train scikit gbm
    # TODO: grid-search
    distribution = "bernoulli"
    ntrees = 150
    min_rows = 1
    max_depth = 2
    learn_rate = .01
    nbins = 20

    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=max_depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(X_train, y_train)

    # Generate testing dataset
    test_rows = 2000
    test_cols = 10

    #  Generate variables V1, ... V10
    X_test = np.random.randn(test_rows, test_cols)

    #  y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_test = _chisq_labels(X_test, chisq_median)

    # Score (AUC) the scikit gbm model on the test data
    auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:, 1])

    # Compare this result to H2O; the response is the first column (C1).
    train_h2o = H2OFrame(np.column_stack((y_train, X_train)).tolist())
    test_h2o = H2OFrame(np.column_stack((y_test, X_test)).tolist())

    gbm_h2o = h2o.gbm(x=train_h2o[1:],
                      y=train_h2o["C1"].asfactor(),
                      distribution=distribution,
                      ntrees=ntrees,
                      min_rows=min_rows,
                      max_depth=max_depth,
                      learn_rate=learn_rate,
                      nbins=nbins)
    gbm_perf = gbm_h2o.model_performance(test_h2o)
    auc_h2o = gbm_perf.auc()

    #Log.info(paste("scikit AUC:", auc_sci, "\tH2O AUC:", auc_h2o))
    assert abs(auc_h2o - auc_sci) < 5e-3, "h2o (auc) performance degradation, with respect to scikit. h2o auc: {0} " \
                               "scickit auc: {1}".format(auc_h2o, auc_sci)
Esempio n. 20
0
def _data_transform(data: InputData) -> H2OFrame:
    """Build an H2OFrame from the features with the target appended as the last column.

    The target is reshaped to a single column before being concatenated to
    the features along axis 1.
    """
    features = data.features
    target_column = data.target.reshape(-1, 1)
    stacked = np.concatenate((features, target_column), 1)
    return H2OFrame(python_obj=stacked)