Ejemplo n.º 1
0
def h2omake_metrics():
    """
    Python API test: h2o.make_metrics(predicted, actual, domain=None, distribution=None)

    Trains a small multinomial GBM on the prostate data with RACE as the
    response, then checks that the metrics rebuilt by h2o.make_metrics (with
    and without an explicit domain) match the model's own training metrics.

    Copied from pyunit_make_metrics.py
    """
    frame = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for categorical in ("CAPSULE", "RACE"):
        frame[categorical] = frame[categorical].asfactor()

    response = "RACE"
    predictors = list(set(frame.names) - {"ID", response})
    gbm_settings = dict(distribution="multinomial", ntrees=2, max_depth=3, min_rows=1,
                        learn_rate=0.01, nbins=20)
    model = H2OGradientBoostingEstimator(**gbm_settings)
    model.train(x=predictors, y=response, training_frame=frame)

    # The first column of the prediction frame is skipped before it is passed to make_metrics.
    predicted = h2o.assign(model.predict(frame)[1:], "pred")
    actual = h2o.assign(frame[response].asfactor(), "act")
    domain = frame[response].levels()[0]

    from_model = model.model_performance(train=True)
    with_domain = h2o.make_metrics(predicted, actual, domain=domain)
    without_domain = h2o.make_metrics(predicted, actual)
    for rebuilt in (with_domain, without_domain):
        assert_is_type(rebuilt, H2OMultinomialModelMetrics)

    # Both the model's metrics and the domain-less metrics must agree with
    # the metrics computed with an explicit domain.
    metric_names = ("mse", "rmse", "logloss", "mean_per_class_error")
    for other in (from_model, without_domain):
        for metric in metric_names:
            assert abs(getattr(other, metric)() - getattr(with_domain, metric)()) < 1e-5
Ejemplo n.º 2
0
def execute(h2o, params, config):
    """Score a frame with a stored model and publish the result under a new frame id.

    :param h2o: the h2o module/client used to fetch frames and models.
    :param params: mapping read for 'column_header', 'suffix', 'topn_output'
        and 'topn_percent'.
    :param config: mapping read for 'frame_id' and 'model_id'.
    :return: dict with the destination 'frame_id'.

    When 'column_header' is non-empty, the leading ``int(column_header)``
    slice of the frame is split off before prediction and, in the top-N
    branch, bound back in front of the top-N output.
    """
    frame_id = config.get('frame_id')
    model_id = config.get('model_id')

    df = h2o.get_frame(frame_id)

    # df_head must always be bound: the top-N branch below tests it even when
    # no header split was requested (previously a NameError in that case).
    df_head = None
    column_header = params.get('column_header')
    if column_header:
        header_width = int(column_header)
        df_head = df[:header_width]
        df = df[header_width:]

    pred_model = h2o.get_model(model_id)

    df_pred = pred_model.predict(df)
    # Strip the 'reconstr_' prefix length from every prediction column name.
    df_pred.columns = [x[len('reconstr_'):] for x in df_pred.columns]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))

    if to_bool(params.get('topn_output')):
        df_topn = get_topN(df_pred, int(params.get('topn_percent')))
        if df_head is not None:
            df_topn = df_head.cbind(df_topn)
        h2o.assign(df_topn, dest_frame_id)
        # The raw prediction frame is removed once the top-N frame is stored.
        h2o.remove(str(df_pred.frame_id))
    else:
        h2o.assign(df_pred, dest_frame_id)

    return {'frame_id': dest_frame_id}
Ejemplo n.º 3
0
def execute(h2o, params, config):
    """Train a Word2Vec model on the configured frame and optionally transform it.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping holding the estimator settings plus 'is_transform',
        'aggregate_method' and 'transform_suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the resulting 'frame_id' and the trained 'model_id'.
    """
    frame_id = config.get('frame_id')

    source = h2o.get_frame(frame_id)

    from h2o.estimators import H2OWord2vecEstimator

    # Each estimator setting is read from params and cast to the listed type.
    casts = (
        ('epochs', int),
        ('init_learning_rate', float),
        ('max_runtime_secs', float),
        ('min_word_freq', int),
        ('sent_sample_rate', float),
        ('vec_size', int),
        ('window_size', int),
    )
    settings = {name: cast(params.get(name)) for name, cast in casts}
    w2v_model = H2OWord2vecEstimator(**settings)

    w2v_model.train(training_frame=source)

    save_model(params, w2v_model.model_id)

    transform_flag = params.get("is_transform")
    dest_frame_id = frame_id
    if transform_flag is not None and to_bool(transform_flag):
        vectors = w2v_model.transform(
            source, aggregate_method=params.get('aggregate_method'))
        dest_frame_id = append_frame_id(frame_id,
                                        params.get('transform_suffix'))
        h2o.assign(vectors, dest_frame_id)

    return {'frame_id': dest_frame_id, 'model_id': w2v_model.model_id}
Ejemplo n.º 4
0
    def _fetch_state(aml_id, properties=None):
        """Fetch the state of an AutoML run from the backend.

        :param aml_id: id used in the ``GET /99/AutoML/<id>`` request.
        :param properties: optional collection naming which of 'leader',
            'leaderboard' and 'event_log' to fetch; ``None`` fetches all.
        :return: dict with project_name, json, leader_id, leader, leaderboard
            and event_log (entries that were not requested are ``None``).
        :raises H2OValueError: if the response carries no project name.
        """
        state_json = h2o.api("GET /99/AutoML/%s" % aml_id)
        project_name = state_json["project_name"]
        if project_name is None:
            raise H2OValueError("No AutoML instance with id {}.".format(aml_id))

        model_ids = [entry["name"] for entry in state_json['leaderboard']['models']]
        leader_id = model_ids[0] if model_ids else None

        def wanted(prop):
            return properties is None or prop in properties

        def materialize(table_field, suffix):
            frame_key = project_name + suffix
            table = H2OAutoML._fetch_table(state_json[table_field], key=frame_key, progress_bar=False)
            # removing index and reassign id to ensure persistence on backend
            return h2o.assign(table[1:], frame_key)

        leader = None
        if wanted('leader') and leader_id is not None:
            leader = h2o.get_model(leader_id)

        leaderboard = materialize('leaderboard_table', "_leaderboard") if wanted('leaderboard') else None
        event_log = materialize('event_log_table', "_eventlog") if wanted('event_log') else None

        return {
            'project_name': project_name,
            'json': state_json,
            'leader_id': leader_id,
            'leader': leader,
            'leaderboard': leaderboard,
            'event_log': event_log,
        }
Ejemplo n.º 5
0
def execute(h2o, params, config):
    """Tokenize one column of the configured frame and store the tokens.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'target_column', 'analyzer', 'url',
        'regex', 'lower_case', 'min_word_len', 'use_stop_words' and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')

    source = h2o.get_frame(frame_id)

    target_column = params.get("target_column")
    analyzer = params.get("analyzer")
    if len(analyzer) == 0:
        # No analyzer configured: split with the plain regex.
        tokens = source[target_column].tokenize(params.get('regex'))
    else:
        url = params.get("url")
        tokens = source[target_column].tokenize(
            f'tokenize:elasticsearch:{url}?analyzer={analyzer}_analyzer')

    if to_bool(params.get('lower_case')):
        tokens = tokens.tolower()

    min_word_len = int(params.get('min_word_len'))
    if min_word_len > 0:
        # NA rows are kept alongside the sufficiently long tokens.
        long_enough = tokens.nchar() >= min_word_len
        tokens = tokens[long_enough | (tokens.isna()), :]

    if to_bool(params.get('use_stop_words')):
        # NA rows are kept; tokens listed in STOP_WORDS are dropped.
        missing = tokens.isna()
        tokens = tokens[missing | (~tokens.isin(STOP_WORDS)), :]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(tokens, dest_frame_id)

    return {'frame_id': dest_frame_id}
Ejemplo n.º 6
0
def execute(h2o, params, config):
    """Predict on selected input columns and copy chosen source columns to the result.

    :param h2o: the h2o module/client used to fetch frames/models and assign.
    :param params: mapping read for 'input_columns' and 'output_columns'
        (JSON-encoded lists) and 'suffix'.
    :param config: mapping read for 'frame_id' and 'model_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')
    model_id = config.get('model_id')

    source = h2o.get_frame(frame_id)

    # A value of two characters or fewer (e.g. "[]") counts as "not given".
    raw_inputs = params.get("input_columns")
    if raw_inputs is not None and len(raw_inputs) > 2:
        input_columns = json.loads(raw_inputs)
    else:
        input_columns = source.col_names

    raw_outputs = params.get("output_columns")
    if raw_outputs is not None and len(raw_outputs) > 2:
        output_columns = json.loads(raw_outputs)
    else:
        output_columns = []

    pred_model = h2o.get_model(model_id)

    predictions = pred_model.predict(source[input_columns])
    for col_name in output_columns:
        predictions[col_name] = source[col_name]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(predictions, dest_frame_id)

    return {'frame_id': dest_frame_id}
Ejemplo n.º 7
0
def execute(h2o, params, config):
    """Write a constant value into a column, optionally only on rows matching a condition.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'column', 'value', 'row_conditions' and
        'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')

    frame = h2o.get_frame(frame_id)

    column = params.get('column')
    value = params.get('value')
    column_type = frame.types[column]

    # Numeric columns get the value cast to the matching Python type.
    numeric_casts = {'real': float, 'int': int}
    if column_type in numeric_casts:
        value = numeric_casts[column_type](value)
    elif column_type == 'enum':
        # Take the first category equal to the value, if any.
        value = next((level for level in frame[column].categories() if value == level), value)

    row_conditions = params.get('row_conditions')
    if row_conditions is None or len(row_conditions) == 0:
        frame[column] = value
    else:
        mask = parse_row_condition(frame, row_conditions)
        frame[mask, column] = value

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(frame, dest_frame_id)

    return {'frame_id': dest_frame_id}
Ejemplo n.º 8
0
 def frame_id(self, value):
   """Rename this frame to `value`.

   A frame whose `_ast` is None is renamed through h2o.assign; otherwise the
   local id is updated and a rapids `rename` is issued with the previous id.
   """
   previous_id = self.frame_id
   if self._ast is not None:
     self._id = value
     h2o.rapids("(rename \"{}\" \"{}\")".format(previous_id, value))
     return
   h2o.assign(self,value)
Ejemplo n.º 9
0
def execute(h2o, params, config):
    """Pivot the configured frame and store the result under a suffixed frame id.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'index', 'column', 'value' and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')
    source = h2o.get_frame(frame_id)

    pivot_args = {
        'index': params.get('index'),
        'column': params.get('column'),
        'value': params.get('value'),
    }
    pivoted = source.pivot(**pivot_args)

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(pivoted, dest_frame_id)
    return {'frame_id': dest_frame_id}
Ejemplo n.º 10
0
def prepare_data(seed=1):
    """Import the prostate data and split it into three named frames.

    :param seed: seed passed to split_frame.
    :return: tuple (target column name, training, validation, test frame).
    """
    name = 'amldataset'
    target = "CAPSULE"
    csv_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    df = h2o.import_file(path=csv_path, destination_frame=name)
    df[target] = df[target].asfactor()
    h2o.assign(df, name)

    split_names = [name+'_'+role for role in ['training', 'validation', 'leaderboard']]
    parts = df.split_frame(ratios=[.8, .1], destination_frames=split_names, seed=seed)
    return target, parts[0], parts[1], parts[2]
Ejemplo n.º 11
0
def execute(h2o, params, config):
    """Fill missing values in the configured frame and store the result.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'method', 'axis', 'maxlen' and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')
    source = h2o.get_frame(frame_id)

    method = params.get('method')
    axis = int(params.get('axis'))
    maxlen = int(params.get('maxlen'))
    filled = source.fillna(method=method, axis=axis, maxlen=maxlen)

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(filled, dest_frame_id)
    return {'frame_id': dest_frame_id}
Ejemplo n.º 12
0
def execute(h2o, params, config):
    """Column-bind a second frame onto the configured frame and store the result.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'bind_frame_id' and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')
    left = h2o.get_frame(frame_id)

    right = h2o.get_frame(params.get('bind_frame_id'))
    bound = left.cbind(right)

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(bound, dest_frame_id)
    return {'frame_id': dest_frame_id}
Ejemplo n.º 13
0
def execute(h2o, params, config):
    """Keep only the rows matching 'row_conditions' (if given) and store the result.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'row_conditions' and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')
    frame = h2o.get_frame(frame_id)

    row_conditions = params.get('row_conditions')
    has_conditions = row_conditions is not None and len(row_conditions) > 0
    if has_conditions:
        row_mask = parse_row_condition(frame, row_conditions)
        frame = frame[row_mask, :]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(frame, dest_frame_id)
    return {'frame_id': dest_frame_id}
Ejemplo n.º 14
0
def execute(h2o, params, config):
    """Sort the configured frame by one column and store the result.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'column', 'ascending' and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')
    source = h2o.get_frame(frame_id)

    sort_key = params.get('column')
    is_ascending = to_bool(params.get('ascending'))
    ordered = source.sort(by=[sort_key], ascending=[is_ascending])

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(ordered, dest_frame_id)
    return {'frame_id': dest_frame_id}
Ejemplo n.º 15
0
def execute(h2o, params, config):
    """Apply floor() to the configured frame (optionally to selected columns) and store it.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'columns' (JSON-encoded list) and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)

    columns = params.get('columns')
    # Must be `and`: with `or`, a missing 'columns' reached len(None) and an
    # empty "[]" selection was applied. Two characters or fewer means "all columns".
    if columns is not None and len(columns) > 2:
        columns = json.loads(columns)
        df = df[columns]

    df_floor = df.floor()

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_floor, dest_frame_id)

    return {'frame_id': dest_frame_id}
def test_use_kfold_strategy_to_train_a_model_with_cv():
    """Compare GBM test AUC when the training frame is target-encoded with and without folds."""
    # XXX: the TE KFold strategy lets TE be trained only once when building a model with CV,
    # but it cannot be applied just once on the training data. Otherwise, when training CV1
    # for example (fold1 = cv_holdout, f2-n = cv_train):
    #   - `cat_te` for cv_holdout is obtained using fold_1, i.e. only with information from
    #     folds_2-n, which is what we want;
    #   - `cat_te` for cv_train is obtained using fold_i, and each of those contains information
    #     about fold_1: a data leakage from cv_holdout into cv_train;
    #   - on top of this, the current transform uses a global priorMean for NAs, creating an
    #     additional data leakage in CV context.
    # The priorMean issue can be fixed inside the KFold strategy implementation.
    # Proper CCV, however, needs a deep integration with the CV logic in ModelBuilder
    # (translated to Java of course):
    #   train TE using the KFold strategy on the entire train set;
    #   then during CV, for each fold:
    #     train_cv_i = te.transform(train_cv, fold=fold_i)  # not encoded with encodings from other folds (they include info about the current fold)
    #     test_cv_i = te.transform(test_cv, fold=fold_i)    # same
    #   finally, the final model is trained with TE applied on the whole training frame:
    #     train = te.transform(train, as_training=True)     # still using the fold column: the final feature is equivalent to the one used in all the test_cv_i
    #     or
    #     train = te.transform(train)                       # ignoring the fold column: the final te feature uses the entire train set.

    ds = load_dataset(incl_test=True, incl_foldc=True)
    te = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    te.train(y=ds.target, training_frame=ds.train, fold_column="foldc")

    train_enc_cv = te.transform(ds.train, as_training=True)
    # Every "<name>_te" column has a source column "<name>" that is dropped.
    cols_to_remove = [encoded[:-3] for encoded in train_enc_cv.names if encoded.endswith("_te")]

    def publish(encoded_frame, key):
        return h2o.assign(encoded_frame.drop(cols_to_remove), key)

    train_enc_cv = publish(train_enc_cv, "train_enc_cv")
    train_enc_no_cv = publish(te.transform(ds.train), "train_enc_no_cv")
    test_enc = publish(te.transform(ds.test), "test_enc")

    print(train_enc_cv)
    print(train_enc_no_cv)

    gbm = H2OGradientBoostingEstimator(seed=seed)

    gbm.train(y=ds.target, training_frame=train_enc_cv, fold_column="foldc")
    auc_with_ccv = gbm.model_performance(test_enc).auc()
    print("AUC with CCV : %s" % auc_with_ccv)

    gbm.train(y=ds.target, training_frame=train_enc_no_cv, fold_column="foldc")
    auc_no_ccv = gbm.model_performance(test_enc).auc()
    print("AUC without CCV : %s" % auc_no_ccv)

    assert auc_with_ccv > auc_no_ccv
Ejemplo n.º 17
0
def execute(h2o, params, config):
    """Concatenate other frames onto the configured frame and store the result.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'frames' (JSON-encoded list of frame ids),
        'axis' and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.

    Exits the process with status 1 when 'frames' is missing or two
    characters or fewer.
    """
    frame_id = config.get('frame_id')
    base = h2o.get_frame(frame_id)

    raw_frames = params.get('frames')
    if raw_frames is None or len(raw_frames) <= 2:
        print("frames are empty.")
        sys.exit(1)

    others = [h2o.get_frame(other_id) for other_id in json.loads(raw_frames)]
    axis = int(params.get('axis'))
    combined = base.concat(others, axis=axis)

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(combined, dest_frame_id)
    return {'frame_id': dest_frame_id}
Ejemplo n.º 18
0
def execute(h2o, params, config):
    """Select columns of the configured frame and store the result.

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'columns' (JSON-encoded list) and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')
    source = h2o.get_frame(frame_id)

    # A value of two characters or fewer (e.g. "[]") selects every column.
    raw_columns = params.get('columns')
    if raw_columns is not None and len(raw_columns) > 2:
        selected = json.loads(raw_columns)
    else:
        selected = source.columns

    subset = source[selected]

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(subset, dest_frame_id)
    return {'frame_id': dest_frame_id}
Ejemplo n.º 19
0
 def _fetch_leaderboard(aml_id, extensions=None):
     """Fetch a leaderboard for `aml_id`, optionally with extension columns.

     :param aml_id: id used in the ``GET /99/Leaderboards/<id>`` request.
     :param extensions: None, a single string, or a list of strings.
     :return: the frame returned by h2o.assign for the leaderboard table,
         without its first column.
     """
     assert_is_type(extensions, None, str, [str])
     # Normalize to a list: None -> [], a single string -> [string].
     if extensions is None:
         extensions = []
     elif is_type(extensions, str):
         extensions = [extensions]

     resp = h2o.api("GET /99/Leaderboards/%s" % aml_id, data={'extensions': extensions})
     # Only the part of the project name before the first '@' is used for the key.
     project = resp['project_name'].split('@', 1)[0]
     dest_key = project+"_custom_leaderboard"
     table_frame = H2OAutoML._fetch_table(resp['table'], key=dest_key, progress_bar=False)
     return h2o.assign(table_frame[1:], dest_key)
Ejemplo n.º 20
0
def h2omake_metrics_mutlinomial():
    """
    Python API test: h2o.make_metrics(predicted, actual, domain=None, distribution=None)

    Trains a small multinomial GBM (auc_type="MACRO_OVR") on the prostate data
    with RACE as the response, then checks that the metrics rebuilt by
    h2o.make_metrics (with and without an explicit domain) match the model's
    own training metrics, including auc and aucpr.

    Copied from pyunit_make_metrics.py
    """
    frame = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for categorical in ("CAPSULE", "RACE"):
        frame[categorical] = frame[categorical].asfactor()

    response = "RACE"
    predictors = list(set(frame.names) - {"ID", response})
    gbm_settings = dict(distribution="multinomial", ntrees=2, max_depth=3, min_rows=1,
                        learn_rate=0.01, nbins=20, auc_type="MACRO_OVR")
    model = H2OGradientBoostingEstimator(**gbm_settings)
    model.train(x=predictors, y=response, training_frame=frame)

    # The first column of the prediction frame is skipped before it is passed to make_metrics.
    predicted = h2o.assign(model.predict(frame)[1:], "pred")
    actual = h2o.assign(frame[response].asfactor(), "act")
    domain = frame[response].levels()[0]

    from_model = model.model_performance(train=True)
    with_domain = h2o.make_metrics(predicted, actual, domain=domain, auc_type="MACRO_OVR")
    without_domain = h2o.make_metrics(predicted, actual, auc_type="MACRO_OVR")
    for rebuilt in (with_domain, without_domain):
        assert_is_type(rebuilt, H2OMultinomialModelMetrics)

    # Both the model's metrics and the domain-less metrics must agree with
    # the metrics computed with an explicit domain.
    metric_names = ("mse", "rmse", "logloss", "mean_per_class_error", "auc", "aucpr")
    for other in (from_model, without_domain):
        for metric in metric_names:
            assert abs(getattr(other, metric)() - getattr(with_domain, metric)()) < 1e-5
Ejemplo n.º 21
0
def pyunit_assign():
    """Check that h2o.assign gives a filtered frame the requested id.

    Selects the rows whose PSA is at or below one quantile value or at or
    above another, assigns the result to "PSA.outliers" and asserts the id.
    """

    pros = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    pq = pros.quantile()

    # Python's `or` is not elementwise: it evaluates the truthiness of the left
    # comparison and returns one operand. The row mask needs the `|` operator.
    PSA_outliers = pros[(pros["PSA"] <= pq[1,1]) | (pros["PSA"] >= pq[1,9])]
    PSA_outliers = h2o.assign(PSA_outliers, "PSA.outliers")
    pros.head(show=True)
    PSA_outliers.head(show=True)
    assert PSA_outliers._id == "PSA.outliers", "Expected frame id to be PSA.outliers, but got {0}".format(PSA_outliers._id)
Ejemplo n.º 22
0
def pyunit_assign(ip,port):
    """Check that h2o.assign gives a filtered frame the requested id.

    :param ip: not used inside this function.
    :param port: not used inside this function.

    Selects the rows whose PSA is at or below one quantile value or at or
    above another, assigns the result to "PSA.outliers" and asserts the id.
    """

    pros = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    pq = pros.quantile()

    # Python's `or` is not elementwise: it evaluates the truthiness of the left
    # comparison and returns one operand. The row mask needs the `|` operator.
    PSA_outliers = pros[(pros["PSA"] <= pq[1,1]) | (pros["PSA"] >= pq[1,9])]
    PSA_outliers = h2o.assign(PSA_outliers, "PSA.outliers")
    pros.head(show=True)
    PSA_outliers.head(show=True)
    assert PSA_outliers._id == "PSA.outliers", "Expected frame id to be PSA.outliers, but got {0}".format(PSA_outliers._id)
Ejemplo n.º 23
0
def h2oassign():
    """
    Python API test: h2o.assign(data, xid)

    Imports benign.csv under one frame id, re-assigns it to another and checks
    that the original frame object reports the new id.
    """
    old_name, new_name = "benign.csv", "newBenign.csv"
    csv_path = pyunit_utils.locate("smalldata/logreg/benign.csv")
    training_data = h2o.import_file(csv_path, destination_frame=old_name)
    assert training_data.frame_id==old_name, "h2o.import_file() is not working.  Wrong frame_id is assigned."

    renamed = h2o.assign(training_data, new_name)
    assert_is_type(renamed, H2OFrame)
    assert training_data.frame_id==new_name, "h2o.assign() is not working.  New frame_id is not assigned."
Ejemplo n.º 24
0
def pyunit_assign():
    """Check that h2o.assign gives a filtered frame the requested id.

    Rows whose PSA lies at or below pq[0,7] or at or above pq[8,7] are
    selected, assigned to "PSA.outliers", and the resulting id is asserted.
    """
    csv_path = pyunit_utils.locate("smalldata/prostate/prostate.csv")
    pros = h2o.import_file(csv_path)
    pq = pros.quantile()
    print('1st percentile for PSA:', pq[0,7])
    print('99th percentile for PSA:', pq[8,7])

    # Elementwise row mask: low tail OR high tail.
    low_tail = pros["PSA"] <= pq[0,7]
    high_tail = pros["PSA"] >= pq[8,7]
    PSA_outliers = pros[(low_tail | high_tail)]
    PSA_outliers = h2o.assign(PSA_outliers, "PSA.outliers")

    print(pros)
    print(PSA_outliers)
    assert PSA_outliers.frame_id == "PSA.outliers", "Expected frame id to be PSA.outliers, but got {0}".format(PSA_outliers.frame_id)
Ejemplo n.º 25
0
def execute(h2o, params, config):
    """Compute the correlation of the configured frame (optionally of selected columns).

    :param h2o: the h2o module/client used to fetch and assign frames.
    :param params: mapping read for 'columns' (JSON-encoded list), 'use',
        'na_rm', 'method' and 'suffix'.
    :param config: mapping read for 'frame_id'.
    :return: dict with the destination 'frame_id'.
    """
    frame_id = config.get('frame_id')
    source = h2o.get_frame(frame_id)

    raw_columns = params.get('columns')
    if raw_columns is not None and len(raw_columns) > 2:
        source = source[json.loads(raw_columns)]

    # An empty 'use' string is passed on as None.
    raw_use = params.get('use')
    use_value = None if (raw_use is None or len(raw_use) == 0) else raw_use

    na_rm = to_bool(params.get('na_rm'))
    correlation = source.cor(na_rm=na_rm, use=use_value, method=params.get('method'))

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(correlation, dest_frame_id)
    return {'frame_id': dest_frame_id}
Ejemplo n.º 26
0
def javamunge_assembly():
    """Build the lending-club munging assembly, fit it on the training file and
    pass it, with a small sample of the test file, to pyunit_utils.javamunge.
    """
    h2o.remove_all()
    train_path = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv")
    test_path = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3b.csv")

    # lending-club munging assembly
    print("Import and Parse data")
    # "earliest_cr_line" and "issue_d" are parsed as strings too.
    types = {
        "int_rate": "string",
        "revol_util": "string",
        "emp_length": "string",
        "earliest_cr_line": "string",
        "issue_d": "string",
        "last_credit_pull_d": "factor",
    }

    data = h2o.import_file(path=train_path, col_types=types)
    # use the same data types as the training set for the test set
    test = h2o.import_file(path=test_path, col_types=data.types)
    sample_rows = [1,5,19,23,45,66,99,590,8903,9999,10001,23892,23893,50123]
    test = test[sample_rows,:]
    test = h2o.assign(test,"test")

    def in_place(op, col, **extra):
        # Column op that overwrites `col` with its result.
        return H2OColOp(op=op, col=col, inplace=True, **extra)

    steps = [
        # int_rate: strip %, trim whitespace, string -> double
        ("intrate_rm_junk_char", in_place(H2OFrame.gsub, "int_rate", pattern="%", replacement="")),
        ("intrate_trim_ws", in_place(H2OFrame.trim, "int_rate")),
        ("intrate_as_numeric", in_place(H2OFrame.asnumeric, "int_rate")),

        # revol_util: same treatment as int_rate
        ("revol_rm_junk_char", in_place(H2OFrame.gsub, "revol_util", pattern="%", replacement="")),
        ("revol_trim_ws", in_place(H2OFrame.trim, "revol_util")),
        ("revol_as_numeric", in_place(H2OFrame.asnumeric, "revol_util")),

        # earliest_cr_line (mm-YYYY): split on '-' into Month and Year, Year -> double
        ("earliest_cr_line_split",
         H2OColOp(H2OFrame.strsplit, col="earliest_cr_line", inplace=False,
                  new_col_name=["earliest_cr_line_Month","earliest_cr_line_Year"], pattern="-")),
        ("earliest_cr_line_Year_as_numeric", in_place(H2OFrame.asnumeric, "earliest_cr_line_Year")),

        # issue_d: same treatment as earliest_cr_line
        ("issue_date_split",
         H2OColOp(op=H2OFrame.strsplit, col="issue_d", inplace=False,
                  new_col_name=["issue_d_Month", "issue_d_Year"], pattern="-")),
        ("issue_d_Year_as_numeric", in_place(H2OFrame.asnumeric, "issue_d_Year")),

        # emp_length: remove " year"/" years" and n/a, trim, "< 1" -> 0.5, "10+" -> 10, string -> double
        ("emp_length_rm_years", in_place(H2OFrame.gsub, "emp_length", pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")),
        ("emp_length_trim", in_place(H2OFrame.trim, "emp_length")),
        ("emp_length_lt1_point5", in_place(H2OFrame.gsub, "emp_length", pattern="< 1", replacement="0.5")),
        ("emp_length_10plus", in_place(H2OFrame.gsub, "emp_length", pattern="10\\+", replacement="10")),
        ("emp_length_as_numeric", in_place(H2OFrame.asnumeric, "emp_length")),

        # credit length: issue_d_Year minus earliest_cr_line_Year, stored as a new column
        ("credit_length",
         H2OBinaryOp(op=H2OAssembly.minus, col="issue_d_Year",inplace=False,
                     new_col_name="longest_credit_length",right=H2OCol("earliest_cr_line_Year"))),
    ]
    assembly = H2OAssembly(steps=steps)

    # The fitted result stays bound to a local until the function returns.
    fitted = assembly.fit(data)
    pyunit_utils.javamunge(assembly, "AssemblyMungingDemoPojo", test)
def test_loo_requires_target_to_encode_training_frame():
    """With leave-one-out handling, transform(as_training=True) must fail on a
    frame without the target column, while predict() on that frame still works.
    """
    ds = load_dataset()
    te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out")
    te.train(y=ds.target, training_frame=ds.train)

    train_no_target = h2o.assign(ds.train.drop(ds.target), "train_no_target")
    assert train_no_target is not None
    try:
        te.transform(train_no_target, as_training=True)
    except Exception as e:
        assert "LeaveOneOut strategy requires a response column" in str(e)
    else:
        # Kept outside the `try`: inside it, this AssertionError would be
        # caught by the handler above and reported as a message mismatch.
        assert False, "should have raised"

    assert te.predict(train_no_target) is not None
Ejemplo n.º 28
0
def pyunit_assign():
    """Check that h2o.assign gives a filtered frame the requested id.

    Rows whose PSA lies at or below pq[0, 7] or at or above pq[8, 7] are
    selected, assigned to "PSA.outliers", and the resulting id is asserted.
    """

    pros = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pq = pros.quantile()
    # print() calls instead of Python 2 print statements, which are syntax
    # errors under Python 3.
    print('1st percentile for PSA:', pq[0, 7])
    print('99th percentile for PSA:', pq[8, 7])
    PSA_outliers = pros[((pros["PSA"] <= pq[0, 7]) |
                         (pros["PSA"] >= pq[8, 7]))]
    PSA_outliers = h2o.assign(PSA_outliers, "PSA.outliers")
    print(pros)
    print(PSA_outliers)
    assert PSA_outliers.frame_id == "PSA.outliers", "Expected frame id to be PSA.outliers, but got {0}".format(
        PSA_outliers.frame_id)
Ejemplo n.º 29
0
def javamunge_assembly():
    """Build the lending-club munging assembly, fit it on the training file and
    pass it, with a small sample of the test file, to pyunit_utils.javamunge.
    """
    h2o.remove_all()
    train_path = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv")
    test_path = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3b.csv")

    # lending-club munging assembly
    print("Import and Parse data")
    types = {"int_rate":"String", "revol_util":"String", "emp_length":"String"}
    data = h2o.import_file(path=train_path, col_types=types)
    test = h2o.import_file(path=test_path, col_types=types)
    sample_rows = [1,5,19,23,45,66,99,590,8903,9999,10001,23892,23893,50123]
    test = test[sample_rows,:]
    test = h2o.assign(test,"test")

    def in_place(op, col, **extra):
        # Column op that overwrites `col` with its result.
        return H2OColOp(op=op, col=col, inplace=True, **extra)

    steps = [
        # int_rate: strip %, trim whitespace, string -> double
        ("intrate_rm_junk_char", in_place(H2OFrame.gsub, "int_rate", pattern="%", replacement="")),
        ("intrate_trim_ws", in_place(H2OFrame.trim, "int_rate")),
        ("intrate_as_numeric", in_place(H2OFrame.asnumeric, "int_rate")),

        # revol_util: same treatment as int_rate
        ("revol_rm_junk_char", in_place(H2OFrame.gsub, "revol_util", pattern="%", replacement="")),
        ("revol_trim_ws", in_place(H2OFrame.trim, "revol_util")),
        ("revol_as_numeric", in_place(H2OFrame.asnumeric, "revol_util")),

        # earliest_cr_line (mm-YYYY): split on '-' into Month and Year, Year -> double
        ("earliest_cr_line_split",
         H2OColOp(H2OFrame.strsplit, col="earliest_cr_line", inplace=False,
                  new_col_name=["earliest_cr_line_Month","earliest_cr_line_Year"], pattern="-")),
        ("earliest_cr_line_Year_as_numeric", in_place(H2OFrame.asnumeric, "earliest_cr_line_Year")),

        # issue_d: same treatment as earliest_cr_line
        ("issue_date_split",
         H2OColOp(op=H2OFrame.strsplit, col="issue_d", inplace=False,
                  new_col_name=["issue_d_Month", "issue_d_Year"], pattern="-")),
        ("issue_d_Year_as_numeric", in_place(H2OFrame.asnumeric, "issue_d_Year")),

        # emp_length: remove " year"/" years" and n/a, trim, "< 1" -> 0.5, "10+" -> 10, string -> double
        ("emp_length_rm_years", in_place(H2OFrame.gsub, "emp_length", pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")),
        ("emp_length_trim", in_place(H2OFrame.trim, "emp_length")),
        ("emp_length_lt1_point5", in_place(H2OFrame.gsub, "emp_length", pattern="< 1", replacement="0.5")),
        ("emp_length_10plus", in_place(H2OFrame.gsub, "emp_length", pattern="10\\+", replacement="10")),
        ("emp_length_as_numeric", in_place(H2OFrame.asnumeric, "emp_length")),

        # credit length: issue_d_Year minus earliest_cr_line_Year, stored as a new column
        ("credit_length",
         H2OBinaryOp(op=H2OAssembly.minus, col="issue_d_Year",inplace=False,
                     new_col_name="longest_credit_length",right=H2OCol("earliest_cr_line_Year"))),
    ]
    assembly = H2OAssembly(steps=steps)

    # The fitted result stays bound to a local until the function returns.
    fitted = assembly.fit(data)
    pyunit_utils.javamunge(assembly, "AssemblyMungingDemoPojo", test)
Ejemplo n.º 30
0
def _fetch_table(table, key=None, progress_bar=True):
    """Convert a table object into a frame stored under `key`.

    :param table: object providing `cell_values`, `col_header` and `col_types`.
    :param key: destination frame id, used both for the upload and the final assign.
    :param progress_bar: value set on H2OJob.__PROGRESS_BAR__ while the frame is built.
    :return: the frame returned by h2o.assign, without the table's first column.

    The previous H2OJob.__PROGRESS_BAR__ value is restored on success and on failure.
    """
    # Saved before entering `try` so the `finally` clause always has a value to
    # restore and can never mask an earlier error with an unbound local.
    ori_progress_state = H2OJob.__PROGRESS_BAR__
    try:
        # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
        # If any failure happens, revert back to user's original setting for progress and display the error message.
        H2OJob.__PROGRESS_BAR__ = progress_bar
        # Parse leaderboard H2OTwoDimTable & return as an H2OFrame
        fr = h2o.H2OFrame(table.cell_values,
                          destination_frame=key,
                          column_names=table.col_header,
                          column_types=table.col_types)
        return h2o.assign(
            fr[1:], key
        )  # removing index and reassign id to ensure persistence on backend
    finally:
        H2OJob.__PROGRESS_BAR__ = ori_progress_state
import h2o

# Demo script: import a CSV, slice it, give the slice an explicit frame id with
# h2o.assign, then remove the originally imported frame.
# The bare `data.frame_id` expressions have no effect when run as a script;
# presumably this was written for an interactive session where they are echoed.
h2o.init()


datasets = "https://raw.githubusercontent.com/DarrenCook/h2o/bk/datasets/"
data = h2o.import_file(datasets + "iris_wheader.csv")

data.frame_id

# Keep all rows and every column from index 1 onwards.
data = data[:, 1:]

data.frame_id

# Store the sliced frame under the id "iris".
data = h2o.assign(data, "iris")
data.frame_id

# List what is stored before and after removing "iris_wheader.hex"
# (NOTE(review): presumably the id given to the imported file; confirm).
h2o.ls()
h2o.remove("iris_wheader.hex")
h2o.ls()
Ejemplo n.º 32
0
  def split_frame(self, ratios=None, destination_frames=None, seed=None):
    """
    Split this frame into row subsets whose sizes follow the given ratios.
    One more subset than ratios is produced; the last one takes the remainder.

    :param ratios: The fraction of rows for each split (defaults to [0.75]).
    :param destination_frames: names of the split frames; must have one more
      entry than ratios when given
    :param seed: Random seed
    :return: a list of frames
    """
    if ratios is None:
      ratios = [0.75]

    if len(ratios) < 1:
      raise ValueError("Ratios must have length of at least 1")

    if destination_frames is not None and (len(ratios)+1) != len(destination_frames):
      raise ValueError("The number of provided destination_frames must be one more than the number of provided ratios")

    # Cumulative upper bounds of every split except the last one.
    cut_points = []
    running_total = 0
    for fraction in ratios:
      if fraction < 0:
        raise ValueError("Ratio must be greater than 0")
      running_total = running_total + fraction
      if running_total >= 1.0:
        raise ValueError("Ratios must add up to less than 1.0")
      cut_points.append(running_total)

    # One uniform random column decides which split each row falls into.
    uniform = self.runif(seed)
    last = len(cut_points)

    pieces = []
    for idx in range(last + 1):
      if idx == 0:
        # lower bound is 0.0
        selector = (uniform <= cut_points[idx])
      elif idx == last:
        # upper bound is 1.0
        selector = (uniform > cut_points[idx-1])
      else:
        selector = ((uniform > cut_points[idx-1]) & (uniform <= cut_points[idx]))
      piece = self[selector, :]

      if destination_frames is not None:
        piece = h2o.assign(piece, destination_frames[idx])
      pieces.append(piece)

    return pieces
Ejemplo n.º 33
0
def pyunit_make_metrics():
    """
    Check that h2o.make_metrics agrees with the training metrics reported by GBM models.

    Three scenarios are run on the prostate dataset: regression on AGE (for several
    distributions), binomial classification on CAPSULE and multinomial classification on RACE.
    In each one the metrics built from the model's predictions (with and without the optional
    ``distribution`` / ``domain`` hint) are compared against ``model.model_performance(train=True)``.

    Model hyper-parameters are always given to the estimator constructor; ``train()`` only
    receives the data-related arguments, in line with the regression section.
    """
    fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    fr["CAPSULE"] = fr["CAPSULE"].asfactor()
    fr["RACE"] = fr["RACE"].asfactor()
    fr.describe()

    response = "AGE"
    predictors = list(set(fr.names) - {"ID", response})

    print("\n\n======= REGRESSION ========\n")
    for distr in ["gaussian", "poisson", "laplace", "gamma"]:
        print("distribution: %s" % distr)
        model = H2OGradientBoostingEstimator(distribution=distr, ntrees=2, max_depth=3,
                    min_rows=1, learn_rate=0.1, nbins=20)
        model.train(x=predictors, y=response, training_frame=fr)
        predicted = h2o.assign(model.predict(fr), "pred")
        actual = fr[response]

        m0 = model.model_performance(train=True)
        m1 = h2o.make_metrics(predicted, actual, distribution=distr)
        m2 = h2o.make_metrics(predicted, actual)
        print("model performance:")
        print(m0)
        print("make_metrics (distribution=%s):" % distr)
        print(m1)
        print("make_metrics (distribution=None):")
        print(m2)

        assert abs(m0.mae() - m1.mae()) < 1e-5
        assert abs(m0.mse() - m1.mse()) < 1e-5
        assert abs(m0.rmse() - m1.rmse()) < 1e-5
        assert abs(m0.mean_residual_deviance() - m1.mean_residual_deviance()) < 1e-5
        assert abs(m2.mae() - m1.mae()) < 1e-5
        assert abs(m2.mse() - m1.mse()) < 1e-5
        assert abs(m2.rmse() - m1.rmse()) < 1e-5
        # Without a distribution hint the deviance is expected to match only for gaussian.
        assert (abs(m1.mean_residual_deviance() - m2.mean_residual_deviance()) < 1e-7) == (distr == "gaussian")


    print("\n\n======= BINOMIAL ========\n")
    response = "CAPSULE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=2, max_depth=3, min_rows=1,
                                         learn_rate=0.01, nbins=20)
    model.train(x=predictors, y=response, training_frame=fr)
    # Column 2 of the prediction frame is used as the predicted value for make_metrics.
    predicted = h2o.assign(model.predict(fr)[2], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = ["0", "1"]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)
    print("m0:")
    print(m0)
    print("m1:")
    print(m1)
    print("m2:")
    print(m2)

    assert abs(m0.auc() - m1.auc()) < 1e-5
    assert abs(m0.mse() - m1.mse()) < 1e-5
    assert abs(m0.rmse() - m1.rmse()) < 1e-5
    assert abs(m0.logloss() - m1.logloss()) < 1e-5
    assert abs(m0.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5
    assert abs(m2.auc() - m1.auc()) < 1e-5
    assert abs(m2.mse() - m1.mse()) < 1e-5
    assert abs(m2.rmse() - m1.rmse()) < 1e-5
    assert abs(m2.logloss() - m1.logloss()) < 1e-5
    assert abs(m2.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5


    print("\n\n======= MULTINOMIAL ========\n")
    response = "RACE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=2, max_depth=3, min_rows=1,
                                         learn_rate=0.01, nbins=20)
    model.train(x=predictors, y=response, training_frame=fr)
    # Everything after the first prediction column is handed to make_metrics.
    predicted = h2o.assign(model.predict(fr)[1:], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = fr[response].levels()[0]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)

    assert abs(m0.mse() - m1.mse()) < 1e-5
    assert abs(m0.rmse() - m1.rmse()) < 1e-5
    assert abs(m0.logloss() - m1.logloss()) < 1e-5
    assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
    assert abs(m2.mse() - m1.mse()) < 1e-5
    assert abs(m2.rmse() - m1.rmse()) < 1e-5
    assert abs(m2.logloss() - m1.logloss()) < 1e-5
    assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
Ejemplo n.º 34
0
# Cast columns C3..C15 to the type the model should see: the columns listed in
# _numeric_cols become numeric, all the others become categorical (factor).
_numeric_cols = ("C3", "C5", "C11", "C12", "C13")
for _col in ["C%d" % _n for _n in range(3, 16)]:
    if _col in _numeric_cols:
        data[_col] = data[_col].asnumeric()
    else:
        data[_col] = data[_col].asfactor()

# Split with a 0.8 ratio for the first piece and register both pieces under fixed keys.
train, test = data.split_frame(ratios=[0.8])
h2o.assign(train, "train_rf")
h2o.assign(test, "test_rf")

# Declare model
m = H2ORandomForestEstimator(
    model_id="income_rf",
    ignore_const_cols=True,
    ntrees=100,
    stopping_metric="logloss",
    stopping_rounds=3,
    stopping_tolerance=0.02,
    max_runtime_secs=60,
    nfolds=10,
)

# x and y are defined earlier in the script, outside this excerpt.
m.train(x=x, y=y, training_frame=train)

# Score the trained model on the held-out piece.
performance = m.model_performance(test)
Ejemplo n.º 35
0
def pyunit_make_metrics():
    """
    Check that h2o.make_metrics agrees with the training metrics reported by GBM models.

    Three scenarios are run on the prostate dataset: regression on AGE (for several
    distributions), binomial classification on CAPSULE and multinomial classification on RACE.
    The binomial scenario additionally exercises the generic metric accessors, the
    binomial-only accessors and ``confusion_matrix``.

    Relies on the module-level names ``base_metric_methods`` and ``max_metrics``, which are
    defined outside this function.
    """
    fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    fr["CAPSULE"] = fr["CAPSULE"].asfactor()
    fr["RACE"] = fr["RACE"].asfactor()
    fr.describe()

    response = "AGE"
    predictors = list(set(fr.names) - {"ID", response})

    print("\n\n======= REGRESSION ========\n")
    for distr in ["gaussian", "poisson", "laplace", "gamma"]:
        print("distribution: %s" % distr)
        model = H2OGradientBoostingEstimator(distribution=distr, ntrees=2, max_depth=3,
                    min_rows=1, learn_rate=0.1, nbins=20)
        model.train(x=predictors, y=response, training_frame=fr)
        predicted = h2o.assign(model.predict(fr), "pred")
        actual = fr[response]

        m0 = model.model_performance(train=True)
        m1 = h2o.make_metrics(predicted, actual, distribution=distr)
        m2 = h2o.make_metrics(predicted, actual)
        print("model performance:")
        print(m0)
        print("make_metrics (distribution=%s):" % distr)
        print(m1)
        print("make_metrics (distribution=None):")
        print(m2)

        assert abs(m0.mae() - m1.mae()) < 1e-5
        assert abs(m0.mse() - m1.mse()) < 1e-5
        assert abs(m0.rmse() - m1.rmse()) < 1e-5
        assert abs(m0.mean_residual_deviance() - m1.mean_residual_deviance()) < 1e-5
        assert abs(m0.rmsle() - m1.rmsle()) < 1e-5

        assert abs(m2.mae() - m1.mae()) < 1e-5
        assert abs(m2.mse() - m1.mse()) < 1e-5
        assert abs(m2.rmse() - m1.rmse()) < 1e-5
        # Without a distribution hint the deviance is expected to match only for gaussian.
        assert (abs(m1.mean_residual_deviance() - m2.mean_residual_deviance()) < 1e-7) == (distr == "gaussian")
        assert abs(m2.rmsle() - m1.rmsle()) < 1e-5

    print("\n\n======= BINOMIAL ========\n")
    response = "CAPSULE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=2, max_depth=3, min_rows=1,
                                         learn_rate=0.01, nbins=20, seed=1)
    model.train(x=predictors, y=response, training_frame=fr)
    predicted = h2o.assign(model.predict(fr)[2], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = ["0", "1"]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)
    print("m0:")
    print(m0)
    print("m1:")
    print(m1)
    print("m2:")
    print(m2)

    # Testing base metric methods
    # FIXME: check the same failures for other ModelMetrics impl. and then fix'emall or move them out of base class...
    base_metrics_methods_failing_on_H2OBinomialModelMetrics = ['aic', 'mae', 'mean_per_class_error', 'mean_residual_deviance', 'rmsle']
    for metric_method in (m for m in base_metric_methods if m not in base_metrics_methods_failing_on_H2OBinomialModelMetrics):
        m0mm = getattr(m0, metric_method)()
        m1mm = getattr(m1, metric_method)()
        m2mm = getattr(m2, metric_method)()

        assert m0mm == m1mm or abs(m0mm - m1mm) < 1e-5, \
            "{} is different for model_performance and make_metrics on [0, 1] domain".format(metric_method)
        assert m1mm == m2mm or abs(m1mm - m2mm) < 1e-5, \
            "{} is different for make_metrics on [0, 1] domain and make_metrics without domain".format(metric_method)
    # FIXME: for binomial mean_per_class_error is strangely accessible as an array
    assert abs(m0.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5
    assert abs(m2.mean_per_class_error()[0][1] - m1.mean_per_class_error()[0][1]) < 1e-5

    # Each listed accessor is expected NOT to yield a float on binomial metrics, for all three
    # metric objects. Any exception (including the AssertionError) counts as such a failure;
    # `except Exception` keeps KeyboardInterrupt/SystemExit from being swallowed.
    failures = 0
    for metric_method in base_metrics_methods_failing_on_H2OBinomialModelMetrics:
        for m in [m0, m1, m2]:
            try:
                assert isinstance(getattr(m, metric_method)(), float)
            except Exception:
                failures += 1
    assert failures == 3 * len(base_metrics_methods_failing_on_H2OBinomialModelMetrics)

    # Testing binomial-only metric methods
    binomial_only_metric_methods = ['accuracy', 'F0point5', 'F1', 'F2', 'mcc',
                                    'max_per_class_error', 'mean_per_class_error',
                                    'precision', 'recall', 'specificity', 'fallout', 'missrate', 'sensitivity',
                                    'fpr', 'fnr', 'tpr', 'tnr']
    failing_binomial_metrics = ['max_per_class_error', 'recall', 'specificity', 'fallout', 'missrate', 'sensitivity',
                                'fpr', 'fnr', 'tpr', 'tnr']
    for metric_method in (m for m in binomial_only_metric_methods if m not in failing_binomial_metrics):
        # FIXME: not sure that returning a 2d-array is justified when not passing any threshold
        m0mm = getattr(m0, metric_method)()[0]
        m1mm = getattr(m1, metric_method)()[0]
        m2mm = getattr(m2, metric_method)()[0]
        assert m0mm == m1mm or abs(m0mm[1] - m1mm[1]) < 1e-5, \
            "{} is different for model_performance and make_metrics on [0, 1] domain".format(metric_method)
        assert m1mm == m2mm or abs(m1mm[1] - m2mm[1]) < 1e-5, \
            "{} is different for make_metrics on [0, 1] domain and make_metrics without domain".format(metric_method)

    # Same expectation as above, for the binomial-only accessors listed as failing.
    failures = 0
    for metric_method in failing_binomial_metrics:
        for m in [m0, m1, m2]:
            try:
                assert isinstance(getattr(m, metric_method)()[0][1], float)
            except Exception:
                failures += 1
    assert failures == 3 * len(failing_binomial_metrics)

    # Testing confusion matrix
    cm0 = m0.confusion_matrix(metrics=max_metrics)
    assert len(cm0) == len(max_metrics)
    cm0_headers = [cm.table._table_header for cm in cm0]
    # Every requested metric must be named by at least one header. The quantifiers are nested
    # explicitly so that `all` ranges over the metrics instead of wrapping a single boolean.
    assert all(any(m in header for header in cm0_headers) for m in max_metrics), \
        "got duplicate CM headers, although all metrics are different"
    cm0t = m0.confusion_matrix(metrics=max_metrics, thresholds=[.3, .6])
    assert len(cm0t) == 2 + len(max_metrics)
    cm0t_headers = [cm.table._table_header for cm in cm0t]
    # Exactly two headers (one per explicit threshold) must mention no metric at all.
    assert 2 == sum(not any(m in header for m in max_metrics) for header in cm0t_headers), \
        "missing or duplicate headers without metric (thresholds only CMs)"
    assert all(any(m in header for header in cm0t_headers) for m in max_metrics), \
        "got duplicate CM headers, although all metrics are different"


    print("\n\n======= MULTINOMIAL ========\n")
    response = "RACE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="multinomial", ntrees=2, max_depth=3, min_rows=1,
                                         learn_rate=0.01, nbins=20)
    model.train(x=predictors, y=response, training_frame=fr)
    predicted = h2o.assign(model.predict(fr)[1:], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = fr[response].levels()[0]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)

    assert abs(m0.mse() - m1.mse()) < 1e-5
    assert abs(m0.rmse() - m1.rmse()) < 1e-5
    assert abs(m0.logloss() - m1.logloss()) < 1e-5
    assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5

    assert abs(m2.mse() - m1.mse()) < 1e-5
    assert abs(m2.rmse() - m1.rmse()) < 1e-5
    assert abs(m2.logloss() - m1.logloss()) < 1e-5
    assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
Ejemplo n.º 36
0
def pyunit_make_metrics():
    """
    Check that h2o.make_metrics agrees with the training metrics reported by GBM models.

    Three scenarios are run on the prostate dataset: regression on AGE (for several
    distributions), binomial classification on CAPSULE and multinomial classification on RACE.
    The binomial scenario additionally exercises the ``accuracy``/``error`` accessors with
    different ``thresholds`` arguments, the generic metric accessors, the binomial-only
    accessors and ``confusion_matrix``.

    Relies on the module-level names ``base_metric_methods`` and ``max_metrics``, which are
    defined outside this function.
    """
    fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    fr["CAPSULE"] = fr["CAPSULE"].asfactor()
    fr["RACE"] = fr["RACE"].asfactor()
    fr.describe()

    response = "AGE"
    predictors = list(set(fr.names) - {"ID", response})

    print("\n\n======= REGRESSION ========\n")
    for distr in ["gaussian", "poisson", "laplace", "gamma"]:
        print("distribution: %s" % distr)
        model = H2OGradientBoostingEstimator(distribution=distr,
                                             ntrees=2,
                                             max_depth=3,
                                             min_rows=1,
                                             learn_rate=0.1,
                                             nbins=20)
        model.train(x=predictors, y=response, training_frame=fr)
        predicted = h2o.assign(model.predict(fr), "pred")
        actual = fr[response]

        m0 = model.model_performance(train=True)
        m1 = h2o.make_metrics(predicted, actual, distribution=distr)
        m2 = h2o.make_metrics(predicted, actual)
        print("model performance:")
        print(m0)
        print("make_metrics (distribution=%s):" % distr)
        print(m1)
        print("make_metrics (distribution=None):")
        print(m2)

        assert abs(m0.mae() - m1.mae()) < 1e-5
        assert abs(m0.mse() - m1.mse()) < 1e-5
        assert abs(m0.rmse() - m1.rmse()) < 1e-5
        assert abs(m0.mean_residual_deviance() -
                   m1.mean_residual_deviance()) < 1e-5
        assert abs(m0.rmsle() - m1.rmsle()) < 1e-5

        assert abs(m2.mae() - m1.mae()) < 1e-5
        assert abs(m2.mse() - m1.mse()) < 1e-5
        assert abs(m2.rmse() - m1.rmse()) < 1e-5
        # Without a distribution hint the deviance is expected to match only for gaussian.
        assert (abs(m1.mean_residual_deviance() - m2.mean_residual_deviance())
                < 1e-7) == (distr == "gaussian")
        assert abs(m2.rmsle() - m1.rmsle()) < 1e-5

    print("\n\n======= BINOMIAL ========\n")
    response = "CAPSULE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="bernoulli",
                                         ntrees=2,
                                         max_depth=3,
                                         min_rows=1,
                                         learn_rate=0.01,
                                         nbins=20,
                                         seed=1)
    model.train(x=predictors, y=response, training_frame=fr)
    predicted = h2o.assign(model.predict(fr)[2], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = ["0", "1"]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)
    print("m0:")
    print(m0)
    print("m1:")
    print(m1)
    print("m2:")
    print(m2)

    # accuracy and error must be complementary, whichever way the result is accessed.
    assert m0.accuracy()[0][1] + m0.error()[0][1] == 1
    assert len(m0.accuracy(thresholds='all')) == len(m0.fprs)

    assert m0.accuracy().value == m1.accuracy().value == m0.accuracy()[0][1]
    assert m0.accuracy().value + m0.error().value == 1

    # A scalar threshold gives a scalar `.value` ...
    assert isinstance(m0.accuracy(thresholds=0.4).value, float)
    assert m0.accuracy(thresholds=0.4).value == m1.accuracy(
        thresholds=0.4).value == m0.accuracy(thresholds=0.4)[0][1]
    assert m0.accuracy(thresholds=0.4).value + m0.error(
        thresholds=0.4).value == 1

    # ... while a list of thresholds gives a list `.value` of the same length.
    assert isinstance(m0.accuracy(thresholds=[0.4]).value, list)
    assert len(m0.accuracy(thresholds=[0.4]).value) == 1
    assert m0.accuracy(thresholds=[0.4]).value[0] == m0.accuracy(
        thresholds=0.4).value

    assert isinstance(m0.accuracy(thresholds=[0.4, 0.5]).value, list)
    assert len(m0.accuracy(thresholds=[0.4, 0.5]).value) == 2
    assert m0.accuracy(thresholds=[0.4, 0.5]).value == [
        m0.accuracy(thresholds=0.4).value,
        m0.accuracy(thresholds=0.5).value
    ]

    # Testing base metric methods
    # FIXME: check the same failures for other ModelMetrics impl. and then fix'emall or move them out of base class...
    base_metrics_methods_failing_on_H2OBinomialModelMetrics = [
        'aic', 'mae', 'mean_per_class_error', 'mean_residual_deviance', 'rmsle'
    ]
    for metric_method in (
            m for m in base_metric_methods
            if m not in base_metrics_methods_failing_on_H2OBinomialModelMetrics
    ):
        m0mm = getattr(m0, metric_method)()
        m1mm = getattr(m1, metric_method)()
        m2mm = getattr(m2, metric_method)()

        assert m0mm == m1mm or abs(m0mm - m1mm) < 1e-5, \
            "{} is different for model_performance and make_metrics on [0, 1] domain".format(metric_method)
        assert m1mm == m2mm or abs(m1mm - m2mm) < 1e-5, \
            "{} is different for make_metrics on [0, 1] domain and make_metrics without domain".format(metric_method)
    # FIXME: for binomial mean_per_class_error is strangely accessible as an array
    assert abs(m0.mean_per_class_error()[0][1] -
               m1.mean_per_class_error()[0][1]) < 1e-5
    assert abs(m2.mean_per_class_error()[0][1] -
               m1.mean_per_class_error()[0][1]) < 1e-5

    # Each listed accessor is expected NOT to yield a float on binomial metrics, for all three
    # metric objects. Any exception (including the AssertionError) counts as such a failure;
    # `except Exception` keeps KeyboardInterrupt/SystemExit from being swallowed.
    failures = 0
    for metric_method in base_metrics_methods_failing_on_H2OBinomialModelMetrics:
        for m in [m0, m1, m2]:
            try:
                assert isinstance(getattr(m, metric_method)(), float)
            except Exception:
                failures += 1
    assert failures == 3 * len(
        base_metrics_methods_failing_on_H2OBinomialModelMetrics)

    # Testing binomial-only metric methods
    binomial_only_metric_methods = [
        'accuracy', 'F0point5', 'F1', 'F2', 'mcc', 'max_per_class_error',
        'mean_per_class_error', 'precision', 'recall', 'specificity',
        'fallout', 'missrate', 'sensitivity', 'fpr', 'fnr', 'tpr', 'tnr'
    ]
    for metric_method in binomial_only_metric_methods:
        # FIXME: not sure that returning a 2d-array is justified when not passing any threshold
        m0mm = getattr(m0, metric_method)()[0]
        m1mm = getattr(m1, metric_method)()[0]
        m2mm = getattr(m2, metric_method)()[0]
        assert m0mm == m1mm or abs(m0mm[1] - m1mm[1]) < 1e-5, \
            "{} is different for model_performance and make_metrics on [0, 1] domain".format(metric_method)
        assert m1mm == m2mm or abs(m1mm[1] - m2mm[1]) < 1e-5, \
            "{} is different for make_metrics on [0, 1] domain and make_metrics without domain".format(metric_method)

    # Testing confusion matrix
    cm0 = m0.confusion_matrix(metrics=max_metrics)
    assert len(cm0) == len(max_metrics)
    cm0_headers = [cm.table._table_header for cm in cm0]
    # Every requested metric must be named by at least one header. The quantifiers are nested
    # explicitly so that `all` ranges over the metrics instead of wrapping a single boolean.
    assert all(any(m in header for header in cm0_headers) for m in max_metrics), \
        "got duplicate CM headers, although all metrics are different"
    cm0t = m0.confusion_matrix(metrics=max_metrics, thresholds=[.3, .6])
    assert len(cm0t) == 2 + len(max_metrics)
    cm0t_headers = [cm.table._table_header for cm in cm0t]
    # Exactly two headers (one per explicit threshold) must mention no metric at all.
    assert 2 == sum(not any(m in header for m in max_metrics) for header in cm0t_headers), \
        "missing or duplicate headers without metric (thresholds only CMs)"
    assert all(any(m in header for header in cm0t_headers) for m in max_metrics), \
        "got duplicate CM headers, although all metrics are different"

    print("\n\n======= MULTINOMIAL ========\n")
    response = "RACE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="multinomial",
                                         ntrees=2,
                                         max_depth=3,
                                         min_rows=1,
                                         learn_rate=0.01,
                                         nbins=20)
    model.train(x=predictors, y=response, training_frame=fr)
    predicted = h2o.assign(model.predict(fr)[1:], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = fr[response].levels()[0]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)

    assert abs(m0.mse() - m1.mse()) < 1e-5
    assert abs(m0.rmse() - m1.rmse()) < 1e-5
    assert abs(m0.logloss() - m1.logloss()) < 1e-5
    assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5

    assert abs(m2.mse() - m1.mse()) < 1e-5
    assert abs(m2.rmse() - m1.rmse()) < 1e-5
    assert abs(m2.logloss() - m1.logloss()) < 1e-5
    assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
Ejemplo n.º 37
0
def pyunit_make_metrics():
    """
    Check that h2o.make_metrics agrees with the training metrics reported by GBM models.

    Three scenarios are run on the prostate dataset: regression on AGE (for several
    distributions), binomial classification on CAPSULE and multinomial classification on RACE.
    In each one the metrics built from the model's predictions (with and without the optional
    ``distribution`` / ``domain`` hint) are compared against ``model.model_performance(train=True)``.

    Model hyper-parameters are always given to the estimator constructor; ``train()`` only
    receives the data-related arguments, in line with the regression section.
    """
    fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    fr["CAPSULE"] = fr["CAPSULE"].asfactor()
    fr["RACE"] = fr["RACE"].asfactor()
    fr.describe()

    response = "AGE"
    predictors = list(set(fr.names) - {"ID", response})

    print("\n\n======= REGRESSION ========\n")
    for distr in ["gaussian", "poisson", "laplace", "gamma"]:
        print("distribution: %s" % distr)
        model = H2OGradientBoostingEstimator(distribution=distr,
                                             ntrees=2,
                                             max_depth=3,
                                             min_rows=1,
                                             learn_rate=0.1,
                                             nbins=20)
        model.train(x=predictors, y=response, training_frame=fr)
        predicted = h2o.assign(model.predict(fr), "pred")
        actual = fr[response]

        m0 = model.model_performance(train=True)
        m1 = h2o.make_metrics(predicted, actual, distribution=distr)
        m2 = h2o.make_metrics(predicted, actual)
        print("model performance:")
        print(m0)
        print("make_metrics (distribution=%s):" % distr)
        print(m1)
        print("make_metrics (distribution=None):")
        print(m2)

        assert abs(m0.mae() - m1.mae()) < 1e-5
        assert abs(m0.mse() - m1.mse()) < 1e-5
        assert abs(m0.rmse() - m1.rmse()) < 1e-5
        assert abs(m0.mean_residual_deviance() -
                   m1.mean_residual_deviance()) < 1e-5
        assert abs(m0.rmsle() - m1.rmsle()) < 1e-5

        assert abs(m2.mae() - m1.mae()) < 1e-5
        assert abs(m2.mse() - m1.mse()) < 1e-5
        assert abs(m2.rmse() - m1.rmse()) < 1e-5
        # Without a distribution hint the deviance is expected to match only for gaussian.
        assert (abs(m1.mean_residual_deviance() - m2.mean_residual_deviance())
                < 1e-7) == (distr == "gaussian")
        assert abs(m2.rmsle() - m1.rmsle()) < 1e-5

    print("\n\n======= BINOMIAL ========\n")
    response = "CAPSULE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="bernoulli",
                                         ntrees=2,
                                         max_depth=3,
                                         min_rows=1,
                                         learn_rate=0.01,
                                         nbins=20)
    model.train(x=predictors, y=response, training_frame=fr)
    # Column 2 of the prediction frame is used as the predicted value for make_metrics.
    predicted = h2o.assign(model.predict(fr)[2], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = ["0", "1"]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)
    print("m0:")
    print(m0)
    print("m1:")
    print(m1)
    print("m2:")
    print(m2)

    assert abs(m0.auc() - m1.auc()) < 1e-5
    assert abs(m0.mse() - m1.mse()) < 1e-5
    assert abs(m0.rmse() - m1.rmse()) < 1e-5
    assert abs(m0.logloss() - m1.logloss()) < 1e-5
    assert abs(m0.mean_per_class_error()[0][1] -
               m1.mean_per_class_error()[0][1]) < 1e-5
    assert abs(m2.auc() - m1.auc()) < 1e-5
    assert abs(m2.mse() - m1.mse()) < 1e-5
    assert abs(m2.rmse() - m1.rmse()) < 1e-5
    assert abs(m2.logloss() - m1.logloss()) < 1e-5
    assert abs(m2.mean_per_class_error()[0][1] -
               m1.mean_per_class_error()[0][1]) < 1e-5

    print("\n\n======= MULTINOMIAL ========\n")
    response = "RACE"
    predictors = list(set(fr.names) - {"ID", response})
    model = H2OGradientBoostingEstimator(distribution="multinomial",
                                         ntrees=2,
                                         max_depth=3,
                                         min_rows=1,
                                         learn_rate=0.01,
                                         nbins=20)
    model.train(x=predictors, y=response, training_frame=fr)
    # Everything after the first prediction column is handed to make_metrics.
    predicted = h2o.assign(model.predict(fr)[1:], "pred")
    actual = h2o.assign(fr[response].asfactor(), "act")
    domain = fr[response].levels()[0]

    m0 = model.model_performance(train=True)
    m1 = h2o.make_metrics(predicted, actual, domain=domain)
    m2 = h2o.make_metrics(predicted, actual)

    assert abs(m0.mse() - m1.mse()) < 1e-5
    assert abs(m0.rmse() - m1.rmse()) < 1e-5
    assert abs(m0.logloss() - m1.logloss()) < 1e-5
    assert abs(m0.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
    assert abs(m2.mse() - m1.mse()) < 1e-5
    assert abs(m2.rmse() - m1.rmse()) < 1e-5
    assert abs(m2.logloss() - m1.logloss()) < 1e-5
    assert abs(m2.mean_per_class_error() - m1.mean_per_class_error()) < 1e-5
Ejemplo n.º 38
0
def _optional_int(raw, default=None):
    """Return ``int(raw)``, or ``default`` when ``raw`` is None or has length zero."""
    if raw is None or len(raw) == 0:
        return default
    return int(raw)


def execute(h2o, params, config):
    """
    Split the frame named by ``config['frame_id']`` into train / test / validation frames.

    ``params`` supplies the relative weights ``train_ratio`` (required), ``test_ratio`` and
    ``valid_ratio`` (optional; missing or empty means 0), an optional ``seed`` and the
    ``*_suffix`` values used to build the ids of the resulting frames. The weights are
    normalised by their sum before being handed to ``split_frame``.

    :param h2o: object providing ``get_frame`` and ``assign``.
    :param params: mapping with the ratio, seed and suffix settings described above.
    :param config: mapping holding the ``frame_id`` of the source frame.
    :return: ``{'frame_id': frame_id}`` unchanged when both test and validation weights are 0
        (nothing to split); otherwise a dict with ``frame_id`` and ``train_frame_id`` (both the
        new train frame id), ``test_frame_id`` and ``valid_frame_id`` (None for a piece that
        was not requested).
    """
    frame_id = config.get('frame_id')

    df = h2o.get_frame(frame_id)

    train = int(params.get('train_ratio'))
    test = _optional_int(params.get('test_ratio'), 0)
    valid = _optional_int(params.get('valid_ratio'), 0)
    seed = _optional_int(params.get('seed'))

    # Decide that there is nothing to split BEFORE normalising the weights: with all three
    # weights at 0 the normalisation would otherwise divide by zero.
    if valid == 0 and test == 0:
        return {'frame_id': frame_id}

    # float() keeps the division exact even where `/` on two ints truncates (Python 2).
    total = float(train + test + valid)
    train_ratio = train / total
    test_ratio = test / total

    df_test = None
    df_valid = None
    if valid == 0:
        df_train, df_test = df.split_frame(ratios=[train_ratio], seed=seed)
    elif test == 0:
        df_train, df_valid = df.split_frame(ratios=[train_ratio], seed=seed)
    else:
        # The validation piece is whatever remains after train and test.
        df_train, df_test, df_valid = df.split_frame(
            ratios=[train_ratio, test_ratio], seed=seed)

    train_frame_id = append_frame_id(frame_id, params.get('train_suffix'))
    h2o.assign(df_train, train_frame_id)

    test_frame_id = None
    if df_test is not None:
        test_frame_id = append_frame_id(frame_id, params.get('test_suffix'))
        h2o.assign(df_test, test_frame_id)

    valid_frame_id = None
    if df_valid is not None:
        valid_frame_id = append_frame_id(frame_id, params.get('valid_suffix'))
        h2o.assign(df_valid, valid_frame_id)

    return {
        'frame_id': train_frame_id,
        'train_frame_id': train_frame_id,
        'test_frame_id': test_frame_id,
        'valid_frame_id': valid_frame_id,
    }
Ejemplo n.º 39
0
def pyunit_apply_assign():
  
  fr = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
  bool_fr = fr.apply(lambda x: x['PSA'] > x['VOL'],axis=1)
  h2o.assign(fr.cbind(bool_fr), 'supp_fr')
  print h2o.get_frame('supp_fr')