Ejemplo n.º 1
0
def test_print_mojo():
    """Check ``h2o.print_mojo`` JSON and DOT output for every estimator in ``ALGOS``.

    Each estimator class is trained with 20 trees on the prostate dataset
    (``CAPSULE`` converted to a factor and used as the response, columns
    ``1 .. ncol-1`` passed as predictors). Its MOJO is downloaded into
    ``RESULTS_DIR`` and the test asserts that:

    * the default dump parses as JSON and has one ``"trees"`` entry per
      trained tree;
    * the DOT dump of tree 2 contains the text ``"Level 0"``.

    Both dumps are echoed to stdout between ``==BEGIN==`` / ``==/END==``
    markers.
    """

    def echo_dump(title, payload):
        # Fixed markers make the dump easy to find in the test log.
        print(title)
        print("==BEGIN==")
        print(payload)
        print("==/END==")

    frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    tree_count = 20
    for estimator_cls in ALGOS:
        print("testing {}".format(estimator_cls.__name__))
        estimator = estimator_cls(ntrees=tree_count)
        predictors = list(range(1, frame.ncol))
        estimator.train(x=predictors, y="CAPSULE", training_frame=frame)
        mojo_file = estimator.download_mojo(RESULTS_DIR)

        # Whole model, default output format (parsed as JSON below).
        json_dump = h2o.print_mojo(mojo_file)
        echo_dump("dumping {} JSON trees".format(estimator_cls.__name__), json_dump)

        parsed = json.loads(json_dump)
        assert "trees" in parsed.keys()
        assert tree_count == len(parsed["trees"])

        # Single tree rendered as DOT.
        dot_dump = h2o.print_mojo(mojo_file, tree_index=2, format="dot")
        echo_dump("dumping {} DOT tree".format(estimator_cls.__name__), dot_dump)

        assert "Level 0" in dot_dump
Ejemplo n.º 2
0
def test_print_mojo():
    """Check ``h2o.print_mojo`` JSON and DOT output for a Bernoulli GBM.

    A ``H2OGradientBoostingEstimator`` is trained on the prostate dataset
    (``CAPSULE`` converted to a factor and used as the response, columns
    ``1 .. ncol-1`` passed as predictors). Its MOJO is downloaded into
    ``RESULTS_DIR`` and the test asserts that:

    * the default dump parses as JSON and has one ``"trees"`` entry per
      trained tree;
    * the DOT dump of tree 2 contains the text ``"Level 0"``.
    """
    data_path = pyunit_utils.locate("smalldata/prostate/prostate.csv")
    frame = h2o.import_file(data_path)
    frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    # Hyper-parameters of the model under test, kept together in one place.
    gbm_params = {
        "ntrees": 20,
        "learn_rate": 0.1,
        "max_depth": 5,
        "min_rows": 10,
        "distribution": "bernoulli",
    }
    gbm = H2OGradientBoostingEstimator(**gbm_params)
    predictors = list(range(1, frame.ncol))
    gbm.train(x=predictors, y="CAPSULE", training_frame=frame)
    mojo_file = gbm.download_mojo(RESULTS_DIR)

    # Whole model: the default output must be valid JSON with every tree in it.
    parsed = json.loads(h2o.print_mojo(mojo_file))
    assert "trees" in parsed.keys()
    assert gbm_params["ntrees"] == len(parsed["trees"])

    # Single tree rendered as DOT.
    dot_dump = h2o.print_mojo(mojo_file, tree_index=2, format="dot")
    assert "Level 0" in dot_dump
Ejemplo n.º 3
0
def test_print_mojo():
    """Check ``h2o.print_mojo`` JSON, PNG and DOT output for every estimator in ``ALGOS``.

    Each estimator class is trained with 5 trees on the prostate dataset
    (``CAPSULE`` converted to a factor and used as the response, columns
    ``1 .. ncol-1`` passed as predictors). Its MOJO is downloaded into
    ``RESULTS_DIR`` and the test asserts that:

    * the default dump parses as JSON and has one ``"trees"`` entry per
      trained tree;
    * dumping only tree 2 yields the same tree as entry 2 of the full dump
      once its ``"index"`` field is set to 2;
    * the PNG dump of the whole model produces one file per tree, named
      ``Tree<i>.png`` for ``H2OIsolationForestEstimator`` and
      ``Tree<i>_Class0.png`` for every other estimator;
    * the PNG dump of tree 2 alone returns a path to an existing file;
    * the DOT dump of tree 2 contains the text ``"Level 0"``.

    The JSON and DOT dumps are echoed to stdout between ``==BEGIN==`` /
    ``==/END==`` markers.
    """

    def echo_dump(title, payload):
        # Fixed markers make the dump easy to find in the test log.
        print(title)
        print("==BEGIN==")
        print(payload)
        print("==/END==")

    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    tree_count = 5
    probe_index = 2  # the single tree inspected by the per-tree checks
    for estimator_cls in ALGOS:
        print("testing {}".format(estimator_cls.__name__))
        estimator = estimator_cls(ntrees=tree_count)
        predictors = list(range(1, frame.ncol))
        estimator.train(x=predictors, y="CAPSULE", training_frame=frame)
        mojo_file = estimator.download_mojo(RESULTS_DIR)

        # Whole model as JSON.
        json_dump = h2o.print_mojo(mojo_file)
        echo_dump("dumping {} JSON trees".format(estimator_cls.__name__), json_dump)

        all_trees = json.loads(json_dump)
        assert "trees" in all_trees.keys()
        assert tree_count == len(all_trees["trees"])

        # Single tree as JSON; its index is patched before comparing it with
        # the corresponding entry of the full dump.
        one_tree = json.loads(h2o.print_mojo(mojo_file, tree_index=probe_index))
        one_tree["trees"][0]["index"] = probe_index
        assert all_trees["trees"][probe_index] == one_tree["trees"][0]

        # Whole model as PNG: one image file per tree is expected.
        png_root = h2o.print_mojo(mojo_file, format="png")
        if estimator_cls == H2OIsolationForestEstimator:
            png_suffix = ".png"
        else:
            png_suffix = "_Class0.png"
        for idx in range(tree_count):
            png_path = os.path.join(png_root, "Tree" + str(idx) + png_suffix)
            print(png_path)
            assert os.path.isfile(png_path)

        # Single tree as PNG.
        single_png = h2o.print_mojo(mojo_file, format="png", tree_index=probe_index)
        assert os.path.isfile(single_png)

        # Single tree as DOT.
        dot_dump = h2o.print_mojo(mojo_file, tree_index=probe_index, format="dot")
        echo_dump("dumping {} DOT tree".format(estimator_cls.__name__), dot_dump)

        assert "Level 0" in dot_dump
def xgboost_reweight_tree():
    """Exercise the ``tree.update.weights`` rapids call on an XGBoost model.

    Steps, in order (the rapids calls modify the trained model, so the order
    matters):

    0. Train an ``H2OXGBoostEstimator`` on the prostate data and dump its
       MOJO as text.
    1. Compute contributions on the full frame and check their column names.
    2. Add a constant ``weights`` column (value 2), update the tree weights
       from the full frame and check the contributions match the originals
       (``check_less_precise=3``).
    3. Update the tree weights from the first 10 rows only and check that the
       minimum ``BiasTerm`` differs from the original one.
    4. Dump the MOJO again and check that it differs from the original dump
       and that the root weight of the first tree equals the expected value
       before and after the update.
    """

    def update_weights(weights_frame):
        # Ask the backend to recompute the node weights of the trained model
        # from the "weights" column of the given frame.
        h2o.rapids('(tree.update.weights {} {} "{}")'.format(
            xgb_model.model_id, weights_frame.frame_id, "weights"))

    prostate_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for categorical in ("RACE", "CAPSULE"):
        prostate_frame[categorical] = prostate_frame[categorical].asfactor()

    predictors = ["AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    response = 'CAPSULE'

    xgb_model = H2OXGBoostEstimator()
    xgb_model.train(x=predictors, y=response, training_frame=prostate_frame)

    # 0. Keep a textual dump of the MOJO before any weight update.
    orig_mojo_path = xgb_model.download_mojo()
    orig_mojo_str = h2o.print_mojo(orig_mojo_path)

    # 1. Baseline contributions.
    contribs_original = xgb_model.predict_contributions(prostate_frame)
    expected_columns = [
        u'RACE.0', u'RACE.1', u'RACE.2', u'RACE.missing(NA)',
        u'AGE', u'DPROS', u'DCAPS', u'PSA', u'VOL', u'GLEASON',
        u'BiasTerm',
    ]
    assert contribs_original.col_names == expected_columns

    # 2. A uniform weight on every row must leave the contributions unchanged.
    weights_scale = 2
    prostate_frame["weights"] = weights_scale
    update_weights(prostate_frame)
    contribs_reweighted = xgb_model.predict_contributions(prostate_frame)
    assert_frame_equal(contribs_reweighted.as_data_frame(),
                       contribs_original.as_data_frame(),
                       check_less_precise=3)

    # 3. Weights taken from a 10-row subset are expected to change them.
    prostate_subset = prostate_frame.head(10)
    update_weights(prostate_subset)
    contribs_subset = xgb_model.predict_contributions(prostate_subset)
    assert contribs_subset["BiasTerm"].min() != contribs_original["BiasTerm"].min()

    # 4. Dump the MOJO again after the updates.
    reweighted_mojo_path = xgb_model.download_mojo()
    reweighted_mojo_str = h2o.print_mojo(reweighted_mojo_path)

    # Sanity check: the update must be visible in the dump.
    assert orig_mojo_str != reweighted_mojo_str

    # Expected root weight of the first tree: number of rows times f * (1 - f)
    # with f = 1 / (1 + exp(0)), and additionally times the row weight after
    # the update. NOTE(review): presumably f is the initial prediction and
    # f * (1 - f) the per-row hessian of the logistic loss - confirm.
    init_f = 1 / (1 + math.exp(0))
    hess_coef = init_f * (1 - init_f)

    orig_root = json.loads(orig_mojo_str)["trees"][0]["root"]
    assert orig_root["weight"] == prostate_frame.nrow * hess_coef

    reweighted_root = json.loads(reweighted_mojo_str)["trees"][0]["root"]
    assert reweighted_root["weight"] == prostate_subset.nrow * hess_coef * weights_scale
Ejemplo n.º 5
0
def convert(model,
            name=None,
            initial_types=None,
            doc_string='',
            target_opset=None,
            targeted_onnx=onnx.__version__,
            custom_conversion_functions=None,
            custom_shape_calculators=None):
    '''
    This function produces an equivalent ONNX model of the given H2O MOJO model.
    Supported model types:
    - GBM, with limitations:
        - poisson, gamma, tweedie distributions not supported
        - multinomial distribution supported with 3 or more classes (use binomial otherwise)
    Other limitations:
    - models with categorical splits not supported

    The MOJO bytes are written to a temporary file so that ``h2o.print_mojo``
    can read them; the file is removed again before this function returns.

    :param model: H2O MOJO model loaded into memory as bytes (see below for example)
    :param name: The name of the graph (type: GraphProto) in the produced ONNX model (type: ModelProto).
        A random hexadecimal UUID is used when omitted.
    :param initial_types: a python list. Each element is a tuple of a variable name and a type defined in
        data_types.py. Defaults to a single float tensor input named 'input'.
    :param doc_string: A string attached onto the produced ONNX model
    :param target_opset: number, for example, 7 for ONNX 1.2, and 8 for ONNX 1.3. When not given (or falsy),
        the value returned by get_opset_number_from_onnx() is used.
    :param targeted_onnx: A string (for example, '1.1.2' and '1.2') used to specify the targeted ONNX version of the
        produced model. If ONNXMLTools cannot find a compatible ONNX python package, an error may be thrown.
    :param custom_conversion_functions: a dictionary for specifying the user customized conversion function
    :param custom_shape_calculators: a dictionary for specifying the user customized shape calculator
    :return: An ONNX model (type: ModelProto) which is equivalent to the input H2O MOJO model
    :raises ValueError: if the MOJO was not produced by the GBM algorithm

    :examples:

    >>> from onnxmltools.convert import convert_h2o
    >>> file = open("/path/to/h2o_mojo.zip", "rb")
    >>> mojo_content = file.read()
    >>> file.close()
    >>> h2o_onnx_model = convert_h2o(mojo_content)
    '''
    # Imported locally so the block does not rely on a module-level import.
    import os

    if name is None:
        name = str(uuid4().hex)
    if initial_types is None:
        initial_types = [('input', FloatTensorType(shape=['None', 'None']))]

    # mkstemp() returns an already-open descriptor: write through it so it
    # gets closed, instead of discarding it and opening the path a second time.
    fd, model_path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "wb") as mojo_file:
            mojo_file.write(model)
        mojo_str = h2o.print_mojo(model_path, format="json")
    finally:
        # Best-effort cleanup; a failure to delete the temporary file must
        # not mask the conversion result or the original error.
        try:
            os.remove(model_path)
        except OSError:
            pass

    mojo_model = json.loads(mojo_str)
    if mojo_model["params"]["algo"] != "gbm":
        raise ValueError(
            "Model type not supported (algo=%s). Only GBM Mojo supported for now."
            % mojo_model["params"]["algo"])

    target_opset = target_opset if target_opset else get_opset_number_from_onnx(
    )
    topology = parse_h2o(mojo_model, initial_types, target_opset,
                         custom_conversion_functions, custom_shape_calculators)
    topology.compile()
    onnx_model = convert_topology(topology, name, doc_string, target_opset,
                                  targeted_onnx)
    return onnx_model