Example #1
def test_classifier(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output=output,
                                       centers=centers)

    params = {"n_estimators": 10, "num_leaves": 10}
    dask_classifier = lgb.DaskLGBMClassifier(client=client,
                                             time_out=5,
                                             local_listen_port=listen_port,
                                             **params)
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    p1_local = dask_classifier.to_local().predict(X)
    s1 = _accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    assert_eq(s1, s2)
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
    assert_eq(p1_proba, p2_proba, atol=0.3)
    assert_eq(p1_local, p2)
    assert_eq(y, p1_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
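The _create_data() helper above comes from LightGBM's test suite and is not shown on this page. As a rough sketch of the shape of its return values, assuming make_blobs data, uniform random weights, and dask.array chunking (every name and constant below is illustrative, not the real helper):

import dask.array as da
import numpy as np
from sklearn.datasets import make_blobs

def _create_data(objective, output='array', centers=2, chunk_size=50):
    # illustrative only: 'objective' and 'output' are accepted to mirror the
    # call sites; this sketch only builds array-style classification data
    X, y = make_blobs(n_samples=1000, n_features=10, centers=centers)
    w = np.random.random(X.shape[0]) * 0.01
    dX = da.from_array(X, chunks=(chunk_size, X.shape[1]))
    dy = da.from_array(y, chunks=chunk_size)
    dw = da.from_array(w, chunks=chunk_size)
    return X, y, w, dX, dy, dw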
Example #2
def test_classifier(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output=output,
                                       centers=centers)

    params = {"n_estimators": 10, "num_leaves": 10}

    if output == 'dataframe-with-categorical':
        params["categorical_feature"] = [
            i for i, col in enumerate(dX.columns) if col.startswith('cat_')
        ]

    dask_classifier = lgb.DaskLGBMClassifier(client=client,
                                             time_out=5,
                                             local_listen_port=listen_port,
                                             **params)
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
    p1_local = dask_classifier.to_local().predict(X)
    s1 = _accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    assert_eq(s1, s2)
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
    assert_eq(p1_proba, p2_proba, atol=0.3)
    assert_eq(p1_local, p2)
    assert_eq(y, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (X.shape[0],
                                    dask_classifier.booster_.num_trees())
    assert np.max(pred_leaf_vals) <= params['num_leaves'] - 1
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_classifier.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col,
                           "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
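The pred_leaf assertions above rely on predict(..., pred_leaf=True) returning one 0-based leaf index per tree for each row. A minimal non-distributed sketch of that behavior, with illustrative names (clf, leaf_idx) and the same n_estimators/num_leaves settings:

import lightgbm as lgb
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=200, n_features=5, centers=2)
clf = lgb.LGBMClassifier(n_estimators=10, num_leaves=10).fit(X, y)

# one column per tree; each entry is the 0-based index of the leaf
# that the row falls into
leaf_idx = clf.predict(X, pred_leaf=True)
assert leaf_idx.shape == (X.shape[0], clf.booster_.num_trees())
assert leaf_idx.min() >= 0
assert leaf_idx.max() < 10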
Example #3
def test_classifier_pred_contrib(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output=output,
                                       centers=centers)

    params = {"n_estimators": 10, "num_leaves": 10}

    dask_classifier = lgb.DaskLGBMClassifier(client=client,
                                             time_out=5,
                                             local_listen_port=listen_port,
                                             tree_learner='data',
                                             **params)
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_classifier.predict(dX,
                                                 pred_contrib=True).compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True)

    if output == 'scipy_csr_matrix':
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_classifier.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col,
                           "decision_type"].unique()[0] == '=='

    # shape depends on whether it is binary or multiclass classification
    num_features = dask_classifier.n_features_
    num_classes = dask_classifier.n_classes_
    if num_classes == 2:
        expected_num_cols = num_features + 1
    else:
        expected_num_cols = (num_features + 1) * num_classes

    # * shape depends on whether it is binary or multiclass classification
    # * matrix for binary classification is of the form [feature_contrib, base_value],
    #   for multi-class it's [feat_contrib_class1, base_value_class1, feat_contrib_class2, base_value_class2, etc.]
    # * contrib outputs for distributed training are different than from local training, so we can just test
    #   that the output has the right shape and base values are in the right position
    assert preds_with_contrib.shape[1] == expected_num_cols
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    if num_classes == 2:
        assert len(np.unique(preds_with_contrib[:, num_features])) == 1
    else:
        for i in range(num_classes):
            base_value_col = num_features * (i + 1) + i
            assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
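The base_value_col arithmetic in the multiclass branch follows from the column layout described in the comments: each class contributes num_features contribution columns followed by one base-value column. A worked illustration with hypothetical sizes (4 features, 3 classes):

num_features, num_classes = 4, 3  # hypothetical sizes

# per class: num_features contribution columns, then one base-value column
expected_num_cols = (num_features + 1) * num_classes  # 15

# the base-value column of class i sits at the end of its per-class block
base_value_cols = [num_features * (i + 1) + i for i in range(num_classes)]
print(base_value_cols)  # [4, 9, 14]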
Example #4
def test_classifier_pred_contrib(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True).compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True)

    if output == 'scipy_csr_matrix':
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # shape depends on whether it is binary or multiclass classification
    num_features = dask_classifier.n_features_
    num_classes = dask_classifier.n_classes_
    if num_classes == 2:
        expected_num_cols = num_features + 1
    else:
        expected_num_cols = (num_features + 1) * num_classes

    # * shape depends on whether it is binary or multiclass classification
    # * matrix for binary classification is of the form [feature_contrib, base_value],
    #   for multi-class it's [feat_contrib_class1, base_value_class1, feat_contrib_class2, base_value_class2, etc.]
    # * contrib outputs for distributed training are different than from local training, so we can just test
    #   that the output has the right shape and base values are in the right position
    assert preds_with_contrib.shape[1] == expected_num_cols
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    if num_classes == 2:
        assert len(np.unique(preds_with_contrib[:, num_features])) == 1
    else:
        for i in range(num_classes):
            base_value_col = num_features * (i + 1) + i
            assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
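The uniqueness assertions check that every row carries the same base value in the expected column. A short standalone demonstration of that pattern on a synthetic array (not test data):

import numpy as np

contrib = np.array([
    [0.2, -0.1, 0.5],   # last column plays the role of the base value
    [0.3,  0.0, 0.5],
    [-0.1, 0.4, 0.5],
])
# a single unique value in the base-value column means it is shared by all rows
assert len(np.unique(contrib[:, -1])) == 1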
Example #5
def test_training_does_not_fail_on_port_conflicts(client):
    _, _, _, dX, dy, dw = _create_data('classification', output='array')

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('127.0.0.1', 12400))

        dask_classifier = lgb.DaskLGBMClassifier(time_out=5,
                                                 local_listen_port=12400,
                                                 n_estimators=5,
                                                 num_leaves=5)
        for _ in range(5):
            dask_classifier.fit(X=dX, y=dy, sample_weight=dw, client=client)
            assert dask_classifier.booster_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
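The test above deliberately occupies port 12400 before training, so the Dask training job has to fall back to another port. A small standalone sketch (illustrative only, not from the test suite) of checking whether a TCP port is already taken:

import socket

def port_is_free(port, host='127.0.0.1'):
    # illustrative helper: bind() raises OSError if the port is in use
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind((host, port))
            return True
        except OSError:
            return False

print(port_is_free(12400))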
Example #6
def test_classifier(output, task, boosting_type, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective=task,
        output=output
    )

    params = {
        "boosting_type": boosting_type,
        "n_estimators": 50,
        "num_leaves": 31
    }
    if boosting_type == 'rf':
        params.update({
            'bagging_freq': 1,
            'bagging_fraction': 0.9,
        })
    elif boosting_type == 'goss':
        params['top_rate'] = 0.5

    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
    p1_local = dask_classifier.to_local().predict(X)
    s1 = _accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    if boosting_type == 'rf' and output == 'dataframe-with-categorical':
        # https://github.com/microsoft/LightGBM/issues/4118
        assert_eq(s1, s2, atol=0.01)
        assert_eq(p1_proba, p2_proba, atol=0.8)
    else:
        assert_eq(s1, s2)
        assert_eq(p1, p2)
        assert_eq(p1, y)
        assert_eq(p2, y)
        assert_eq(p1_proba, p2_proba, atol=0.03)
        assert_eq(p1_local, p2)
        assert_eq(p1_local, y)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (
        X.shape[0],
        dask_classifier.booster_.num_trees()
    )
    assert np.max(pred_leaf_vals) <= params['num_leaves'] - 1
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_classifier.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
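The extra parameters in the rf and goss branches are required by those modes: random forest mode refuses to run without bagging (bagging_freq > 0 and bagging_fraction < 1), and GOSS exposes sampling rates such as top_rate. A minimal local sketch of the three configurations the test cycles through:

import lightgbm as lgb

gbdt_clf = lgb.LGBMClassifier(boosting_type='gbdt', n_estimators=50, num_leaves=31)

# rf mode requires bagging to be enabled
rf_clf = lgb.LGBMClassifier(boosting_type='rf', n_estimators=50, num_leaves=31,
                            bagging_freq=1, bagging_fraction=0.9)

# goss samples rows by gradient; top_rate controls the large-gradient share
goss_clf = lgb.LGBMClassifier(boosting_type='goss', n_estimators=50, num_leaves=31,
                              top_rate=0.5)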
Example #7
import dask.array as da
from distributed import Client, LocalCluster
from sklearn.datasets import make_blobs

import lightgbm as lgb

if __name__ == "__main__":
    print("loading data")

    X, y = make_blobs(n_samples=1000, n_features=50, centers=2)

    print("initializing a Dask cluster")

    cluster = LocalCluster()
    client = Client(cluster)

    print("created a Dask LocalCluster")

    print("distributing training data on the Dask cluster")

    dX = da.from_array(X, chunks=(100, 50))
    dy = da.from_array(y, chunks=(100,))

    print("beginning training")

    dask_model = lgb.DaskLGBMClassifier(n_estimators=10)
    dask_model.fit(dX, dy)
    assert dask_model.fitted_

    print("done training")