def test_classifier(output, centers, client, listen_port):
    """Compare a Dask-trained classifier against a locally trained one.

    Both models are built from the same ``params`` and fit on the data
    returned by ``_create_data``. The test asserts equal accuracy, equal
    predicted labels (also equal to ``y``), probabilities that agree within
    ``atol=0.3``, and that the model produced by ``to_local()`` predicts the
    same labels as the local model.
    """
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers,
    )

    params = {"n_estimators": 10, "num_leaves": 10}

    # --- distributed model ---
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        **params
    ).fit(dX, dy, sample_weight=dw)

    dask_labels_lazy = dask_classifier.predict(dX)
    dask_proba = dask_classifier.predict_proba(dX).compute()
    converted_labels = dask_classifier.to_local().predict(X)
    # accuracy is taken from the not-yet-computed predictions
    dask_accuracy = _accuracy_score(dy, dask_labels_lazy)
    dask_labels = dask_labels_lazy.compute()

    # --- local model ---
    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_labels = local_classifier.predict(X)
    local_proba = local_classifier.predict_proba(X)
    local_accuracy = local_classifier.score(X, y)

    # --- comparisons ---
    assert_eq(dask_accuracy, local_accuracy)
    assert_eq(dask_labels, local_labels)
    assert_eq(y, dask_labels)
    assert_eq(y, local_labels)
    assert_eq(dask_proba, local_proba, atol=0.3)
    assert_eq(converted_labels, local_labels)
    assert_eq(y, converted_labels)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_classifier(output, centers, client, listen_port):
    """Compare Dask and local classifiers, including leaf and categorical checks.

    In addition to comparing accuracy, labels and probabilities between a
    ``DaskLGBMClassifier`` and an ``LGBMClassifier`` built from the same
    ``params``, this test checks the shape and value range of
    ``pred_leaf=True`` output and, for ``'dataframe-with-categorical'`` input,
    that at least one split uses a categorical column with decision type
    ``'=='``.
    """
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers,
    )

    params = {"n_estimators": 10, "num_leaves": 10}

    uses_categoricals = output == 'dataframe-with-categorical'
    if uses_categoricals:
        # positional indices of the columns whose name starts with 'cat_'
        params["categorical_feature"] = [
            position
            for position, column in enumerate(dX.columns)
            if column.startswith('cat_')
        ]

    # --- distributed model ---
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        **params
    ).fit(dX, dy, sample_weight=dw)

    dask_labels_lazy = dask_classifier.predict(dX)
    dask_proba = dask_classifier.predict_proba(dX).compute()
    dask_leaves_lazy = dask_classifier.predict(dX, pred_leaf=True)
    converted_labels = dask_classifier.to_local().predict(X)
    # accuracy is taken from the not-yet-computed predictions
    dask_accuracy = _accuracy_score(dy, dask_labels_lazy)
    dask_labels = dask_labels_lazy.compute()

    # --- local model ---
    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_labels = local_classifier.predict(X)
    local_proba = local_classifier.predict_proba(X)
    local_accuracy = local_classifier.score(X, y)

    # --- comparisons ---
    assert_eq(dask_accuracy, local_accuracy)
    assert_eq(dask_labels, local_labels)
    assert_eq(y, dask_labels)
    assert_eq(y, local_labels)
    assert_eq(dask_proba, local_proba, atol=0.3)
    assert_eq(converted_labels, local_labels)
    assert_eq(y, converted_labels)

    # pred_leaf output: one column per tree, entries that look like
    # valid leaf indices
    leaf_indices = dask_leaves_lazy.compute()
    expected_leaf_shape = (X.shape[0], dask_classifier.booster_.num_trees())
    assert leaf_indices.shape == expected_leaf_shape
    assert np.max(leaf_indices) <= params['num_leaves']
    assert np.min(leaf_indices) >= 0
    assert len(np.unique(leaf_indices)) <= params['num_leaves']

    # the model must actually split on a categorical column, and treat
    # that split as categorical
    if uses_categoricals:
        categorical_columns = [
            column
            for column in dX.columns
            if dX.dtypes[column].name == 'category'
        ]
        tree_table = dask_classifier.booster_.trees_to_dataframe()
        splits_on_categorical = tree_table['split_feature'].isin(categorical_columns)
        assert splits_on_categorical.sum() > 0
        decision_types = tree_table.loc[splits_on_categorical, "decision_type"].unique()
        assert decision_types[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_classifier_pred_contrib(output, centers, client, listen_port):
    """Check the layout of ``pred_contrib=True`` output from the Dask classifier.

    A ``DaskLGBMClassifier`` (``tree_learner='data'``) and a local
    ``LGBMClassifier`` are fit with the same ``params``. Contribution values
    from distributed training differ from local training, so only the shape
    and the position of the base-value columns are checked:

    * binary classification: ``[feature_contrib, base_value]``
    * multi-class: ``[feat_contrib_class1, base_value_class1,
      feat_contrib_class2, base_value_class2, ...]``
    """
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True).compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True)

    # densify so the column slicing below works on a plain ndarray
    if output == 'scipy_csr_matrix':
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_classifier.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    # shape depends on whether it is binary or multiclass classification
    num_features = dask_classifier.n_features_
    num_classes = dask_classifier.n_classes_
    if num_classes == 2:
        expected_num_cols = num_features + 1
    else:
        expected_num_cols = (num_features + 1) * num_classes

    assert preds_with_contrib.shape[1] == expected_num_cols
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    # Each base-value column must hold a single distinct value.
    # NOTE: the comparison has to sit outside len(); `len(np.unique(col) == 1)`
    # is the length of a boolean array and is truthy for any non-empty column.
    if num_classes == 2:
        assert len(np.unique(preds_with_contrib[:, num_features])) == 1
    else:
        for i in range(num_classes):
            base_value_col = num_features * (i + 1) + i
            assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_classifier_pred_contrib(output, centers, client, listen_port):
    """Check the layout of ``pred_contrib=True`` output from the Dask classifier.

    A ``DaskLGBMClassifier`` (``tree_learner='data'``) and a local
    ``LGBMClassifier`` are fit with the same ``params``. Contribution values
    from distributed training differ from local training, so only the shape
    and the position of the base-value columns are checked:

    * binary classification: ``[feature_contrib, base_value]``
    * multi-class: ``[feat_contrib_class1, base_value_class1,
      feat_contrib_class2, base_value_class2, ...]``
    """
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True).compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True)

    # densify so the column slicing below works on a plain ndarray
    if output == 'scipy_csr_matrix':
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # shape depends on whether it is binary or multiclass classification
    num_features = dask_classifier.n_features_
    num_classes = dask_classifier.n_classes_
    if num_classes == 2:
        expected_num_cols = num_features + 1
    else:
        expected_num_cols = (num_features + 1) * num_classes

    assert preds_with_contrib.shape[1] == expected_num_cols
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    # Each base-value column must hold a single distinct value.
    # NOTE: the comparison has to sit outside len(); `len(np.unique(col) == 1)`
    # is the length of a boolean array and is truthy for any non-empty column.
    if num_classes == 2:
        assert len(np.unique(preds_with_contrib[:, num_features])) == 1
    else:
        for i in range(num_classes):
            base_value_col = num_features * (i + 1) + i
            assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_training_does_not_fail_on_port_conflicts(client):
    """Fit repeatedly while the configured listen port is already bound.

    A TCP socket is bound to ``127.0.0.1:12400`` and kept open while a
    ``DaskLGBMClassifier`` created with ``local_listen_port=12400`` is fit
    five times; after every fit the estimator must expose a truthy
    ``booster_``.
    """
    _, _, _, dX, dy, dw = _create_data('classification', output='array')

    occupied_port = 12400
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as port_holder:
        # hold the port for the whole duration of the training attempts
        port_holder.bind(('127.0.0.1', occupied_port))

        dask_classifier = lgb.DaskLGBMClassifier(
            time_out=5,
            local_listen_port=occupied_port,
            n_estimators=5,
            num_leaves=5,
        )
        for _attempt in range(5):
            dask_classifier.fit(
                X=dX,
                y=dy,
                sample_weight=dw,
                client=client,
            )
            assert dask_classifier.booster_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_classifier(output, task, boosting_type, client):
    """Compare Dask and local classifiers across boosting types.

    A ``DaskLGBMClassifier`` and an ``LGBMClassifier`` are fit with identical
    ``params`` (extended with bagging settings for ``'rf'`` and ``top_rate``
    for ``'goss'``). Accuracy, labels and probabilities are compared, with
    looser tolerances for random forest on categorical dataframes. The test
    also validates ``pred_leaf=True`` output and, for
    ``'dataframe-with-categorical'`` input, that a categorical column is used
    in a split with decision type ``'=='``.
    """
    X, y, w, _, dX, dy, dw, _ = _create_data(objective=task, output=output)

    params = {
        "boosting_type": boosting_type,
        "n_estimators": 50,
        "num_leaves": 31,
    }
    if boosting_type == 'rf':
        params['bagging_freq'] = 1
        params['bagging_fraction'] = 0.9
    elif boosting_type == 'goss':
        params.update({'top_rate': 0.5})

    uses_categoricals = output == 'dataframe-with-categorical'

    # --- distributed model ---
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        **params
    ).fit(dX, dy, sample_weight=dw)

    dask_labels_lazy = dask_classifier.predict(dX)
    dask_proba = dask_classifier.predict_proba(dX).compute()
    dask_leaves_lazy = dask_classifier.predict(dX, pred_leaf=True)
    converted_labels = dask_classifier.to_local().predict(X)
    # accuracy is taken from the not-yet-computed predictions
    dask_accuracy = _accuracy_score(dy, dask_labels_lazy)
    dask_labels = dask_labels_lazy.compute()

    # --- local model ---
    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_labels = local_classifier.predict(X)
    local_proba = local_classifier.predict_proba(X)
    local_accuracy = local_classifier.score(X, y)

    # --- comparisons ---
    if boosting_type == 'rf' and uses_categoricals:
        # relaxed checks, see
        # https://github.com/microsoft/LightGBM/issues/4118
        assert_eq(dask_accuracy, local_accuracy, atol=0.01)
        assert_eq(dask_proba, local_proba, atol=0.8)
    else:
        assert_eq(dask_accuracy, local_accuracy)
        assert_eq(dask_labels, local_labels)
        assert_eq(dask_labels, y)
        assert_eq(local_labels, y)
        assert_eq(dask_proba, local_proba, atol=0.03)
        assert_eq(converted_labels, local_labels)
        assert_eq(converted_labels, y)

    # pred_leaf output: one column per tree, entries that look like
    # valid leaf indices
    leaf_indices = dask_leaves_lazy.compute()
    expected_leaf_shape = (X.shape[0], dask_classifier.booster_.num_trees())
    assert leaf_indices.shape == expected_leaf_shape
    assert np.max(leaf_indices) <= params['num_leaves']
    assert np.min(leaf_indices) >= 0
    assert len(np.unique(leaf_indices)) <= params['num_leaves']

    # the model must actually split on a categorical column, and treat
    # that split as categorical
    if uses_categoricals:
        categorical_columns = [
            column
            for column in dX.columns
            if dX.dtypes[column].name == 'category'
        ]
        tree_table = dask_classifier.booster_.trees_to_dataframe()
        splits_on_categorical = tree_table['split_feature'].isin(categorical_columns)
        assert splits_on_categorical.sum() > 0
        decision_types = tree_table.loc[splits_on_categorical, "decision_type"].unique()
        assert decision_types[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
# Script: train a DaskLGBMClassifier on synthetic two-class data using a
# local Dask cluster, and check that the estimator reports itself as fitted.
import dask.array as da
from distributed import Client, LocalCluster
from sklearn.datasets import make_blobs

import lightgbm as lgb

if __name__ == "__main__":
    print("loading data")
    X, y = make_blobs(n_samples=1000, n_features=50, centers=2)

    print("initializing a Dask cluster")
    # Context managers shut the client and cluster down on exit, including
    # when training raises or the assertion below fails.
    with LocalCluster() as cluster, Client(cluster) as client:
        print("created a Dask LocalCluster")

        print("distributing training data on the Dask cluster")
        # 10 row-wise chunks of 100 samples each, all 50 features per chunk
        dX = da.from_array(X, chunks=(100, 50))
        dy = da.from_array(y, chunks=(100,))

        print("beginning training")
        # no client is passed explicitly to the estimator here
        dask_model = lgb.DaskLGBMClassifier(n_estimators=10)
        dask_model.fit(dX, dy)
        assert dask_model.fitted_

        print("done training")